pcre2_compile.c source code [Godot/thirdparty/pcre2/src/pcre2_compile.c]

1	/*************************************************
2	* Perl-Compatible Regular Expressions *
3	*************************************************/
4
/* PCRE is a library of functions to support regular expressions whose syntax
6	and semantics are as close as possible to those of the Perl 5 language.
7
8	Written by Philip Hazel
9	Original API code Copyright (c) 1997-2012 University of Cambridge
10	New API code Copyright (c) 2016-2022 University of Cambridge
11
12	-----------------------------------------------------------------------------
13	Redistribution and use in source and binary forms, with or without
14	modification, are permitted provided that the following conditions are met:
15
16	* Redistributions of source code must retain the above copyright notice,
17	this list of conditions and the following disclaimer.
18
19	* Redistributions in binary form must reproduce the above copyright
20	notice, this list of conditions and the following disclaimer in the
21	documentation and/or other materials provided with the distribution.
22
23	* Neither the name of the University of Cambridge nor the names of its
24	contributors may be used to endorse or promote products derived from
25	this software without specific prior written permission.
26
27	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37	POSSIBILITY OF SUCH DAMAGE.
38	-----------------------------------------------------------------------------
39	*/
40
41
42	#ifdef HAVE_CONFIG_H
43	#include "config.h"
44	#endif
45
46	#define NLBLOCK cb /* Block containing newline information */
47	#define PSSTART start_pattern /* Field containing processed string start */
48	#define PSEND end_pattern /* Field containing processed string end */
49
50	#include "pcre2_internal.h"
51
/* In rare error cases debugging might require calling pcre2_printint(). */
53
54	#if 0
55	#ifdef EBCDIC
56	#define PRINTABLE(c) ((c) >= 64 && (c) < 255)
57	#else
58	#define PRINTABLE(c) ((c) >= 32 && (c) < 127)
59	#endif
60	#include "pcre2_printint.c"
61	#define DEBUG_CALL_PRINTINT
62	#endif
63
/* Other debugging code can be enabled by these defines. */

/* #define DEBUG_SHOW_CAPTURES */
/* #define DEBUG_SHOW_PARSED */
68
/* There are a few things that vary with different code unit sizes. Handle them
by defining macros in order to minimize #if usage. */
71
72	#if PCRE2_CODE_UNIT_WIDTH == 8
73	#define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 5
74	#define XDIGIT(c) xdigitab[c]
75
76	#else /* Either 16-bit or 32-bit */
77	#define XDIGIT(c) (MAX_255(c)? xdigitab[c] : 0xff)
78
79	#if PCRE2_CODE_UNIT_WIDTH == 16
80	#define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 6
81
82	#else /* 32-bit */
83	#define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 6
84	#endif
85	#endif
86
/* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
them will be able to (i.e. assume a 64-bit world).

PUTOFFSET stores s at p and advances p; GETOFFSET reads from p and advances p.
GETPLUSOFFSET and READPLUSOFFSET read the value that starts one element after
p; only the former moves p. In the two-element form the high-order 32 bits are
stored first. SIZEOFFSET is the number of uint32_t elements occupied. */

#if PCRE2_SIZE_MAX <= UINT32_MAX
#define PUTOFFSET(s,p) *p++ = s
#define GETOFFSET(s,p) s = *p++
#define GETPLUSOFFSET(s,p) s = *(++p)
#define READPLUSOFFSET(s,p) s = p[1]
#define SKIPOFFSET(p) p++
#define SIZEOFFSET 1
#else
#define PUTOFFSET(s,p) \
  { *p++ = (uint32_t)(s >> 32); *p++ = (uint32_t)(s & 0xffffffff); }
#define GETOFFSET(s,p) \
  { s = ((PCRE2_SIZE)p[0] << 32) | (PCRE2_SIZE)p[1]; p += 2; }
#define GETPLUSOFFSET(s,p) \
  { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; p += 2; }
#define READPLUSOFFSET(s,p) \
  { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; }
#define SKIPOFFSET(p) p += 2
#define SIZEOFFSET 2
#endif
110
/* Macros for manipulating elements of the parsed pattern vector. META_CODE
extracts the top 16 bits (the item code), META_DATA the bottom 16 bits (the
item's data), and META_DIFF gives the distance between two codes in units of
one code step. Arguments are parenthesized so that expressions such as
META_DATA(a | b) evaluate as expected. */

#define META_CODE(x)   ((x) & 0xffff0000u)
#define META_DATA(x)   ((x) & 0x0000ffffu)
#define META_DIFF(x,y) (((x)-(y))>>16)
116
117	/ Function definitions to allow mutual recursion /
118
119	#ifdef SUPPORT_UNICODE
120	static unsigned int
121	add_list_to_class_internal(uint8_t , PCRE2_UCHAR *, uint32_t,
122	compile_block , const* uint32_t , unsigned* int);
123	#endif
124
125	static int
126	compile_regex(uint32_t, PCRE2_UCHAR , uint32_t , int *, uint32_t,
127	uint32_t , uint32_t , uint32_t , uint32_t , branch_chain *,
128	compile_block , PCRE2_SIZE );
129
130	static int
131	get_branchlength(uint32_t *, int* , int* , parsed_recurse_check ,
132	compile_block *);
133
134	static BOOL
135	set_lookbehind_lengths(uint32_t *, int* , int* , parsed_recurse_check ,
136	compile_block *);
137
138	static int
139	check_lookbehinds(uint32_t , uint32_t , parsed_recurse_check ,
140	compile_block , int* *);
141
142
143	/*************************************************
144	* Code parameters and static tables *
145	*************************************************/
146
147	#define MAX_GROUP_NUMBER 65535u
148	#define MAX_REPEAT_COUNT 65535u
149	#define REPEAT_UNLIMITED (MAX_REPEAT_COUNT+1)
150
/* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
different ways in the different pattern scans. The parsing and group-
identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
aligned for this. Having defined the size in code units, we set up
C16_WORK_SIZE as the number of elements in the 16-bit vector.

During the first compiling phase, when determining how much memory is required,
the regex is partly compiled into this space, but the compiled parts are
discarded as soon as they can be, so that hopefully there will never be an
overrun. The code does, however, check for an overrun, which can occur for
pathological patterns. The size of the workspace depends on LINK_SIZE because
the length of compiled items varies with this.

In the real compile phase, this workspace is not currently used. */

#define COMPILE_WORK_SIZE (3000*LINK_SIZE)   /* Size in code units */

#define C16_WORK_SIZE \
  ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
170
/* A uint32_t vector is used for caching information about the size of
capturing groups, to improve performance. A default is created on the stack of
this size. */

#define GROUPINFO_DEFAULT_SIZE 256

/* The overrun tests check for a slightly smaller size so that they detect the
overrun before it actually does run off the end of the data block. */

#define WORK_SIZE_SAFETY_MARGIN (100)

/* This value determines the size of the initial vector that is used for
remembering named groups during the pre-compile. It is allocated on the stack,
but if it is too small, it is expanded, in a similar way to the workspace. The
value is the number of slots in the list. */

#define NAMED_GROUP_LIST_SIZE  20

/* The pre-compiling pass over the pattern creates a parsed pattern in a vector
of uint32_t. For short patterns this lives on the stack, with this size. Heap
memory is used for longer patterns. */

#define PARSED_PATTERN_DEFAULT_SIZE 1024

/* Maximum length value to check against when making sure that the variable
that holds the compiled pattern length does not overflow. We make it a bit less
than INT_MAX to allow for adding in group terminating code units, so that we
don't have to check them every time. */

#define OFLOW_MAX (INT_MAX - 20)
201
/* Code values for parsed patterns, which are stored in a vector of 32-bit
unsigned ints. Values less than META_END are literal data values. The coding
for identifying the item is in the top 16-bits, leaving 16 bits for the
additional data that some of them need. The META_CODE, META_DATA, and META_DIFF
macros are used to manipulate parsed pattern elements.

NOTE: When these definitions are changed, the table of extra lengths for each
code (meta_extra_lengths, just below) must be updated to remain in step. */

#define META_END              0x80000000u  /* End of pattern */

#define META_ALT              0x80010000u  /* alternation */
#define META_ATOMIC           0x80020000u  /* atomic group */
#define META_BACKREF          0x80030000u  /* Back ref */
#define META_BACKREF_BYNAME   0x80040000u  /* \k'name' */
#define META_BIGVALUE         0x80050000u  /* Next is a literal > META_END */
#define META_CALLOUT_NUMBER   0x80060000u  /* (?C with numerical argument */
#define META_CALLOUT_STRING   0x80070000u  /* (?C with string argument */
#define META_CAPTURE          0x80080000u  /* Capturing parenthesis */
#define META_CIRCUMFLEX       0x80090000u  /* ^ metacharacter */
#define META_CLASS            0x800a0000u  /* start non-empty class */
#define META_CLASS_EMPTY      0x800b0000u  /* empty class */
#define META_CLASS_EMPTY_NOT  0x800c0000u  /* negative empty class */
#define META_CLASS_END        0x800d0000u  /* end of non-empty class */
#define META_CLASS_NOT        0x800e0000u  /* start non-empty negative class */
#define META_COND_ASSERT      0x800f0000u  /* (?(?assertion)... */
#define META_COND_DEFINE      0x80100000u  /* (?(DEFINE)... */
#define META_COND_NAME        0x80110000u  /* (?(<name>)... */
#define META_COND_NUMBER      0x80120000u  /* (?(digits)... */
#define META_COND_RNAME       0x80130000u  /* (?(R&name)... */
#define META_COND_RNUMBER     0x80140000u  /* (?(Rdigits)... */
#define META_COND_VERSION     0x80150000u  /* (?(VERSION<op>x.y)... */
#define META_DOLLAR           0x80160000u  /* $ metacharacter */
#define META_DOT              0x80170000u  /* . metacharacter */
#define META_ESCAPE           0x80180000u  /* \d and friends */
#define META_KET              0x80190000u  /* closing parenthesis */
#define META_NOCAPTURE        0x801a0000u  /* no capture parens */
#define META_OPTIONS          0x801b0000u  /* (?i) and friends */
#define META_POSIX            0x801c0000u  /* POSIX class item */
#define META_POSIX_NEG        0x801d0000u  /* negative POSIX class item */
#define META_RANGE_ESCAPED    0x801e0000u  /* range with at least one escape */
#define META_RANGE_LITERAL    0x801f0000u  /* range defined literally */
#define META_RECURSE          0x80200000u  /* Recursion */
#define META_RECURSE_BYNAME   0x80210000u  /* (?&name) */
#define META_SCRIPT_RUN       0x80220000u  /* (*script_run:...) */

/* These must be kept together to make it easy to check that an assertion
is present where expected in a conditional group. */

#define META_LOOKAHEAD        0x80230000u  /* (?= */
#define META_LOOKAHEADNOT     0x80240000u  /* (?! */
#define META_LOOKBEHIND       0x80250000u  /* (?<= */
#define META_LOOKBEHINDNOT    0x80260000u  /* (?<! */

/* These cannot be conditions */

#define META_LOOKAHEAD_NA     0x80270000u  /* (*napla: */
#define META_LOOKBEHIND_NA    0x80280000u  /* (*naplb: */

/* These must be kept in this order, with consecutive values, and the _ARG
versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
versions. */

#define META_MARK             0x80290000u  /* (*MARK) */
#define META_ACCEPT           0x802a0000u  /* (*ACCEPT) */
#define META_FAIL             0x802b0000u  /* (*FAIL) */
#define META_COMMIT           0x802c0000u  /* These */
#define META_COMMIT_ARG       0x802d0000u  /*   pairs */
#define META_PRUNE            0x802e0000u  /*     must */
#define META_PRUNE_ARG        0x802f0000u  /*       be */
#define META_SKIP             0x80300000u  /*         kept */
#define META_SKIP_ARG         0x80310000u  /*           in */
#define META_THEN             0x80320000u  /*             this */
#define META_THEN_ARG         0x80330000u  /*               order */

/* These must be kept in groups of adjacent 3 values, and all together. */

#define META_ASTERISK         0x80340000u  /* *  */
#define META_ASTERISK_PLUS    0x80350000u  /* *+ */
#define META_ASTERISK_QUERY   0x80360000u  /* *? */
#define META_PLUS             0x80370000u  /* +  */
#define META_PLUS_PLUS        0x80380000u  /* ++ */
#define META_PLUS_QUERY       0x80390000u  /* +? */
#define META_QUERY            0x803a0000u  /* ?  */
#define META_QUERY_PLUS       0x803b0000u  /* ?+ */
#define META_QUERY_QUERY      0x803c0000u  /* ?? */
#define META_MINMAX           0x803d0000u  /* {n,m}  repeat */
#define META_MINMAX_PLUS      0x803e0000u  /* {n,m}+ repeat */
#define META_MINMAX_QUERY     0x803f0000u  /* {n,m}? repeat */

#define META_FIRST_QUANTIFIER META_ASTERISK
#define META_LAST_QUANTIFIER  META_MINMAX_QUERY

/* This is a special "meta code" that is used only to distinguish (*asr: from
(*sr: in the table of alphabetic assertions. It is never stored in the parsed
pattern because (*asr: is turned into (*sr:(*atomic: at that stage. There is
therefore no need for it to have a length entry, so use a high value. */

#define META_ATOMIC_SCRIPT_RUN 0x8fff0000u
301
302	/ Table of extra lengths for each of the meta codes. Must be kept in step with*
303	the definitions above. For some items these values are a basic length to which
304	a variable amount has to be added. /*
305
306	static unsigned char meta_extra_lengths[] = {
307	`0`, / META_END /
308	`0`, / META_ALT /
309	`0`, / META_ATOMIC /
310	`0`, / META_BACKREF - more if group is >= 10 /
311	`1`+SIZEOFFSET, / META_BACKREF_BYNAME /
312	`1`, / META_BIGVALUE /
313	`3`, / META_CALLOUT_NUMBER /
314	`3`+SIZEOFFSET, / META_CALLOUT_STRING /
315	`0`, / META_CAPTURE /
316	`0`, / META_CIRCUMFLEX /
317	`0`, / META_CLASS /
318	`0`, / META_CLASS_EMPTY /
319	`0`, / META_CLASS_EMPTY_NOT /
320	`0`, / META_CLASS_END /
321	`0`, / META_CLASS_NOT /
322	`0`, / META_COND_ASSERT /
323	SIZEOFFSET, / META_COND_DEFINE /
324	`1`+SIZEOFFSET, / META_COND_NAME /
325	`1`+SIZEOFFSET, / META_COND_NUMBER /
326	`1`+SIZEOFFSET, / META_COND_RNAME /
327	`1`+SIZEOFFSET, / META_COND_RNUMBER /
328	`3`, / META_COND_VERSION /
329	`0`, / META_DOLLAR /
330	`0`, / META_DOT /
331	`0`, / META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k /
332	`0`, / META_KET /
333	`0`, / META_NOCAPTURE /
334	`1`, / META_OPTIONS /
335	`1`, / META_POSIX /
336	`1`, / META_POSIX_NEG /
337	`0`, / META_RANGE_ESCAPED /
338	`0`, / META_RANGE_LITERAL /
339	SIZEOFFSET, / META_RECURSE /
340	`1`+SIZEOFFSET, / META_RECURSE_BYNAME /
341	`0`, / META_SCRIPT_RUN /
342	`0`, / META_LOOKAHEAD /
343	`0`, / META_LOOKAHEADNOT /
344	SIZEOFFSET, / META_LOOKBEHIND /
345	SIZEOFFSET, / META_LOOKBEHINDNOT /
346	`0`, / META_LOOKAHEAD_NA /
347	SIZEOFFSET, / META_LOOKBEHIND_NA /
348	`1`, / META_MARK - plus the string length /
349	`0`, / META_ACCEPT /
350	`0`, / META_FAIL /
351	`0`, / META_COMMIT /
352	`1`, / META_COMMIT_ARG - plus the string length /
353	`0`, / META_PRUNE /
354	`1`, / META_PRUNE_ARG - plus the string length /
355	`0`, / META_SKIP /
356	`1`, / META_SKIP_ARG - plus the string length /
357	`0`, / META_THEN /
358	`1`, / META_THEN_ARG - plus the string length /
359	`0`, / META_ASTERISK /
360	`0`, / META_ASTERISK_PLUS /
361	`0`, / META_ASTERISK_QUERY /
362	`0`, / META_PLUS /
363	`0`, / META_PLUS_PLUS /
364	`0`, / META_PLUS_QUERY /
365	`0`, / META_QUERY /
366	`0`, / META_QUERY_PLUS /
367	`0`, / META_QUERY_QUERY /
368	`2`, / META_MINMAX /
369	`2`, / META_MINMAX_PLUS /
370	`2` / META_MINMAX_QUERY /
371	};
372
/* Types for skipping parts of a parsed pattern. The names suggest the stopping
point of each kind of skip (an alternation, the end of a class, a closing
parenthesis); the code that uses them is elsewhere in this file. */

enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET };
376
/* Macro for setting individual bits in class bitmaps. It took some
experimenting to figure out how to stop gcc 5.3.0 from warning with
-Wconversion. This version gets a warning:

  #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7))

Let's hope the apparently less efficient version isn't actually so bad if the
compiler is clever with identical subexpressions. Bit b lives in byte b/8 at
bit position b%8 (least significant bit first). */

#define SETBIT(a,b) a[(b)/8] = (uint8_t)(a[(b)/8] | (1u << ((b)&7)))
387
/* Values and flags for the unsigned xxcuflags variables that accompany xxcu
variables, which are concerned with first and required code units. A value
greater than or equal to REQ_NONE means "no code unit set"; otherwise the
matching xxcu variable is set, and the low valued bits are relevant. */

#define REQ_UNSET     0xffffffffu  /* Not yet found anything */
#define REQ_NONE      0xfffffffeu  /* Found not fixed character */
#define REQ_CASELESS  0x00000001u  /* Code unit in xxcu is caseless */
#define REQ_VARY      0x00000002u  /* Code unit is followed by non-literal */

/* These flags are used in the groupinfo vector. */

#define GI_SET_FIXED_LENGTH    0x80000000u
#define GI_NOT_FIXED_LENGTH    0x40000000u
#define GI_FIXED_LENGTH_MASK   0x0000ffffu

/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
and is fast (a good compiler can turn it into a subtraction and unsigned
comparison). */

#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
409
/* Table to identify hex digits. The tables in chartables are dependent on the
locale, and may mark arbitrary characters as digits. We want to recognize only
0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
costs 256 bytes, but it is a lot faster than doing character value tests (at
least in some simple cases I timed), and in some applications one wants PCRE2
to compile efficiently as well as match efficiently. The value in the table is
the binary hex digit value, or 0xff for non-hex digits. */

/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
UTF-8 mode. */

#ifndef EBCDIC
static const uint8_t xdigitab[] =
  {
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - '  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ( - /  */
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  */
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /*  8 - ?  */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  @ - G  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H - O  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  P - W  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  X - _  */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  ` - g  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h - o  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  p - w  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  x -127 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */

#else

/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */

static const uint8_t xdigitab[] =
  {
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7  0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 10 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  32- 39 20 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  40- 47    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  48- 55 30 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  56- 63    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - 71 40 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  72- |     */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  & - 87 50 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  88- 95    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  - -103 60 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ?     */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- "     */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g  80 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h -143    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p  90 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  q -159    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x  A0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  y -175    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ^ -183 B0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191    */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  { - G  C0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H -207    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  } - P  D0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Q -223    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  \ - X  E0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Y -239    */
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  F0 */
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/*  8 -255    */
#endif  /* EBCDIC */
496
497
498	/ Table for handling alphanumeric escaped characters. Positive returns are*
499	simple data values; negative values are for special things like \d and so on.
500	Zero means further processing is needed (for things like \x), or the escape is
501	invalid. /*
502
503	/ This is the "normal" table for ASCII systems or for EBCDIC systems running*
504	in UTF-8 mode. It runs from '0' to 'z'. /*
505
506	#ifndef EBCDIC
507	#define ESCAPES_FIRST CHAR_0
508	#define ESCAPES_LAST CHAR_z
509	#define UPPER_CASE(c) (c-32)
510
511	static const short int escapes[] = {
512	`0`, `0`,
513	`0`, `0`,
514	`0`, `0`,
515	`0`, `0`,
516	`0`, `0`,
517	CHAR_COLON, CHAR_SEMICOLON,
518	CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
519	CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
520	CHAR_COMMERCIAL_AT, -ESC_A,
521	-ESC_B, -ESC_C,
522	-ESC_D, -ESC_E,
523	`0`, -ESC_G,
524	-ESC_H, `0`,
525	`0`, -ESC_K,
526	`0`, `0`,
527	-ESC_N, `0`,
528	-ESC_P, -ESC_Q,
529	-ESC_R, -ESC_S,
530	`0`, `0`,
531	-ESC_V, -ESC_W,
532	-ESC_X, `0`,
533	-ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
534	CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
535	CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
536	CHAR_GRAVE_ACCENT, CHAR_BEL,
537	-ESC_b, `0`,
538	-ESC_d, CHAR_ESC,
539	CHAR_FF, `0`,
540	-ESC_h, `0`,
541	`0`, -ESC_k,
542	`0`, `0`,
543	CHAR_LF, `0`,
544	-ESC_p, `0`,
545	CHAR_CR, -ESC_s,
546	CHAR_HT, `0`,
547	-ESC_v, -ESC_w,
548	`0`, `0`,
549	-ESC_z
550	};
551
552	#else
553
554	/ This is the "abnormal" table for EBCDIC systems without UTF-8 support.*
555	It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
556	is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
557	because it is defined as 'a', which of course picks up the ASCII value. /*
558
559	#if 'a' == 0x81 /* Check for a real EBCDIC environment */
560	#define ESCAPES_FIRST CHAR_a
561	#define ESCAPES_LAST CHAR_9
562	#define UPPER_CASE(c) (c+64)
563	#else /* Testing in an ASCII environment */
564	#define ESCAPES_FIRST ((unsigned char)'\x81') /* EBCDIC 'a' */
565	#define ESCAPES_LAST ((unsigned char)'\xf9') /* EBCDIC '9' */
566	#define UPPER_CASE(c) (c-32)
567	#endif
568
569	static const short int escapes[] = {
570	/ 80 / CHAR_BEL, -ESC_b, `0`, -ESC_d, CHAR_ESC, CHAR_FF, `0`,
571	/ 88 / -ESC_h, `0`, `0`, `'{'`, `0`, `0`, `0`, `0`,
572	/ 90 / `0`, `0`, -ESC_k, `0`, `0`, CHAR_LF, `0`, -ESC_p,
573	/ 98 / `0`, CHAR_CR, `0`, `'}'`, `0`, `0`, `0`, `0`,
574	/ A0 / `0`, `'~'`, -ESC_s, CHAR_HT, `0`, -ESC_v, -ESC_w, `0`,
575	/ A8 / `0`, -ESC_z, `0`, `0`, `0`, `'['`, `0`, `0`,
576	/ B0 / `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
577	/ B8 / `0`, `0`, `0`, `0`, `0`, `']'`, `'='`, `'-'`,
578	/ C0 / `'{'`, -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, `0`, -ESC_G,
579	/ C8 / -ESC_H, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
580	/ D0 / `'}'`, `0`, -ESC_K, `0`, `0`, -ESC_N, `0`, -ESC_P,
581	/ D8 / -ESC_Q, -ESC_R, `0`, `0`, `0`, `0`, `0`, `0`,
582	/ E0 / `'\\'`, `0`, -ESC_S, `0`, `0`, -ESC_V, -ESC_W, -ESC_X,
583	/ E8 / `0`, -ESC_Z, `0`, `0`, `0`, `0`, `0`, `0`,
584	/ F0 / `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
585	/ F8 / `0`, `0`
586	};
587
588	/ We also need a table of characters that may follow \c in an EBCDIC*
589	environment for characters 0-31. /*
590
591	static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
592
593	#endif /* EBCDIC */
594
595
/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
searched linearly. Put all the names into a single string, in order to reduce
the number of relocations when a shared library is dynamically linked. The
string is built from string macros so that it works in UTF-8 mode on EBCDIC
platforms. */

typedef struct verbitem {
  unsigned int len;          /* Length of verb name */
  uint32_t meta;             /* Base META_ code */
  int has_arg;               /* Argument requirement */
} verbitem;
607
608	static const char verbnames[] =
609	"\0" / Empty name is a shorthand for MARK /
610	STRING_MARK0
611	STRING_ACCEPT0
612	STRING_F0
613	STRING_FAIL0
614	STRING_COMMIT0
615	STRING_PRUNE0
616	STRING_SKIP0
617	STRING_THEN;
618
619	static const verbitem verbs[] = {
620	{ `0`, META_MARK, +`1` }, / > 0 => must have an argument /
621	{ `4`, META_MARK, +`1` },
622	{ `6`, META_ACCEPT, -`1` }, / < 0 => Optional argument, convert to pre-MARK /
623	{ `1`, META_FAIL, -`1` },
624	{ `4`, META_FAIL, -`1` },
625	{ `6`, META_COMMIT, `0` },
626	{ `5`, META_PRUNE, `0` }, / Optional argument; bump META code if found /
627	{ `4`, META_SKIP, `0` },
628	{ `4`, META_THEN, `0` }
629	};
630
631	static const int verbcount = sizeof(verbs)/sizeof(verbitem);
632
633	/ Verb opcodes, indexed by their META code offset from META_MARK. /
634
635	static const uint32_t verbops[] = {
636	OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
637	OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
638
/* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */

typedef struct alasitem {
  unsigned int len;          /* Length of name */
  uint32_t meta;             /* Base META_ code */
} alasitem;
645
646	static const char alasnames[] =
647	STRING_pla0
648	STRING_plb0
649	STRING_napla0
650	STRING_naplb0
651	STRING_nla0
652	STRING_nlb0
653	STRING_positive_lookahead0
654	STRING_positive_lookbehind0
655	STRING_non_atomic_positive_lookahead0
656	STRING_non_atomic_positive_lookbehind0
657	STRING_negative_lookahead0
658	STRING_negative_lookbehind0
659	STRING_atomic0
660	STRING_sr0
661	STRING_asr0
662	STRING_script_run0
663	STRING_atomic_script_run;
664
665	static const alasitem alasmeta[] = {
666	{ `3`, META_LOOKAHEAD },
667	{ `3`, META_LOOKBEHIND },
668	{ `5`, META_LOOKAHEAD_NA },
669	{ `5`, META_LOOKBEHIND_NA },
670	{ `3`, META_LOOKAHEADNOT },
671	{ `3`, META_LOOKBEHINDNOT },
672	{ `18`, META_LOOKAHEAD },
673	{ `19`, META_LOOKBEHIND },
674	{ `29`, META_LOOKAHEAD_NA },
675	{ `30`, META_LOOKBEHIND_NA },
676	{ `18`, META_LOOKAHEADNOT },
677	{ `19`, META_LOOKBEHINDNOT },
678	{ `6`, META_ATOMIC },
679	{ `2`, META_SCRIPT_RUN }, / sr = script run /
680	{ `3`, META_ATOMIC_SCRIPT_RUN }, / asr = atomic script run /
681	{ `10`, META_SCRIPT_RUN }, / script run /
682	{ `17`, META_ATOMIC_SCRIPT_RUN } / atomic script run /
683	};
684
685	static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
686
687	/ Offsets from OP_STAR for case-independent and negative repeat opcodes. /
688
689	static uint32_t chartypeoffset[] = {
690	OP_STAR - OP_STAR, OP_STARI - OP_STAR,
691	OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
692
693	/ Tables of names of POSIX character classes and their lengths. The names are*
694	now all in a single string, to reduce the number of relocations when a shared
695	library is dynamically loaded. The list of lengths is terminated by a zero
696	length entry. The first three must be alpha, lower, upper, as this is assumed
697	for handling case independence. The indices for graph, print, and punct are
698	needed, so identify them. /*
699
700	static const char posix_names[] =
701	STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
702	STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
703	STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
704	STRING_word0 STRING_xdigit;
705
/* Lengths of the POSIX class names, in the same order as the names in
posix_names; the final zero entry terminates the list. */

static const uint8_t posix_name_lengths[] = {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
708
709	#define PC_GRAPH 8
710	#define PC_PRINT 9
711	#define PC_PUNCT 10
712
713	/ Table of class bit maps for each POSIX class. Each class is formed from a*
714	base map, with an optional addition or removal of another map. Then, for some
715	classes, there is some additional tweaking: for [:blank:] the vertical space
716	characters are removed, and for [:alpha:] and [:alnum:] the underscore
717	character is removed. The triples in the table consist of the base map offset,
718	second map offset or -1 if no second map, and a non-negative value for map
719	addition or a negative value for map subtraction (if there are two maps). The
720	absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
721	remove vertical space characters, 2 => remove underscore. /*
722
723	static const int posix_class_maps[] = {
724	cbit_word, cbit_digit, -`2`, / alpha /
725	cbit_lower, -`1`, `0`, / lower /
726	cbit_upper, -`1`, `0`, / upper /
727	cbit_word, -`1`, `2`, / alnum - word without underscore /
728	cbit_print, cbit_cntrl, `0`, / ascii /
729	cbit_space, -`1`, `1`, / blank - a GNU extension /
730	cbit_cntrl, -`1`, `0`, / cntrl /
731	cbit_digit, -`1`, `0`, / digit /
732	cbit_graph, -`1`, `0`, / graph /
733	cbit_print, -`1`, `0`, / print /
734	cbit_punct, -`1`, `0`, / punct /
735	cbit_space, -`1`, `0`, / space /
736	cbit_word, -`1`, `0`, / word - a Perl extension /
737	cbit_xdigit,-`1`, `0` / xdigit /
738	};
739
#ifdef SUPPORT_UNICODE

/* The POSIX class Unicode property substitutes that are used in UCP mode must
be in the order of the POSIX class names, defined above. Each class has a pair
of values; POSIX_SUBSIZE is derived from the table size on that basis. */

static int posix_substitutes[] = {
  PT_GC, ucp_L,     /* alpha */
  PT_PC, ucp_Ll,    /* lower */
  PT_PC, ucp_Lu,    /* upper */
  PT_ALNUM, 0,      /* alnum */
  -1, 0,            /* ascii, treat as non-UCP */
  -1, 1,            /* blank, treat as \h */
  PT_PC, ucp_Cc,    /* cntrl */
  PT_PC, ucp_Nd,    /* digit */
  PT_PXGRAPH, 0,    /* graph */
  PT_PXPRINT, 0,    /* print */
  PT_PXPUNCT, 0,    /* punct */
  PT_PXSPACE, 0,    /* space */   /* Xps is POSIX space, but from 8.34 */
  PT_WORD, 0,       /* word  */   /* Perl and POSIX space are the same */
  -1, 0             /* xdigit, treat as non-UCP */
};
#define POSIX_SUBSIZE (sizeof(posix_substitutes) / (2*sizeof(uint32_t)))
#endif  /* SUPPORT_UNICODE */
763
/* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
are allowed. The full masks are built by OR-ing the literal subset with the
remaining option bits. */

#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_ENDANCHORED| \
   PCRE2_FIRSTLINE|PCRE2_LITERAL|PCRE2_MATCH_INVALID_UTF| \
   PCRE2_NO_START_OPTIMIZE|PCRE2_NO_UTF_CHECK|PCRE2_USE_OFFSET_LIMIT|PCRE2_UTF)

#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS| \
   PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
   PCRE2_ALT_VERBNAMES|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \
   PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \
   PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \
   PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \
   PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY)

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
   (PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD)

#define PUBLIC_COMPILE_EXTRA_OPTIONS \
   (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
    PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
    PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \
    PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK)
789
790	/ Compile time error code numbers. They are given names so that they can more*
791	easily be tracked. When a new number is added, the tables called eint1 and
792	eint2 in pcre2posix.c may need to be updated, and a new error text must be
793	added to compile_error_texts in pcre2_error.c. Also, the error codes in
794	pcre2.h.in must be updated - their values are exactly 100 greater than these
795	values. /*
796
797	enum { ERR0 = COMPILE_ERROR_BASE,
798	ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10,
799	ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
800	ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
801	ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
802	ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
803	ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
804	ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
805	ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
806	ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
807	ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98, ERR99 };
808
/* This is a table of start-of-pattern options such as (*UTF) and settings such
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
generic and always supported. The enum below classifies what the "value" field
of each table entry means. */

enum { PSO_OPT,     /* Value is an option bit */
       PSO_FLG,     /* Value is a flag bit */
       PSO_NL,      /* Value is a newline type */
       PSO_BSR,     /* Value is a \R type */
       PSO_LIMH,    /* Read integer value for heap limit */
       PSO_LIMM,    /* Read integer value for match limit */
       PSO_LIMD };  /* Read integer value for depth limit */
821
/* One start-of-pattern setting: its name text, the length of that text, a
PSO_xxx type code, and a type-dependent value. */

typedef struct pso {
  const uint8_t *name;    /* Text of the setting */
  uint16_t length;        /* Length of name */
  uint16_t type;          /* One of the PSO_xxx codes */
  uint32_t value;         /* Meaning depends on type */
} pso;
828
829	/ NB: STRING_UTFn_RIGHTPAR contains the length as well /
830
831	static pso pso_list[] = {
832	{ (uint8_t *)STRING_UTFn_RIGHTPAR, PSO_OPT, PCRE2_UTF },
833	{ (uint8_t *)STRING_UTF_RIGHTPAR, `4`, PSO_OPT, PCRE2_UTF },
834	{ (uint8_t *)STRING_UCP_RIGHTPAR, `4`, PSO_OPT, PCRE2_UCP },
835	{ (uint8_t *)STRING_NOTEMPTY_RIGHTPAR, `9`, PSO_FLG, PCRE2_NOTEMPTY_SET },
836	{ (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR, `17`, PSO_FLG, PCRE2_NE_ATST_SET },
837	{ (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR, `16`, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
838	{ (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, `18`, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
839	{ (uint8_t *)STRING_NO_JIT_RIGHTPAR, `7`, PSO_FLG, PCRE2_NOJIT },
840	{ (uint8_t *)STRING_NO_START_OPT_RIGHTPAR, `13`, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
841	{ (uint8_t *)STRING_LIMIT_HEAP_EQ, `11`, PSO_LIMH, `0` },
842	{ (uint8_t *)STRING_LIMIT_MATCH_EQ, `12`, PSO_LIMM, `0` },
843	{ (uint8_t *)STRING_LIMIT_DEPTH_EQ, `12`, PSO_LIMD, `0` },
844	{ (uint8_t *)STRING_LIMIT_RECURSION_EQ, `16`, PSO_LIMD, `0` },
845	{ (uint8_t *)STRING_CR_RIGHTPAR, `3`, PSO_NL, PCRE2_NEWLINE_CR },
846	{ (uint8_t *)STRING_LF_RIGHTPAR, `3`, PSO_NL, PCRE2_NEWLINE_LF },
847	{ (uint8_t *)STRING_CRLF_RIGHTPAR, `5`, PSO_NL, PCRE2_NEWLINE_CRLF },
848	{ (uint8_t *)STRING_ANY_RIGHTPAR, `4`, PSO_NL, PCRE2_NEWLINE_ANY },
849	{ (uint8_t *)STRING_NUL_RIGHTPAR, `4`, PSO_NL, PCRE2_NEWLINE_NUL },
850	{ (uint8_t *)STRING_ANYCRLF_RIGHTPAR, `8`, PSO_NL, PCRE2_NEWLINE_ANYCRLF },
851	{ (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR, `12`, PSO_BSR, PCRE2_BSR_ANYCRLF },
852	{ (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR, `12`, PSO_BSR, PCRE2_BSR_UNICODE }
853	};
854
855	/ This table is used when converting repeating opcodes into possessified*
856	versions as a result of an explicit possessive quantifier such as ++. A zero
857	value means there is no possessified version - in those cases the item in
858	question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
859	because all relevant opcodes are less than that. /*
860
861	static const uint8_t opcode_possessify[] = {
862	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, / 0 - 15 /
863	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, / 16 - 31 /
864
865	`0`, / NOTI /
866	OP_POSSTAR, `0`, / STAR, MINSTAR /
867	OP_POSPLUS, `0`, / PLUS, MINPLUS /
868	OP_POSQUERY, `0`, / QUERY, MINQUERY /
869	OP_POSUPTO, `0`, / UPTO, MINUPTO /
870	`0`, / EXACT /
871	`0`, `0`, `0`, `0`, / POS{STAR,PLUS,QUERY,UPTO} /
872
873	OP_POSSTARI, `0`, / STARI, MINSTARI /
874	OP_POSPLUSI, `0`, / PLUSI, MINPLUSI /
875	OP_POSQUERYI, `0`, / QUERYI, MINQUERYI /
876	OP_POSUPTOI, `0`, / UPTOI, MINUPTOI /
877	`0`, / EXACTI /
878	`0`, `0`, `0`, `0`, / POS{STARI,PLUSI,QUERYI,UPTOI} /
879
880	OP_NOTPOSSTAR, `0`, / NOTSTAR, NOTMINSTAR /
881	OP_NOTPOSPLUS, `0`, / NOTPLUS, NOTMINPLUS /
882	OP_NOTPOSQUERY, `0`, / NOTQUERY, NOTMINQUERY /
883	OP_NOTPOSUPTO, `0`, / NOTUPTO, NOTMINUPTO /
884	`0`, / NOTEXACT /
885	`0`, `0`, `0`, `0`, / NOTPOS{STAR,PLUS,QUERY,UPTO} /
886
887	OP_NOTPOSSTARI, `0`, / NOTSTARI, NOTMINSTARI /
888	OP_NOTPOSPLUSI, `0`, / NOTPLUSI, NOTMINPLUSI /
889	OP_NOTPOSQUERYI, `0`, / NOTQUERYI, NOTMINQUERYI /
890	OP_NOTPOSUPTOI, `0`, / NOTUPTOI, NOTMINUPTOI /
891	`0`, / NOTEXACTI /
892	`0`, `0`, `0`, `0`, / NOTPOS{STARI,PLUSI,QUERYI,UPTOI} /
893
894	OP_TYPEPOSSTAR, `0`, / TYPESTAR, TYPEMINSTAR /
895	OP_TYPEPOSPLUS, `0`, / TYPEPLUS, TYPEMINPLUS /
896	OP_TYPEPOSQUERY, `0`, / TYPEQUERY, TYPEMINQUERY /
897	OP_TYPEPOSUPTO, `0`, / TYPEUPTO, TYPEMINUPTO /
898	`0`, / TYPEEXACT /
899	`0`, `0`, `0`, `0`, / TYPEPOS{STAR,PLUS,QUERY,UPTO} /
900
901	OP_CRPOSSTAR, `0`, / CRSTAR, CRMINSTAR /
902	OP_CRPOSPLUS, `0`, / CRPLUS, CRMINPLUS /
903	OP_CRPOSQUERY, `0`, / CRQUERY, CRMINQUERY /
904	OP_CRPOSRANGE, `0`, / CRRANGE, CRMINRANGE /
905	`0`, `0`, `0`, `0`, / CRPOS{STAR,PLUS,QUERY,RANGE} /
906
907	`0`, `0`, `0`, / CLASS, NCLASS, XCLASS /
908	`0`, `0`, / REF, REFI /
909	`0`, `0`, / DNREF, DNREFI /
910	`0`, `0` / RECURSE, CALLOUT /
911	};
912
913
#ifdef DEBUG_SHOW_PARSED
/*************************************************
*     Show the parsed pattern for debugging      *
*************************************************/

/* For debugging the pre-scan, this code, which outputs the parsed data vector,
can be enabled. It walks cb->parsed_pattern one item at a time, writing one
line per item to stderr, and stops at META_END or at the first META code it
does not recognize.

Argument:  cb   the compile block whose parsed_pattern vector is displayed
Returns:   nothing
*/

static void show_parsed(compile_block *cb)
{
uint32_t *pptr = cb->parsed_pattern;

for (;;)
  {
  int max, min;
  PCRE2_SIZE offset;
  uint32_t i;
  uint32_t length;
  uint32_t meta_arg = META_DATA(*pptr);

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);

  /* Values below META_END are literal characters; show the printable ones. */

  if (*pptr < META_END)
    {
    if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", *pptr);
    pptr++;
    }

  /* Otherwise decode the META item. pptr is advanced past the META word here,
  and each case consumes whatever additional data words belong to its item. */

  else switch (META_CODE(*pptr++))
    {
    default:
    fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
    return;

    case META_END:
    fprintf(stderr, "META_END\n");
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %d", meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %d %zd", meta_arg, offset);
    break;

    /* For group numbers below 10 the offset comes from cb->small_ref_offset
    rather than from the parsed vector. */

    case META_BACKREF:
    if (meta_arg < 10)
      offset = cb->small_ref_offset[meta_arg];
    else
      GETOFFSET(offset, pptr);
    fprintf(stderr, "META_BACKREF %d %zd", meta_arg, offset);
    break;

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      uint32_t ptype = *pptr >> 16;
      uint32_t pvalue = *pptr++ & 0xffff;
      fprintf(stderr, "META \\%c %d %d", (meta_arg == ESC_P)? 'P':'p',
        ptype, pvalue);
      }
    else
      {
      uint32_t cc;
      /* There's just one escape we might have here that isn't negated in the
      escapes table. */
      if (meta_arg == ESC_g) cc = CHAR_g;
      else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
        {
        if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
        }
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", cc);
      }
    break;

    case META_MINMAX:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}", min, max);
    else
      fprintf(stderr, "META {%d,}", min);
    break;

    case META_MINMAX_QUERY:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}?", min, max);
    else
      fprintf(stderr, "META {%d,}?", min);
    break;

    case META_MINMAX_PLUS:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}+", min, max);
    else
      fprintf(stderr, "META {%d,}+", min);
    break;

    case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
    case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
    case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
    case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
    case META_DOT: fprintf(stderr, "META_DOT"); break;
    case META_ASTERISK: fprintf(stderr, "META *"); break;
    case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
    case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
    case META_PLUS: fprintf(stderr, "META +"); break;
    case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
    case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
    case META_QUERY: fprintf(stderr, "META ?"); break;
    case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
    case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;

    case META_ATOMIC: fprintf(stderr, "META (?>"); break;
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
    case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
    case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
    case META_KET: fprintf(stderr, "META )"); break;
    case META_ALT: fprintf(stderr, "META | %d", meta_arg); break;

    case META_CLASS: fprintf(stderr, "META ["); break;
    case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
    case META_CLASS_END: fprintf(stderr, "META ]"); break;
    case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
    case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;

    case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
    case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;

    case META_POSIX: fprintf(stderr, "META_POSIX %d", *pptr++); break;
    case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %d", *pptr++); break;

    case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
    case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
    case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
    case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
    case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
    case META_THEN: fprintf(stderr, "META (*THEN)"); break;

    case META_OPTIONS: fprintf(stderr, "META_OPTIONS 0x%02x", *pptr++); break;

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %d offset=", meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    case META_LOOKBEHIND_NA:
    fprintf(stderr, "META (*naplb: %d offset=", meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %d offset=", meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    /* Three data words follow; the third is printed as the callout number and
    the first two as the "next" pair. */

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%d) next=%d/%d", pptr[2], pptr[0],
       pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%d offset=", *pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zd next=%d/%d", offset, patoffset, patlength);
      }
    break;

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?(&name) length=%d offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%d offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    /* The number is stored after the offset words, so it is printed before
    the offset is read and then skipped afterwards. */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %d offset=", pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    pptr++;
    break;

    case META_COND_DEFINE:
    fprintf(stderr, "META (?(DEFINE) offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%d.", *pptr++);
    fprintf(stderr, "%d)", *pptr++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%d offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%d offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    /* This is kept as a name, because it might be. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%d offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    /* The verbs with arguments share one tail that prints the argument: a
    length word followed by that many code points. */

    case META_MARK:
    fprintf(stderr, "META (*MARK:");
    goto SHOWARG;

    case META_COMMIT_ARG:
    fprintf(stderr, "META (*COMMIT:");
    goto SHOWARG;

    case META_PRUNE_ARG:
    fprintf(stderr, "META (*PRUNE:");
    goto SHOWARG;

    case META_SKIP_ARG:
    fprintf(stderr, "META (*SKIP:");
    goto SHOWARG;

    case META_THEN_ARG:
    fprintf(stderr, "META (*THEN:");
    SHOWARG:
    length = *pptr++;
    for (i = 0; i < length; i++)
      {
      uint32_t cc = *pptr++;
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", cc);
        else fprintf(stderr, "\\x{%x}", cc);
      }
    fprintf(stderr, ") length=%u", length);
    break;
    }
  fprintf(stderr, "\n");
  }
return;
}
#endif  /* DEBUG_SHOW_PARSED */
1183
1184
1185
1186	/*************************************************
1187	* Copy compiled code *
1188	*************************************************/
1189
1190	/ Compiled JIT code cannot be copied, so the new compiled block has no*
1191	associated JIT data. /*
1192
1193	PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
1194	pcre2_code_copy(const pcre2_code *code)
1195	{
1196	PCRE2_SIZE* ref_count;
1197	pcre2_code *newcode;
1198
1199	if (code == NULL) return NULL;
1200	newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1201	if (newcode == NULL) return NULL;
1202	memcpy(newcode, code, code->blocksize);
1203	newcode->executable_jit = NULL;
1204
1205	/ If the code is one that has been deserialized, increment the reference count*
1206	in the decoded tables. /*
1207
1208	if ((code->flags & PCRE2_DEREF_TABLES) != `0`)
1209	{
1210	ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1211	(*ref_count)++;
1212	}
1213
1214	return newcode;
1215	}
1216
1217
1218
1219	/*************************************************
1220	* Copy compiled code and character tables *
1221	*************************************************/
1222
1223	/ Compiled JIT code cannot be copied, so the new compiled block has no*
1224	associated JIT data. This version of code_copy also makes a separate copy of
1225	the character tables. /*
1226
1227	PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
1228	pcre2_code_copy_with_tables(const pcre2_code *code)
1229	{
1230	PCRE2_SIZE* ref_count;
1231	pcre2_code *newcode;
1232	uint8_t *newtables;
1233
1234	if (code == NULL) return NULL;
1235	newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1236	if (newcode == NULL) return NULL;
1237	memcpy(newcode, code, code->blocksize);
1238	newcode->executable_jit = NULL;
1239
1240	newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
1241	code->memctl.memory_data);
1242	if (newtables == NULL)
1243	{
1244	code->memctl.free((void *)newcode, code->memctl.memory_data);
1245	return NULL;
1246	}
1247	memcpy(newtables, code->tables, TABLES_LENGTH);
1248	ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);
1249	*ref_count = `1`;
1250
1251	newcode->tables = newtables;
1252	newcode->flags \|= PCRE2_DEREF_TABLES;
1253	return newcode;
1254	}
1255
1256
1257
1258	/*************************************************
1259	* Free compiled code *
1260	*************************************************/
1261
1262	PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
1263	pcre2_code_free(pcre2_code *code)
1264	{
1265	PCRE2_SIZE* ref_count;
1266
1267	if (code != NULL)
1268	{
1269	#ifdef SUPPORT_JIT
1270	if (code->executable_jit != NULL)
1271	PRIV(jit_free)(code->executable_jit, &code->memctl);
1272	#endif
1273
1274	if ((code->flags & PCRE2_DEREF_TABLES) != `0`)
1275	{
1276	/ Decoded tables belong to the codes after deserialization, and they must*
1277	be freed when there are no more references to them. The ref_count should*
1278	always be > 0. /*
1279
1280	ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1281	if (*ref_count > `0`)
1282	{
1283	(*ref_count)--;
1284	if (*ref_count == `0`)
1285	code->memctl.free((void *)code->tables, code->memctl.memory_data);
1286	}
1287	}
1288
1289	code->memctl.free(code, code->memctl.memory_data);
1290	}
1291	}
1292
1293
1294
1295	/*************************************************
1296	* Read a number, possibly signed *
1297	*************************************************/
1298
1299	/ This function is used to read numbers in the pattern. The initial pointer*
1300	must be the sign or first digit of the number. When relative values (introduced
1301	by + or -) are allowed, they are relative group numbers, and the result must be
1302	greater than zero.
1303
1304	Arguments:
1305	ptrptr points to the character pointer variable
1306	ptrend points to the end of the input string
1307	allow_sign if < 0, sign not allowed; if >= 0, sign is relative to this
1308	max_value the largest number allowed
1309	max_error the error to give for an over-large number
1310	intptr where to put the result
1311	errcodeptr where to put an error code
1312
1313	Returns: TRUE - a number was read
1314	FALSE - errorcode == 0 => no number was found
1315	errorcode != 0 => an error occurred
1316	*/
1317
1318	static BOOL
1319	read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1320	uint32_t max_value, uint32_t max_error, int intptr, int* *errorcodeptr)
1321	{
1322	int sign = `0`;
1323	uint32_t n = `0`;
1324	PCRE2_SPTR ptr = *ptrptr;
1325	BOOL yield = FALSE;
1326
1327	*errorcodeptr = `0`;
1328
1329	if (allow_sign >= `0` && ptr < ptrend)
1330	{
1331	if (*ptr == CHAR_PLUS)
1332	{
1333	sign = +`1`;
1334	max_value -= allow_sign;
1335	ptr++;
1336	}
1337	else if (*ptr == CHAR_MINUS)
1338	{
1339	sign = -`1`;
1340	ptr++;
1341	}
1342	}
1343
1344	if (ptr >= ptrend \|\| !IS_DIGIT(ptr)) return* FALSE;
1345	while (ptr < ptrend && IS_DIGIT(*ptr))
1346	{
1347	n = n * `10` + *ptr++ - CHAR_0;
1348	if (n > max_value)
1349	{
1350	*errorcodeptr = max_error;
1351	goto EXIT;
1352	}
1353	}
1354
1355	if (allow_sign >= `0` && sign != `0`)
1356	{
1357	if (n == `0`)
1358	{
1359	errorcodeptr = ERR26; /* +0 and -0 are not allowed /
1360	goto EXIT;
1361	}
1362
1363	if (sign > `0`) n += allow_sign;
1364	else if ((int)n > allow_sign)
1365	{
1366	errorcodeptr = ERR15; /* Non-existent subpattern /
1367	goto EXIT;
1368	}
1369	else n = allow_sign + `1` - n;
1370	}
1371
1372	yield = TRUE;
1373
1374	EXIT:
1375	*intptr = n;
1376	*ptrptr = ptr;
1377	return yield;
1378	}
1379
1380
1381
1382	/*************************************************
1383	* Read repeat counts *
1384	*************************************************/
1385
1386	/ Read an item of the form {n,m} and return the values if non-NULL pointers*
1387	are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1388	larger value is used for "unlimited". We have to use signed arguments for
1389	read_number() because it is capable of returning a signed value.
1390
1391	Arguments:
1392	ptrptr points to pointer to character after'{'
1393	ptrend pointer to end of input
1394	minp if not NULL, pointer to int for min
1395	maxp if not NULL, pointer to int for max (-1 if no max)
1396	returned as -1 if no max
1397	errorcodeptr points to error code variable
1398
1399	Returns: FALSE if not a repeat quantifier, errorcode set zero
1400	FALSE on error, with errorcode set non-zero
1401	TRUE on success, with pointer updated to point after '}'
1402	*/
1403
1404	static BOOL
1405	read_repeat_counts(PCRE2_SPTR ptrptr, PCRE2_SPTR ptrend, uint32_t minp,
1406	uint32_t maxp, int* *errorcodeptr)
1407	{
1408	PCRE2_SPTR p;
1409	BOOL yield = FALSE;
1410	BOOL had_comma = FALSE;
1411	int32_t min = `0`;
1412	int32_t max = REPEAT_UNLIMITED; / This value is larger than MAX_REPEAT_COUNT /
1413
1414	/ Check the syntax /
1415
1416	*errorcodeptr = `0`;
1417	for (p = *ptrptr;; p++)
1418	{
1419	uint32_t c;
1420	if (p >= ptrend) return FALSE;
1421	c = *p;
1422	if (IS_DIGIT(c)) continue;
1423	if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1424	if (c == CHAR_COMMA)
1425	{
1426	if (had_comma) return FALSE;
1427	had_comma = TRUE;
1428	}
1429	else return FALSE;
1430	}
1431
1432	/ The only error from read_number() is for a number that is too big. /
1433
1434	p = *ptrptr;
1435	if (!read_number(&p, ptrend, -`1`, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1436	goto EXIT;
1437
1438	if (*p == CHAR_RIGHT_CURLY_BRACKET)
1439	{
1440	p++;
1441	max = min;
1442	}
1443	else
1444	{
1445	if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1446	{
1447	if (!read_number(&p, ptrend, -`1`, MAX_REPEAT_COUNT, ERR5, &max,
1448	errorcodeptr))
1449	goto EXIT;
1450	if (max < min)
1451	{
1452	*errorcodeptr = ERR4;
1453	goto EXIT;
1454	}
1455	}
1456	p++;
1457	}
1458
1459	yield = TRUE;
1460	if (minp != NULL) *minp = (uint32_t)min;
1461	if (maxp != NULL) *maxp = (uint32_t)max;
1462
1463	/ Update the pattern pointer /
1464
1465	EXIT:
1466	*ptrptr = p;
1467	return yield;
1468	}
1469
1470
1471
1472	/*************************************************
1473	* Handle escapes *
1474	*************************************************/
1475
1476	/ This function is called when a \ has been encountered. It either returns a*
1477	positive value for a simple escape such as \d, or 0 for a data character, which
1478	is placed in chptr. A backreference to group n is returned as negative n. On
1479	entry, ptr is pointing at the character after \. On exit, it points after the
1480	final code unit of the escape sequence.
1481
1482	This function is also called from pcre2_substitute() to handle escape sequences
1483	in replacement strings. In this case, the cb argument is NULL, and in the case
1484	of escapes that have further processing, only sequences that define a data
1485	character are recognised. The isclass argument is not relevant; the options
1486	argument is the final value of the compiled pattern's options.
1487
1488	Arguments:
1489	ptrptr points to the input position pointer
1490	ptrend points to the end of the input
1491	chptr points to a returned data character
1492	errorcodeptr points to the errorcode variable (containing zero)
1493	options the current options bits
1494	isclass TRUE if inside a character class
1495	cb compile data block or NULL when called from pcre2_substitute()
1496
1497	Returns: zero => a data character
1498	positive => a special escape sequence
1499	negative => a numerical back reference
1500	on error, errorcodeptr is set non-zero
1501	*/
1502
1503	int
1504	PRIV(check_escape)(PCRE2_SPTR ptrptr, PCRE2_SPTR ptrend, uint32_t chptr,
1505	int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass,
1506	compile_block *cb)
1507	{
1508	BOOL utf = (options & PCRE2_UTF) != `0`;
1509	PCRE2_SPTR ptr = *ptrptr;
1510	uint32_t c, cc;
1511	int escape = `0`;
1512	int i;
1513
1514	/ If backslash is at the end of the string, it's an error. /
1515
1516	if (ptr >= ptrend)
1517	{
1518	*errorcodeptr = ERR1;
1519	return `0`;
1520	}
1521
1522	GETCHARINCTEST(c, ptr); / Get character value, increment pointer /
1523	errorcodeptr = `0`; /* Be optimistic /
1524
1525	/ Non-alphanumerics are literals, so we just leave the value in c. An initial*
1526	value test saves a memory lookup for code points outside the alphanumeric
1527	range. /*
1528
1529	if (c < ESCAPES_FIRST \|\| c > ESCAPES_LAST) {} / Definitely literal /
1530
1531	/ Otherwise, do a table lookup. Non-zero values need little processing here. A*
1532	positive value is a literal value for something like \n. A negative value is
1533	the negation of one of the ESC_ macros that is passed back for handling by the
1534	calling function. Some extra checking is needed for \N because only \N{U+dddd}
1535	is supported. If the value is zero, further processing is handled below. /*
1536
1537	else if ((i = escapes[c - ESCAPES_FIRST]) != `0`)
1538	{
1539	if (i > `0`)
1540	{
1541	c = (uint32_t)i;
1542	if (c == CHAR_CR && (extra_options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != `0`)
1543	c = CHAR_LF;
1544	}
1545	else / Negative table entry /
1546	{
1547	escape = -i; / Else return a special escape /
1548	if (cb != NULL && (escape == ESC_P \|\| escape == ESC_p \|\| escape == ESC_X))
1549	cb->external_flags \|= PCRE2_HASBKPORX; / Note \P, \p, or \X /
1550
1551	/ Perl supports \N{name} for character names and \N{U+dddd} for numerical*
1552	Unicode code points, as well as plain \N for "not newline". PCRE does not
1553	support \N{name}. However, it does support quantification such as \N{2,3},
1554	so if \N{ is not followed by U+dddd we check for a quantifier. /*
1555
1556	if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1557	{
1558	PCRE2_SPTR p = ptr + `1`;
1559
1560	/ \N{U+ can be handled by the \x{ code. However, this construction is*
1561	not valid in EBCDIC environments because it specifies a Unicode
1562	character, not a codepoint in the local code. For example \N{U+0041}
1563	must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1564	casing semantics for the entire pattern, so allow it only in UTF (i.e.
1565	Unicode) mode. /*
1566
1567	if (ptrend - p > `1` && *p == CHAR_U && p[`1`] == CHAR_PLUS)
1568	{
1569	#ifdef EBCDIC
1570	*errorcodeptr = ERR93;
1571	#else
1572	if (utf)
1573	{
1574	ptr = p + `1`;
1575	escape = `0`; / Not a fancy escape after all /
1576	goto COME_FROM_NU;
1577	}
1578	else *errorcodeptr = ERR93;
1579	#endif
1580	}
1581
1582	/ Give an error if what follows is not a quantifier, but don't override*
1583	an error set by the quantifier reader (e.g. number overflow). /*
1584
1585	else
1586	{
1587	if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1588	*errorcodeptr == `0`)
1589	*errorcodeptr = ERR37;
1590	}
1591	}
1592	}
1593	}
1594
1595	/ Escapes that need further processing, including those that are unknown, have*
1596	a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1597	\o, and \x are recognized (\u and \U can never appear as they are used for case
1598	forcing). /*
1599
1600	else
1601	{
1602	int s;
1603	PCRE2_SPTR oldptr;
1604	BOOL overflow;
1605	BOOL alt_bsux =
1606	((options & PCRE2_ALT_BSUX) \| (extra_options & PCRE2_EXTRA_ALT_BSUX)) != `0`;
1607
1608	/ Filter calls from pcre2_substitute(). /
1609
1610	if (cb == NULL)
1611	{
1612	if (c != CHAR_c && c != CHAR_o && c != CHAR_x)
1613	{
1614	*errorcodeptr = ERR3;
1615	return `0`;
1616	}
1617	alt_bsux = FALSE; / Do not modify \x handling /
1618	}
1619
1620	switch (c)
1621	{
1622	/ A number of Perl escapes are not handled by PCRE. We give an explicit*
1623	error. /*
1624
1625	case CHAR_F:
1626	case CHAR_l:
1627	case CHAR_L:
1628	*errorcodeptr = ERR37;
1629	break;
1630
1631	/ \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX*
1632	is set. Otherwise, \u must be followed by exactly four hex digits or, if
1633	PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1634	Otherwise it is a lowercase u letter. This gives some compatibility with
1635	ECMAScript (aka JavaScript). /*
1636
1637	case CHAR_u:
1638	if (!alt_bsux) errorcodeptr = ERR37; else*
1639	{
1640	uint32_t xc;
1641
1642	if (ptr >= ptrend) break;
1643	if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1644	(extra_options & PCRE2_EXTRA_ALT_BSUX) != `0`)
1645	{
1646	PCRE2_SPTR hptr = ptr + `1`;
1647	cc = `0`;
1648
1649	while (hptr < ptrend && (xc = XDIGIT(*hptr)) != `0xff`)
1650	{
1651	if ((cc & `0xf0000000`) != `0`) / Test for 32-bit overflow /
1652	{
1653	*errorcodeptr = ERR77;
1654	ptr = hptr; / Show where /
1655	break; / hptr != } will cause another break below /*
1656	}
1657	cc = (cc << `4`) \| xc;
1658	hptr++;
1659	}
1660
1661	if (hptr == ptr + `1` \|\| / No hex digits /
1662	hptr >= ptrend \|\| / Hit end of input /
1663	hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator /
1664	break; / Hex escape not recognized /
1665
1666	c = cc; / Accept the code point /
1667	ptr = hptr + `1`;
1668	}
1669
1670	else / Must be exactly 4 hex digits /
1671	{
1672	if (ptrend - ptr < `4`) break; / Less than 4 chars /
1673	if ((cc = XDIGIT(ptr[`0`])) == `0xff`) break; / Not a hex digit /
1674	if ((xc = XDIGIT(ptr[`1`])) == `0xff`) break; / Not a hex digit /
1675	cc = (cc << `4`) \| xc;
1676	if ((xc = XDIGIT(ptr[`2`])) == `0xff`) break; / Not a hex digit /
1677	cc = (cc << `4`) \| xc;
1678	if ((xc = XDIGIT(ptr[`3`])) == `0xff`) break; / Not a hex digit /
1679	c = (cc << `4`) \| xc;
1680	ptr += `4`;
1681	}
1682
1683	if (utf)
1684	{
1685	if (c > `0x10ffffU`) *errorcodeptr = ERR77;
1686	else
1687	if (c >= `0xd800` && c <= `0xdfff` &&
1688	(extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == `0`)
1689	*errorcodeptr = ERR73;
1690	}
1691	else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1692	}
1693	break;
1694
1695	/ \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,*
1696	in which case it is an upper case letter. /*
1697
1698	case CHAR_U:
1699	if (!alt_bsux) *errorcodeptr = ERR37;
1700	break;
1701
1702	/ In a character class, \g is just a literal "g". Outside a character*
1703	class, \g must be followed by one of a number of specific things:
1704
1705	(1) A number, either plain or braced. If positive, it is an absolute
1706	backreference. If negative, it is a relative backreference. This is a Perl
1707	5.10 feature.
1708
1709	(2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1710	is part of Perl's movement towards a unified syntax for back references. As
1711	this is synonymous with \k{name}, we fudge it up by pretending it really
1712	was \k{name}.
1713
1714	(3) For Oniguruma compatibility we also support \g followed by a name or a
1715	number either in angle brackets or in single quotes. However, these are
1716	(possibly recursive) subroutine calls, _not_ backreferences. We return
1717	the ESC_g code.
1718
1719	Summary: Return a negative number for a numerical back reference, ESC_k for
1720	a named back reference, and ESC_g for a named or numbered subroutine call.
1721	*/
1722
1723	case CHAR_g:
1724	if (isclass) break;
1725
1726	if (ptr >= ptrend)
1727	{
1728	*errorcodeptr = ERR57;
1729	break;
1730	}
1731
1732	if (ptr == CHAR_LESS_THAN_SIGN \|\| ptr == CHAR_APOSTROPHE)
1733	{
1734	escape = ESC_g;
1735	break;
1736	}
1737
1738	/ If there is a brace delimiter, try to read a numerical reference. If*
1739	there isn't one, assume we have a name and treat it as \k. /*
1740
1741	if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1742	{
1743	PCRE2_SPTR p = ptr + `1`;
1744	if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1745	errorcodeptr))
1746	{
1747	if (errorcodeptr == `0`) escape = ESC_k; /* No number found /
1748	break;
1749	}
1750	if (p >= ptrend \|\| *p != CHAR_RIGHT_CURLY_BRACKET)
1751	{
1752	*errorcodeptr = ERR57;
1753	break;
1754	}
1755	ptr = p + `1`;
1756	}
1757
1758	/ Read an undelimited number /
1759
1760	else
1761	{
1762	if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1763	errorcodeptr))
1764	{
1765	if (errorcodeptr == `0`) errorcodeptr = ERR57; / No number found /
1766	break;
1767	}
1768	}
1769
1770	if (s <= `0`)
1771	{
1772	*errorcodeptr = ERR15;
1773	break;
1774	}
1775
1776	escape = -s;
1777	break;
1778
1779	/ The handling of escape sequences consisting of a string of digits*
1780	starting with one that is not zero is not straightforward. Perl has changed
1781	over the years. Nowadays \g{} for backreferences and \o{} for octal are
1782	recommended to avoid the ambiguities in the old syntax.
1783
1784	Outside a character class, the digits are read as a decimal number. If the
1785	number is less than 10, or if there are that many previous extracting left
1786	brackets, it is a back reference. Otherwise, up to three octal digits are
1787	read to form an escaped character code. Thus \123 is likely to be octal 123
1788	(cf \0123, which is octal 012 followed by the literal 3).
1789
1790	Inside a character class, \ followed by a digit is always either a literal
1791	8 or 9 or an octal number. /*
1792
1793	case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1794	case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1795
1796	if (!isclass)
1797	{
1798	oldptr = ptr;
1799	ptr--; / Back to the digit /
1800
1801	/ As we know we are at a digit, the only possible error from*
1802	read_number() is a number that is too large to be a group number. In this
1803	case we fall through handle this as not a group reference. If we have
1804	read a small enough number, check for a back reference.
1805
1806	\1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1807	are octal escapes if there are not that many previous captures. /*
1808
1809	if (read_number(&ptr, ptrend, -`1`, INT_MAX/`10` - `1`, `0`, &s, errorcodeptr) &&
1810	(s < `10` \|\| oldptr[-`1`] >= CHAR_8 \|\| s <= (int)cb->bracount))
1811	{
1812	if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61;
1813	else escape = -s; / Indicates a back reference /
1814	break;
1815	}
1816
1817	ptr = oldptr; / Put the pointer back and fall through /
1818	}
1819
1820	/ Handle a digit following \ when the number is not a back reference, or*
1821	we are within a character class. If the first digit is 8 or 9, Perl used to
1822	generate a binary zero and then treat the digit as a following literal. At
1823	least by Perl 5.18 this changed so as not to insert the binary zero. /*
1824
1825	if (c >= CHAR_8) break;
1826
1827	/ Fall through /
1828
1829	/ \0 always starts an octal number, but we may drop through to here with a*
1830	larger first octal digit. The original code used just to take the least
1831	significant 8 bits of octal numbers (I think this is what early Perls used
1832	to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1833	but no more than 3 octal digits. /*
1834
1835	case CHAR_0:
1836	c -= CHAR_0;
1837	while(i++ < `2` && ptr < ptrend && ptr >= CHAR_0 && ptr <= CHAR_7)
1838	c = c * `8` + *ptr++ - CHAR_0;
1839	#if PCRE2_CODE_UNIT_WIDTH == 8
1840	if (!utf && c > `0xff`) *errorcodeptr = ERR51;
1841	#endif
1842	break;
1843
1844	/ \o is a relatively new Perl feature, supporting a more general way of*
1845	specifying character codes in octal. The only supported form is \o{ddd}. /*
1846
1847	case CHAR_o:
1848	if (ptr >= ptrend \|\| *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1849	{
1850	ptr--;
1851	*errorcodeptr = ERR55;
1852	}
1853	else if (ptr >= ptrend \|\| *ptr == CHAR_RIGHT_CURLY_BRACKET)
1854	*errorcodeptr = ERR78;
1855	else
1856	{
1857	c = `0`;
1858	overflow = FALSE;
1859	while (ptr < ptrend && ptr >= CHAR_0 && ptr <= CHAR_7)
1860	{
1861	cc = *ptr++;
1862	if (c == `0` && cc == CHAR_0) continue; / Leading zeroes /
1863	#if PCRE2_CODE_UNIT_WIDTH == 32
1864	if (c >= `0x20000000l`) { overflow = TRUE; break; }
1865	#endif
1866	c = (c << `3`) + (cc - CHAR_0);
1867	#if PCRE2_CODE_UNIT_WIDTH == 8
1868	if (c > (utf ? `0x10ffffU` : `0xffU`)) { overflow = TRUE; break; }
1869	#elif PCRE2_CODE_UNIT_WIDTH == 16
1870	if (c > (utf ? `0x10ffffU` : `0xffffU`)) { overflow = TRUE; break; }
1871	#elif PCRE2_CODE_UNIT_WIDTH == 32
1872	if (utf && c > `0x10ffffU`) { overflow = TRUE; break; }
1873	#endif
1874	}
1875	if (overflow)
1876	{
1877	while (ptr < ptrend && ptr >= CHAR_0 && ptr <= CHAR_7) ptr++;
1878	*errorcodeptr = ERR34;
1879	}
1880	else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1881	{
1882	if (utf && c >= `0xd800` && c <= `0xdfff` &&
1883	(extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == `0`)
1884	{
1885	ptr--;
1886	*errorcodeptr = ERR73;
1887	}
1888	}
1889	else
1890	{
1891	ptr--;
1892	*errorcodeptr = ERR64;
1893	}
1894	}
1895	break;
1896
1897	/ When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed*
1898	by two hexadecimal digits. Otherwise it is a lowercase x letter. /*
1899
1900	case CHAR_x:
1901	if (alt_bsux)
1902	{
1903	uint32_t xc;
1904	if (ptrend - ptr < `2`) break; / Less than 2 characters /
1905	if ((cc = XDIGIT(ptr[`0`])) == `0xff`) break; / Not a hex digit /
1906	if ((xc = XDIGIT(ptr[`1`])) == `0xff`) break; / Not a hex digit /
1907	c = (cc << `4`) \| xc;
1908	ptr += `2`;
1909	}
1910
1911	/ Handle \x in Perl's style. \x{ddd} is a character code which can be*
1912	greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
1913	digits. If not, { used to be treated as a data character. However, Perl
1914	seems to read hex digits up to the first non-such, and ignore the rest, so
1915	that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1916	now gives an error. /*
1917
1918	else
1919	{
1920	if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1921	{
1922	#ifndef EBCDIC
1923	COME_FROM_NU:
1924	#endif
1925	if (++ptr >= ptrend \|\| *ptr == CHAR_RIGHT_CURLY_BRACKET)
1926	{
1927	*errorcodeptr = ERR78;
1928	break;
1929	}
1930	c = `0`;
1931	overflow = FALSE;
1932
1933	while (ptr < ptrend && (cc = XDIGIT(*ptr)) != `0xff`)
1934	{
1935	ptr++;
1936	if (c == `0` && cc == `0`) continue; / Leading zeroes /
1937	#if PCRE2_CODE_UNIT_WIDTH == 32
1938	if (c >= `0x10000000l`) { overflow = TRUE; break; }
1939	#endif
1940	c = (c << `4`) \| cc;
1941	if ((utf && c > `0x10ffffU`) \|\| (!utf && c > MAX_NON_UTF_CHAR))
1942	{
1943	overflow = TRUE;
1944	break;
1945	}
1946	}
1947
1948	if (overflow)
1949	{
1950	while (ptr < ptrend && XDIGIT(*ptr) != `0xff`) ptr++;
1951	*errorcodeptr = ERR34;
1952	}
1953	else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1954	{
1955	if (utf && c >= `0xd800` && c <= `0xdfff` &&
1956	(extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == `0`)
1957	{
1958	ptr--;
1959	*errorcodeptr = ERR73;
1960	}
1961	}
1962
1963	/ If the sequence of hex digits does not end with '}', give an error.*
1964	We used just to recognize this construct and fall through to the normal
1965	\x handling, but nowadays Perl gives an error, which seems much more
1966	sensible, so we do too. /*
1967
1968	else
1969	{
1970	ptr--;
1971	*errorcodeptr = ERR67;
1972	}
1973	} / End of \x{} processing /
1974
1975	/ Read a up to two hex digits after \x /
1976
1977	else
1978	{
1979	c = `0`;
1980	if (ptr >= ptrend \|\| (cc = XDIGIT(ptr)) == `0xff`) break; /* Not a hex digit /
1981	ptr++;
1982	c = cc;
1983	if (ptr >= ptrend \|\| (cc = XDIGIT(ptr)) == `0xff`) break; /* Not a hex digit /
1984	ptr++;
1985	c = (c << `4`) \| cc;
1986	} / End of \xdd handling /
1987	} / End of Perl-style \x handling /
1988	break;
1989
1990	/ The handling of \c is different in ASCII and EBCDIC environments. In an*
1991	ASCII (or Unicode) environment, an error is given if the character
1992	following \c is not a printable ASCII character. Otherwise, the following
1993	character is upper-cased if it is a letter, and after that the 0x40 bit is
1994	flipped. The result is the value of the escape.
1995
1996	In an EBCDIC environment the handling of \c is compatible with the
1997	specification in the perlebcdic document. The following character must be
1998	a letter or one of small number of special characters. These provide a
1999	means of defining the character values 0-31.
2000
2001	For testing the EBCDIC handling of \c in an ASCII environment, recognize
2002	the EBCDIC value of 'c' explicitly. /*
2003
2004	#if defined EBCDIC && 'a' != 0x81
2005	case `0x83`:
2006	#else
2007	case CHAR_c:
2008	#endif
2009	if (ptr >= ptrend)
2010	{
2011	*errorcodeptr = ERR2;
2012	break;
2013	}
2014	c = *ptr;
2015	if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2016
2017	/ Handle \c in an ASCII/Unicode environment. /
2018
2019	#ifndef EBCDIC /* ASCII/UTF-8 coding */
2020	if (c < `32` \|\| c > `126`) / Excludes all non-printable ASCII /
2021	{
2022	*errorcodeptr = ERR68;
2023	break;
2024	}
2025	c ^= `0x40`;
2026
2027	/ Handle \c in an EBCDIC environment. The special case \c? is converted to*
2028	255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2029	POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2030	The other valid sequences correspond to a list of specific characters. /*
2031
2032	#else
2033	if (c == CHAR_QUESTION_MARK)
2034	c = (`'\\'` == `188` && '`' == `74`)? `0x5f` : `0xff`;
2035	else
2036	{
2037	for (i = `0`; i < `32`; i++)
2038	{
2039	if (c == ebcdic_escape_c[i]) break;
2040	}
2041	if (i < `32`) c = i; else *errorcodeptr = ERR68;
2042	}
2043	#endif /* EBCDIC */
2044
2045	ptr++;
2046	break;
2047
2048	/ Any other alphanumeric following \ is an error. Perl gives an error only*
2049	if in warning mode, but PCRE doesn't have a warning mode. /*
2050
2051	default:
2052	*errorcodeptr = ERR3;
2053	ptrptr = ptr - `1`; /* Point to the character at fault /
2054	return `0`;
2055	}
2056	}
2057
2058	/ Set the pointer to the next character before returning. /
2059
2060	*ptrptr = ptr;
2061	*chptr = c;
2062	return escape;
2063	}
2064
2065
2066
#ifdef SUPPORT_UNICODE
/*************************************************
*           Handle \P and \p                     *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
contents of ptrptr are pointing after the P or p. On exit, it is left pointing
after the final code unit of the escape sequence.

The name is normalised while it is copied into a local buffer: underscores,
hyphens and white space are skipped and the remaining characters are
lower-cased. A name of the form "xx:yy" or "xx=yy" is split at the first ':'
or '=' into a property class and a value.

Arguments:
  ptrptr         the pattern position pointer
  negptr         a boolean that is set TRUE for negation else FALSE
  ptypeptr       an unsigned int that is set to the type value
  pdataptr       an unsigned int that is set to the detailed property value
  errorcodeptr   the error code variable
  cb             the compile data

Returns:         TRUE if the type value was found, or FALSE for an invalid type
                 (ERR46 for a malformed escape, ERR47 for an unknown property)
*/

static BOOL
get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
  uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
{
PCRE2_UCHAR c;
PCRE2_SIZE i, bot, top;
PCRE2_SPTR ptr = *ptrptr;
PCRE2_UCHAR name[50];
PCRE2_UCHAR *vptr = NULL;          /* Position of ':' or '=' within name[] */
uint16_t ptscript = PT_NOTSCRIPT;

if (ptr >= cb->end_pattern) goto ERROR_RETURN;
c = *ptr++;
*negptr = FALSE;

/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
negation. */

if (c == CHAR_LEFT_CURLY_BRACKET)
  {
  if (ptr >= cb->end_pattern) goto ERROR_RETURN;

  if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
    {
    *negptr = TRUE;
    ptr++;
    }

  for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
    {
    if (ptr >= cb->end_pattern) goto ERROR_RETURN;
    c = *ptr++;

    /* Skip separators. In the 16-bit and 32-bit libraries a code unit can be
    greater than 255, which must not be passed to a <ctype.h> function, hence
    the MAX_255() guard. */

    while (c == '_' || c == '-' || (MAX_255(c) && isspace(c)))
      {
      if (ptr >= cb->end_pattern) goto ERROR_RETURN;
      c = *ptr++;
      }
    if (c == CHAR_NUL) goto ERROR_RETURN;
    if (c == CHAR_RIGHT_CURLY_BRACKET) break;

    /* Wide code units are stored unchanged; they cannot match any property
    name, so the lookup below rejects them. */

    name[i] = MAX_255(c)? tolower(c) : c;
    if ((c == ':' || c == '=') && vptr == NULL) vptr = name + i;
    }

  /* Leaving the loop without seeing '}' means the name was too long. */

  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
  name[i] = 0;
  }

/* If { doesn't follow \p or \P there is just one following character, which
must be an ASCII letter. */

else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
  {
  name[0] = tolower(c);
  name[1] = 0;
  }
else goto ERROR_RETURN;

*ptrptr = ptr;

/* If the property contains ':' or '=' we have class name and value separately
specified. The following are supported:

  . Bidi_Class (synonym bc), for which the property names are "bidi<name>".
  . Script (synonym sc) for which the property name is the script name
  . Script_Extensions (synonym scx), ditto

As this is a small number, we currently just check the names directly. If this
grows, a sorted table and a switch will be neater.

For both the script properties, set a PT_xxx value so that (1) they can be
distinguished and (2) invalid script names that happen to be the name of
another property can be diagnosed. */

if (vptr != NULL)
  {
  int offset = 0;
  PCRE2_UCHAR sname[8];

  *vptr = 0;   /* Terminate property name */
  if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||
      PRIV(strcmp_c8)(name, STRING_bc) == 0)
    {
    offset = 4;
    sname[0] = CHAR_b;
    sname[1] = CHAR_i;  /* There is no strcpy_c8 function */
    sname[2] = CHAR_d;
    sname[3] = CHAR_i;
    }

  else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||
           PRIV(strcmp_c8)(name, STRING_sc) == 0)
    ptscript = PT_SC;

  else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||
           PRIV(strcmp_c8)(name, STRING_scx) == 0)
    ptscript = PT_SCX;

  else
    {
    *errorcodeptr = ERR47;
    return FALSE;
    }

  /* Adjust the string in name[] as needed: move the value (including its
  terminating zero) down over the class name, leaving room for the "bidi"
  prefix when offset is non-zero. */

  memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
  if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
  }

/* Search for a recognized property using binary chop. */

bot = 0;
top = PRIV(utt_size);

while (bot < top)
  {
  int r;
  i = (bot + top) >> 1;
  r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);

  /* When a matching property is found, some extra checking is needed when the
  \p{xx:yy} syntax is used and xx is either sc or scx. */

  if (r == 0)
    {
    *pdataptr = PRIV(utt)[i].value;
    if (vptr == NULL || ptscript == PT_NOTSCRIPT)
      {
      *ptypeptr = PRIV(utt)[i].type;
      return TRUE;
      }

    switch (PRIV(utt)[i].type)
      {
      case PT_SC:
      *ptypeptr = PT_SC;
      return TRUE;

      case PT_SCX:
      *ptypeptr = ptscript;
      return TRUE;
      }

    break;  /* Non-script found */
    }

  if (r > 0) bot = i + 1; else top = i;
  }

*errorcodeptr = ERR47;   /* Unrecognized property */
return FALSE;

ERROR_RETURN:            /* Malformed \P or \p */
*errorcodeptr = ERR46;
*ptrptr = ptr;
return FALSE;
}
#endif
2246
2247
2248
2249	/*************************************************
2250	* Check for POSIX class syntax *
2251	*************************************************/
2252
2253	/ This function is called when the sequence "[:" or "[." or "[=" is*
2254	encountered in a character class. It checks whether this is followed by a
2255	sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2256	reach an unescaped ']' without the special preceding character, return FALSE.
2257
2258	Originally, this function only recognized a sequence of letters between the
2259	terminators, but it seems that Perl recognizes any sequence of characters,
2260	though of course unknown POSIX names are subsequently rejected. Perl gives an
2261	"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2262	didn't consider this to be a POSIX class. Likewise for [:1234:].
2263
2264	The problem in trying to be exactly like Perl is in the handling of escapes. We
2265	have to be sure that [abc[:x\]pqr] is not* treated as containing a POSIX*
2266	class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2267	below handles the special cases \\ and \], but does not try to do any other
2268	escape processing. This makes it different from Perl for cases such as
2269	[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2270	not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2271	when Perl does, I think.
2272
2273	A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2274	It seems that the appearance of a nested POSIX class supersedes an apparent
2275	external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2276	a digit. This is handled by returning FALSE if the start of a new group with
2277	the same terminator is encountered, since the next closing sequence must close
2278	the nested group, not the outer one.
2279
2280	In Perl, unescaped square brackets may also appear as part of class names. For
2281	example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2282	[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2283	seem right at all. PCRE does not allow closing square brackets in POSIX class
2284	names.
2285
2286	Arguments:
2287	ptr pointer to the character after the initial [ (colon, dot, equals)
2288	ptrend pointer to the end of the pattern
2289	endptr where to return a pointer to the terminating ':', '.', or '='
2290
2291	Returns: TRUE or FALSE
2292	*/
2293
2294	static BOOL
2295	check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2296	{
2297	PCRE2_UCHAR terminator; / Don't combine these lines; the Solaris cc /
2298	terminator = ptr++; /* compiler warns about "non-constant" initializer. /
2299
2300	for (; ptrend - ptr >= `2`; ptr++)
2301	{
2302	if (*ptr == CHAR_BACKSLASH &&
2303	(ptr[`1`] == CHAR_RIGHT_SQUARE_BRACKET \|\| ptr[`1`] == CHAR_BACKSLASH))
2304	ptr++;
2305
2306	else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[`1`] == terminator) \|\|
2307	ptr == CHAR_RIGHT_SQUARE_BRACKET) return* FALSE;
2308
2309	else if (*ptr == terminator && ptr[`1`] == CHAR_RIGHT_SQUARE_BRACKET)
2310	{
2311	*endptr = ptr;
2312	return TRUE;
2313	}
2314	}
2315
2316	return FALSE;
2317	}
2318
2319
2320
2321	/*************************************************
2322	* Check POSIX class name *
2323	*************************************************/
2324
2325	/ This function is called to check the name given in a POSIX-style class entry*
2326	such as [:alnum:].
2327
2328	Arguments:
2329	ptr points to the first letter
2330	len the length of the name
2331
2332	Returns: a value representing the name, or -1 if unknown
2333	*/
2334
2335	static int
2336	check_posix_name(PCRE2_SPTR ptr, int len)
2337	{
2338	const char *pn = posix_names;
2339	int yield = `0`;
2340	while (posix_name_lengths[yield] != `0`)
2341	{
2342	if (len == posix_name_lengths[yield] &&
2343	PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == `0`) return yield;
2344	pn += posix_name_lengths[yield] + `1`;
2345	yield++;
2346	}
2347	return -`1`;
2348	}
2349
2350
2351
2352	/*************************************************
2353	* Read a subpattern or VERB name *
2354	*************************************************/
2355
2356	/ This function is called from parse_regex() below whenever it needs to read*
2357	the name of a subpattern or a (VERB) or an (alpha_assertion). The initial
2358	pointer must be to the character before the name. If that character is '' we*
2359	are reading a verb or alpha assertion name. The pointer is updated to point
2360	after the name, for a VERB or alpha assertion name, or after tha name's
2361	terminator for a subpattern name. Returning both the offset and the name
2362	pointer is redundant information, but some callers use one and some the other,
2363	so it is simplest just to return both.
2364
2365	Arguments:
2366	ptrptr points to the character pointer variable
2367	ptrend points to the end of the input string
2368	utf true if the input is UTF-encoded
2369	terminator the terminator of a subpattern name must be this
2370	offsetptr where to put the offset from the start of the pattern
2371	nameptr where to put a pointer to the name in the input
2372	namelenptr where to put the length of the name
2373	errcodeptr where to put an error code
2374	cb pointer to the compile data block
2375
2376	Returns: TRUE if a name was read
2377	FALSE otherwise, with error code set
2378	*/
2379
2380	static BOOL
2381	read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2382	PCRE2_SIZE offsetptr, PCRE2_SPTR nameptr, uint32_t *namelenptr,
2383	int errorcodeptr, compile_block cb)
2384	{
2385	PCRE2_SPTR ptr = *ptrptr;
2386	BOOL is_group = (*ptr != CHAR_ASTERISK);
2387
2388	if (++ptr >= ptrend) / No characters in name /
2389	{
2390	errorcodeptr = is_group? ERR62: /* Subpattern name expected /
2391	ERR60; / Verb not recognized or malformed /
2392	goto FAILED;
2393	}
2394
2395	*nameptr = ptr;
2396	*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2397
2398	/ In UTF mode, a group name may contain letters and decimal digits as defined*
2399	by Unicode properties, and underscores, but must not start with a digit. /*
2400
2401	#ifdef SUPPORT_UNICODE
2402	if (utf && is_group)
2403	{
2404	uint32_t c, type;
2405
2406	GETCHAR(c, ptr);
2407	type = UCD_CHARTYPE(c);
2408
2409	if (type == ucp_Nd)
2410	{
2411	*errorcodeptr = ERR44;
2412	goto FAILED;
2413	}
2414
2415	for(;;)
2416	{
2417	if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2418	c != CHAR_UNDERSCORE) break;
2419	ptr++;
2420	FORWARDCHARTEST(ptr, ptrend);
2421	if (ptr >= ptrend) break;
2422	GETCHAR(c, ptr);
2423	type = UCD_CHARTYPE(c);
2424	}
2425	}
2426	else
2427	#else
2428	(void)utf; / Avoid compiler warning /
2429	#endif /* SUPPORT_UNICODE */
2430
2431	/ Handle non-group names and group names in non-UTF modes. A group name must*
2432	not start with a digit. If either of the others start with a digit it just
2433	won't be recognized. /*
2434
2435	{
2436	if (is_group && IS_DIGIT(*ptr))
2437	{
2438	*errorcodeptr = ERR44;
2439	goto FAILED;
2440	}
2441
2442	while (ptr < ptrend && MAX_255(ptr) && (cb->ctypes[ptr] & ctype_word) != `0`)
2443	{
2444	ptr++;
2445	}
2446	}
2447
2448	/ Check name length /
2449
2450	if (ptr > *nameptr + MAX_NAME_SIZE)
2451	{
2452	*errorcodeptr = ERR48;
2453	goto FAILED;
2454	}
2455	namelenptr = (uint32_t)(ptr - nameptr);
2456
2457	/ Subpattern names must not be empty, and their terminator is checked here.*
2458	(What follows a verb or alpha assertion name is checked separately.) /*
2459
2460	if (is_group)
2461	{
2462	if (ptr == *nameptr)
2463	{
2464	errorcodeptr = ERR62; /* Subpattern name expected /
2465	goto FAILED;
2466	}
2467	if (ptr >= ptrend \|\| *ptr != (PCRE2_UCHAR)terminator)
2468	{
2469	*errorcodeptr = ERR42;
2470	goto FAILED;
2471	}
2472	ptr++;
2473	}
2474
2475	*ptrptr = ptr;
2476	return TRUE;
2477
2478	FAILED:
2479	*ptrptr = ptr;
2480	return FALSE;
2481	}
2482
2483
2484
2485	/*************************************************
2486	* Manage callouts at start of cycle *
2487	*************************************************/
2488
2489	/ At the start of a new item in parse_regex() we are able to record the*
2490	details of the previous item in a prior callout, and also to set up an
2491	automatic callout if enabled. Avoid having two adjacent automatic callouts,
2492	which would otherwise happen for items such as \Q that contribute nothing to
2493	the parsed pattern.
2494
2495	Arguments:
2496	ptr current pattern pointer
2497	pcalloutptr points to a pointer to previous callout, or NULL
2498	auto_callout TRUE if auto_callouts are enabled
2499	parsed_pattern the parsed pattern pointer
2500	cb compile block
2501
2502	Returns: possibly updated parsed_pattern pointer.
2503	*/
2504
2505	static uint32_t *
2506	manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2507	uint32_t parsed_pattern, compile_block cb)
2508	{
2509	uint32_t previous_callout = pcalloutptr;
2510
2511	if (previous_callout != NULL) previous_callout[`2`] = (uint32_t)(ptr -
2512	cb->start_pattern - (PCRE2_SIZE)previous_callout[`1`]);
2513
2514	if (!auto_callout) previous_callout = NULL; else
2515	{
2516	if (previous_callout == NULL \|\|
2517	previous_callout != parsed_pattern - `4` \|\|
2518	previous_callout[`3`] != `255`)
2519	{
2520	previous_callout = parsed_pattern; / Set up new automatic callout /
2521	parsed_pattern += `4`;
2522	previous_callout[`0`] = META_CALLOUT_NUMBER;
2523	previous_callout[`2`] = `0`;
2524	previous_callout[`3`] = `255`;
2525	}
2526	previous_callout[`1`] = (uint32_t)(ptr - cb->start_pattern);
2527	}
2528
2529	*pcalloutptr = previous_callout;
2530	return parsed_pattern;
2531	}
2532
2533
2534
/*************************************************
*      Parse regex and identify named groups     *
*************************************************/

/* This function is called first of all. It scans the pattern and does two
things: (1) It identifies capturing groups and makes a table of named capturing
groups so that information about them is fully available to both the compiling
scans. (2) It writes a parsed version of the pattern with comments omitted and
escapes processed into the parsed_pattern vector.

Arguments:
  ptr             points to the start of the pattern
  options         compiling dynamic options (may change during the scan)
  has_lookbehind  points to a boolean, set TRUE if a lookbehind is found
  cb              pointer to the compile data block

Returns:   zero on success or a non-zero error code, with the
             error offset placed in the cb field
*/
2554
/* A structure and some flags for dealing with nested groups. */

typedef struct nest_save {
  uint16_t  nest_depth;
  uint16_t  reset_group;
  uint16_t  max_group;
  uint16_t  flags;
  uint32_t  options;
} nest_save;

#define NSF_RESET          0x0001u
#define NSF_CONDASSERT     0x0002u
#define NSF_ATOMICSR       0x0004u

/* Options that are changeable within the pattern must be tracked during
parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
but all must be tracked so that META_OPTIONS items set the correct values for
the main compiling phase. */

#define PARSE_TRACKED_OPTIONS (PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_DUPNAMES| \
  PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \
  PCRE2_UNGREEDY)

/* States used for analyzing ranges in character classes. The two OK values
must be last. */

enum { RANGE_NO, RANGE_STARTED, RANGE_OK_ESCAPED, RANGE_OK_LITERAL };

/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
the storing of literal values in the main parsed pattern, where they can always
be quantified. Both variants expand to a single braced block so that the macro
behaves as one statement in every code unit width; it relies on a variable
called okquantifier being in scope at the point of use. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  { \
  if (c >= META_END) *p++ = META_BIGVALUE; \
  *p++ = c; \
  okquantifier = TRUE; \
  }
#else
#define PARSED_LITERAL(c, p) \
  { \
  *p++ = c; \
  okquantifier = TRUE; \
  }
#endif
2597
/* Here's the actual function. */
2599
2600	static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind,
2601	compile_block *cb)
2602	{
2603	uint32_t c;
2604	uint32_t delimiter;
2605	uint32_t namelen;
2606	uint32_t class_range_state;
2607	uint32_t verblengthptr = NULL; /* Value avoids compiler warning /
2608	uint32_t *verbstartptr = NULL;
2609	uint32_t *previous_callout = NULL;
2610	uint32_t *parsed_pattern = cb->parsed_pattern;
2611	uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
2612	uint32_t meta_quantifier = `0`;
2613	uint32_t add_after_mark = `0`;
2614	uint32_t extra_options = cb->cx->extra_options;
2615	uint16_t nest_depth = `0`;
2616	int after_manual_callout = `0`;
2617	int expect_cond_assert = `0`;
2618	int errorcode = `0`;
2619	int escape;
2620	int i;
2621	BOOL inescq = FALSE;
2622	BOOL inverbname = FALSE;
2623	BOOL utf = (options & PCRE2_UTF) != `0`;
2624	BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != `0`;
2625	BOOL isdupname;
2626	BOOL negate_class;
2627	BOOL okquantifier = FALSE;
2628	PCRE2_SPTR thisptr;
2629	PCRE2_SPTR name;
2630	PCRE2_SPTR ptrend = cb->end_pattern;
2631	PCRE2_SPTR verbnamestart = NULL; / Value avoids compiler warning /
2632	named_group *ng;
2633	nest_save top_nest, end_nests;
2634
2635	/ Insert leading items for word and line matching (features provided for the*
2636	benefit of pcre2grep). /*
2637
2638	if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != `0`)
2639	{
2640	*parsed_pattern++ = META_CIRCUMFLEX;
2641	*parsed_pattern++ = META_NOCAPTURE;
2642	}
2643	else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != `0`)
2644	{
2645	*parsed_pattern++ = META_ESCAPE + ESC_b;
2646	*parsed_pattern++ = META_NOCAPTURE;
2647	}
2648
2649	/ If the pattern is actually a literal string, process it separately to avoid*
2650	cluttering up the main loop. /*
2651
2652	if ((options & PCRE2_LITERAL) != `0`)
2653	{
2654	while (ptr < ptrend)
2655	{
2656	if (parsed_pattern >= parsed_pattern_end)
2657	{
2658	errorcode = ERR63; / Internal error (parsed pattern overflow) /
2659	goto FAILED;
2660	}
2661	thisptr = ptr;
2662	GETCHARINCTEST(c, ptr);
2663	if (auto_callout)
2664	parsed_pattern = manage_callouts(thisptr, &previous_callout,
2665	auto_callout, parsed_pattern, cb);
2666	PARSED_LITERAL(c, parsed_pattern);
2667	}
2668	goto PARSED_END;
2669	}
2670
2671	/ Process a real regex which may contain meta-characters. /
2672
2673	top_nest = NULL;
2674	end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
2675
2676	/ The size of the nest_save structure might not be a factor of the size of the*
2677	workspace. Therefore we must round down end_nests so as to correctly avoid
2678	creating a nest_save that spans the end of the workspace. /*
2679
2680	end_nests = (nest_save )((char* *)end_nests -
2681	((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
2682
2683	/ PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED /
2684
2685	if ((options & PCRE2_EXTENDED_MORE) != `0`) options \|= PCRE2_EXTENDED;
2686
2687	/ Now scan the pattern /
2688
2689	while (ptr < ptrend)
2690	{
2691	int prev_expect_cond_assert;
2692	uint32_t min_repeat = `0`, max_repeat = `0`;
2693	uint32_t set, unset, *optset;
2694	uint32_t terminator;
2695	uint32_t prev_meta_quantifier;
2696	BOOL prev_okquantifier;
2697	PCRE2_SPTR tempptr;
2698	PCRE2_SIZE offset;
2699
2700	if (parsed_pattern >= parsed_pattern_end)
2701	{
2702	errorcode = ERR63; / Internal error (parsed pattern overflow) /
2703	goto FAILED;
2704	}
2705
2706	if (nest_depth > cb->cx->parens_nest_limit)
2707	{
2708	errorcode = ERR19;
2709	goto FAILED; / Parentheses too deeply nested /
2710	}
2711
2712	/ Get next input character, save its position for callout handling. /
2713
2714	thisptr = ptr;
2715	GETCHARINCTEST(c, ptr);
2716
2717	/ Copy quoted literals until \E, allowing for the possibility of automatic*
2718	callouts, except when processing a (VERB) "name". /
2719
2720	if (inescq)
2721	{
2722	if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
2723	{
2724	inescq = FALSE;
2725	ptr++; / Skip E /
2726	}
2727	else
2728	{
2729	if (expect_cond_assert > `0`) / A literal is not allowed if we are /
2730	{ / expecting a conditional assertion, /
2731	ptr--; / but an empty \Q\E sequence is OK. /
2732	errorcode = ERR28;
2733	goto FAILED;
2734	}
2735	if (inverbname)
2736	{ / Don't use PARSED_LITERAL() because it /
2737	#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
2738	if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2739	#endif
2740	*parsed_pattern++ = c;
2741	}
2742	else
2743	{
2744	if (after_manual_callout-- <= `0`)
2745	parsed_pattern = manage_callouts(thisptr, &previous_callout,
2746	auto_callout, parsed_pattern, cb);
2747	PARSED_LITERAL(c, parsed_pattern);
2748	}
2749	meta_quantifier = `0`;
2750	}
2751	continue; / Next character /
2752	}
2753
2754	/ If we are processing the "name" part of a (VERB:NAME) item, all
2755	characters up to the closing parenthesis are literals except when
2756	PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
2757	and \E and escaped characters are allowed (no character types such as \d). If
2758	PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
2759	this by not entering the special (VERB:NAME) processing - they are then*
2760	picked up below. Note that c is a character, not a code unit, so we must not
2761	use MAX_255 to test its size because MAX_255 tests code units and is assumed
2762	TRUE in 8-bit mode. /*
2763
2764	if (inverbname &&
2765	(
2766	/ EITHER: not both options set /
2767	((options & (PCRE2_EXTENDED \| PCRE2_ALT_VERBNAMES)) !=
2768	(PCRE2_EXTENDED \| PCRE2_ALT_VERBNAMES)) \|\|
2769	#ifdef SUPPORT_UNICODE
2770	/ OR: character > 255 AND not Unicode Pattern White Space /
2771	(c > `255` && (c\|`1`) != `0x200f` && (c\|`1`) != `0x2029`) \|\|
2772	#endif
2773	/ OR: not a # comment or isspace() white space /
2774	(c < `256` && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == `0`
2775	#ifdef SUPPORT_UNICODE
2776	/ and not CHAR_NEL when Unicode is supported /
2777	&& c != CHAR_NEL
2778	#endif
2779	)))
2780	{
2781	PCRE2_SIZE verbnamelength;
2782
2783	switch(c)
2784	{
2785	default: / Don't use PARSED_LITERAL() because it /
2786	#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
2787	if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2788	#endif
2789	*parsed_pattern++ = c;
2790	break;
2791
2792	case CHAR_RIGHT_PARENTHESIS:
2793	inverbname = FALSE;
2794	/ This is the length in characters /
2795	verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - `1`);
2796	/ But the limit on the length is in code units /
2797	if (ptr - verbnamestart - `1` > (int)MAX_MARK)
2798	{
2799	ptr--;
2800	errorcode = ERR76;
2801	goto FAILED;
2802	}
2803	*verblengthptr = (uint32_t)verbnamelength;
2804
2805	/ If this name was on a verb such as (ACCEPT) which does not continue,
2806	a (MARK) was generated for the name. We now add the original verb as the*
2807	next item. /*
2808
2809	if (add_after_mark != `0`)
2810	{
2811	*parsed_pattern++ = add_after_mark;
2812	add_after_mark = `0`;
2813	}
2814	break;
2815
2816	case CHAR_BACKSLASH:
2817	if ((options & PCRE2_ALT_VERBNAMES) != `0`)
2818	{
2819	escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
2820	cb->cx->extra_options, FALSE, cb);
2821	if (errorcode != `0`) goto FAILED;
2822	}
2823	else escape = `0`; / Treat all as literal /
2824
2825	switch(escape)
2826	{
2827	case `0`: / Don't use PARSED_LITERAL() because it /
2828	#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
2829	if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2830	#endif
2831	*parsed_pattern++ = c;
2832	break;
2833
2834	case ESC_Q:
2835	inescq = TRUE;
2836	break;
2837
2838	case ESC_E: / Ignore /
2839	break;
2840
2841	default:
2842	errorcode = ERR40; / Invalid in verb name /
2843	goto FAILED;
2844	}
2845	}
2846	continue; / Next character in pattern /
2847	}
2848
2849	/ Not a verb name character. At this point we must process everything that*
2850	must not change the quantification state. This is mainly comments, but we
2851	handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
2852	A+, as in Perl. An isolated \E is ignored. /*
2853
2854	if (c == CHAR_BACKSLASH && ptr < ptrend)
2855	{
2856	if (ptr == CHAR_Q \|\| ptr == CHAR_E)
2857	{
2858	inescq = *ptr == CHAR_Q;
2859	ptr++;
2860	continue;
2861	}
2862	}
2863
2864	/ Skip over whitespace and # comments in extended mode. Note that c is a*
2865	character, not a code unit, so we must not use MAX_255 to test its size
2866	because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
2867	whitespace characters are those designated as "Pattern White Space" by
2868	Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
2869	U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
2870	subset of space characters that match \h and \v. /*
2871
2872	if ((options & PCRE2_EXTENDED) != `0`)
2873	{
2874	if (c < `256` && (cb->ctypes[c] & ctype_space) != `0`) continue;
2875	#ifdef SUPPORT_UNICODE
2876	if (c == CHAR_NEL \|\| (c\|`1`) == `0x200f` \|\| (c\|`1`) == `0x2029`) continue;
2877	#endif
2878	if (c == CHAR_NUMBER_SIGN)
2879	{
2880	while (ptr < ptrend)
2881	{
2882	if (IS_NEWLINE(ptr)) / For non-fixed-length newline cases, /
2883	{ / IS_NEWLINE sets cb->nllen. /
2884	ptr += cb->nllen;
2885	break;
2886	}
2887	ptr++;
2888	#ifdef SUPPORT_UNICODE
2889	if (utf) FORWARDCHARTEST(ptr, ptrend);
2890	#endif
2891	}
2892	continue; / Next character in pattern /
2893	}
2894	}
2895
2896	/ Skip over bracketed comments /
2897
2898	if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= `2` &&
2899	ptr[`0`] == CHAR_QUESTION_MARK && ptr[`1`] == CHAR_NUMBER_SIGN)
2900	{
2901	while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
2902	if (ptr >= ptrend)
2903	{
2904	errorcode = ERR18; / A special error for missing ) in a comment /
2905	goto FAILED; / to make it easier to debug. /
2906	}
2907	ptr++;
2908	continue; / Next character in pattern /
2909	}
2910
2911	/ If the next item is not a quantifier, fill in length of any previous*
2912	callout and create an auto callout if required. /*
2913
2914	if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
2915	(c != CHAR_LEFT_CURLY_BRACKET \|\|
2916	(tempptr = ptr,
2917	!read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
2918	{
2919	if (after_manual_callout-- <= `0`)
2920	parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
2921	parsed_pattern, cb);
2922	}
2923
2924	/ If expect_cond_assert is 2, we have just passed (?( and are expecting an*
2925	assertion, possibly preceded by a callout. If the value is 1, we have just
2926	had the callout and expect an assertion. There must be at least 3 more
2927	characters in all cases. When expect_cond_assert is 2, we know that the
2928	current character is an opening parenthesis, as otherwise we wouldn't be
2929	here. However, when it is 1, we need to check, and it's easiest just to check
2930	always. Note that expect_cond_assert may be negative, since all callouts just
2931	decrement it. /*
2932
2933	if (expect_cond_assert > `0`)
2934	{
2935	BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= `3` &&
2936	(ptr[`0`] == CHAR_QUESTION_MARK \|\| ptr[`0`] == CHAR_ASTERISK);
2937	if (ok)
2938	{
2939	if (ptr[`0`] == CHAR_ASTERISK) / New alpha assertion format, possibly /
2940	{
2941	ok = MAX_255(ptr[`1`]) && (cb->ctypes[ptr[`1`]] & ctype_lcletter) != `0`;
2942	}
2943	else switch(ptr[`1`]) / Traditional symbolic format /
2944	{
2945	case CHAR_C:
2946	ok = expect_cond_assert == `2`;
2947	break;
2948
2949	case CHAR_EQUALS_SIGN:
2950	case CHAR_EXCLAMATION_MARK:
2951	break;
2952
2953	case CHAR_LESS_THAN_SIGN:
2954	ok = ptr[`2`] == CHAR_EQUALS_SIGN \|\| ptr[`2`] == CHAR_EXCLAMATION_MARK;
2955	break;
2956
2957	default:
2958	ok = FALSE;
2959	}
2960	}
2961
2962	if (!ok)
2963	{
2964	ptr--; / Adjust error offset /
2965	errorcode = ERR28;
2966	goto FAILED;
2967	}
2968	}
2969
2970	/ Remember whether we are expecting a conditional assertion, and set the*
2971	default for this item. /*
2972
2973	prev_expect_cond_assert = expect_cond_assert;
2974	expect_cond_assert = `0`;
2975
2976	/ Remember quantification status for the previous significant item, then set*
2977	default for this item. /*
2978
2979	prev_okquantifier = okquantifier;
2980	prev_meta_quantifier = meta_quantifier;
2981	okquantifier = FALSE;
2982	meta_quantifier = `0`;
2983
2984	/ If the previous significant item was a quantifier, adjust the parsed code*
2985	if there is a following modifier. The base meta value is always followed by
2986	the PLUS and QUERY values, in that order. We do this here rather than after
2987	reading a quantifier so that intervening comments and /x whitespace can be
2988	ignored without having to replicate code. /*
2989
2990	if (prev_meta_quantifier != `0` && (c == CHAR_QUESTION_MARK \|\| c == CHAR_PLUS))
2991	{
2992	parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -`3` : -`1`] =
2993	prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
2994	`0x00020000u` : `0x00010000u`);
2995	continue; / Next character in pattern /
2996	}
2997
2998
2999	/ Process the next item in the main part of a pattern. /
3000
3001	switch(c)
3002	{
3003	default: / Non-special character /
3004	PARSED_LITERAL(c, parsed_pattern);
3005	break;
3006
3007
3008	/ ---- Escape sequence ---- /
3009
3010	case CHAR_BACKSLASH:
3011	tempptr = ptr;
3012	escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3013	cb->cx->extra_options, FALSE, cb);
3014	if (errorcode != `0`)
3015	{
3016	ESCAPE_FAILED:
3017	if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == `0`)
3018	goto FAILED;
3019	ptr = tempptr;
3020	if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3021	{
3022	GETCHARINCTEST(c, ptr); / Get character value, increment pointer /
3023	}
3024	escape = `0`; / Treat as literal character /
3025	}
3026
3027	/ The escape was a data escape or literal character. /
3028
3029	if (escape == `0`)
3030	{
3031	PARSED_LITERAL(c, parsed_pattern);
3032	}
3033
3034	/ The escape was a back (or forward) reference. We keep the offset in*
3035	order to give a more useful diagnostic for a bad forward reference. For
3036	references to groups numbered less than 10 we can't use more than two items
3037	in parsed_pattern because they may be just two characters in the input (and
3038	in a 64-bit world an offset may need two elements). So for them, the offset
3039	of the first occurrence is held in a special vector. /*
3040
3041	else if (escape < `0`)
3042	{
3043	offset = (PCRE2_SIZE)(ptr - cb->start_pattern - `1`);
3044	escape = -escape;
3045	*parsed_pattern++ = META_BACKREF \| (uint32_t)escape;
3046	if (escape < `10`)
3047	{
3048	if (cb->small_ref_offset[escape] == PCRE2_UNSET)
3049	cb->small_ref_offset[escape] = offset;
3050	}
3051	else
3052	{
3053	PUTOFFSET(offset, parsed_pattern);
3054	}
3055	okquantifier = TRUE;
3056	}
3057
3058	/ The escape was a character class such as \d etc. or other special*
3059	escape indicator such as \A or \X. Most of them generate just a single
3060	parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
3061	value. They are supported only when Unicode is available. The type and
3062	value are packed into a single 32-bit value so that the whole sequence
3063	uses only two elements in the parsed_vector. This is because the same
3064	coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
3065	set.
3066
3067	There are also some cases where the escape sequence is followed by a name:
3068	\k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
3069	and \g'name' are subroutine calls by name; \g{name} is a synonym for
3070	\k{name}. Note that \g<number> and \g'number' are handled by check_escape()
3071	and returned as a negative value (handled above). A name is coded as an
3072	offset into the pattern and a length. /*
3073
3074	else switch (escape)
3075	{
3076	case ESC_C:
3077	#ifdef NEVER_BACKSLASH_C
3078	errorcode = ERR85;
3079	goto ESCAPE_FAILED;
3080	#else
3081	if ((options & PCRE2_NEVER_BACKSLASH_C) != `0`)
3082	{
3083	errorcode = ERR83;
3084	goto ESCAPE_FAILED;
3085	}
3086	#endif
3087	okquantifier = TRUE;
3088	*parsed_pattern++ = META_ESCAPE + escape;
3089	break;
3090
3091	case ESC_X:
3092	#ifndef SUPPORT_UNICODE
3093	errorcode = ERR45; / Supported only with Unicode support /
3094	goto ESCAPE_FAILED;
3095	#endif
3096	case ESC_H:
3097	case ESC_h:
3098	case ESC_N:
3099	case ESC_R:
3100	case ESC_V:
3101	case ESC_v:
3102	okquantifier = TRUE;
3103	*parsed_pattern++ = META_ESCAPE + escape;
3104	break;
3105
3106	default: / \A, \B, \b, \G, \K, \Z, \z cannot be quantified. /
3107	*parsed_pattern++ = META_ESCAPE + escape;
3108	break;
3109
3110	/ Escapes that change in UCP mode. Note that PCRE2_UCP will never be set*
3111	without Unicode support because it is checked when pcre2_compile() is
3112	called. /*
3113
3114	case ESC_d:
3115	case ESC_D:
3116	case ESC_s:
3117	case ESC_S:
3118	case ESC_w:
3119	case ESC_W:
3120	okquantifier = TRUE;
3121	if ((options & PCRE2_UCP) == `0`)
3122	{
3123	*parsed_pattern++ = META_ESCAPE + escape;
3124	}
3125	else
3126	{
3127	*parsed_pattern++ = META_ESCAPE +
3128	((escape == ESC_d \|\| escape == ESC_s \|\| escape == ESC_w)?
3129	ESC_p : ESC_P);
3130	switch(escape)
3131	{
3132	case ESC_d:
3133	case ESC_D:
3134	*parsed_pattern++ = (PT_PC << `16`) \| ucp_Nd;
3135	break;
3136
3137	case ESC_s:
3138	case ESC_S:
3139	*parsed_pattern++ = PT_SPACE << `16`;
3140	break;
3141
3142	case ESC_w:
3143	case ESC_W:
3144	*parsed_pattern++ = PT_WORD << `16`;
3145	break;
3146	}
3147	}
3148	break;
3149
3150	/ Unicode property matching /
3151
3152	case ESC_P:
3153	case ESC_p:
3154	#ifdef SUPPORT_UNICODE
3155	{
3156	BOOL negated;
3157	uint16_t ptype = `0`, pdata = `0`;
3158	if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3159	goto ESCAPE_FAILED;
3160	if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3161	*parsed_pattern++ = META_ESCAPE + escape;
3162	*parsed_pattern++ = (ptype << `16`) \| pdata;
3163	okquantifier = TRUE;
3164	}
3165	#else
3166	errorcode = ERR45;
3167	goto ESCAPE_FAILED;
3168	#endif
3169	break; / End \P and \p /
3170
3171	/ When \g is used with quotes or angle brackets as delimiters, it is a*
3172	numerical or named subroutine call, and control comes here. When used
3173	with brace delimiters it is a numerical back reference and does not come
3174	here because check_escape() returns it directly as a reference. \k is
3175	always a named back reference. /*
3176
3177	case ESC_g:
3178	case ESC_k:
3179	if (ptr >= ptrend \|\| (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3180	ptr != CHAR_LESS_THAN_SIGN && ptr != CHAR_APOSTROPHE))
3181	{
3182	errorcode = (escape == ESC_g)? ERR57 : ERR69;
3183	goto ESCAPE_FAILED;
3184	}
3185	terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3186	CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3187	CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3188
3189	/ For a non-braced \g, check for a numerical recursion. /
3190
3191	if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3192	{
3193	PCRE2_SPTR p = ptr + `1`;
3194
3195	if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3196	&errorcode))
3197	{
3198	if (p >= ptrend \|\| *p != terminator)
3199	{
3200	errorcode = ERR57;
3201	goto ESCAPE_FAILED;
3202	}
3203	ptr = p;
3204	goto SET_RECURSION;
3205	}
3206	if (errorcode != `0`) goto ESCAPE_FAILED;
3207	}
3208
3209	/ Not a numerical recursion /
3210
3211	if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3212	&errorcode, cb)) goto ESCAPE_FAILED;
3213
3214	/ \k and \g when used with braces are back references, whereas \g used*
3215	with quotes or angle brackets is a recursion /*
3216
3217	*parsed_pattern++ =
3218	(escape == ESC_k \|\| terminator == CHAR_RIGHT_CURLY_BRACKET)?
3219	META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3220	*parsed_pattern++ = namelen;
3221
3222	PUTOFFSET(offset, parsed_pattern);
3223	okquantifier = TRUE;
3224	break; / End special escape processing /
3225	}
3226	break; / End escape sequence processing /
3227
3228
3229	/ ---- Single-character special items ---- /
3230
3231	case CHAR_CIRCUMFLEX_ACCENT:
3232	*parsed_pattern++ = META_CIRCUMFLEX;
3233	break;
3234
3235	case CHAR_DOLLAR_SIGN:
3236	*parsed_pattern++ = META_DOLLAR;
3237	break;
3238
3239	case CHAR_DOT:
3240	*parsed_pattern++ = META_DOT;
3241	okquantifier = TRUE;
3242	break;
3243
3244
3245	/ ---- Single-character quantifiers ---- /
3246
3247	case CHAR_ASTERISK:
3248	meta_quantifier = META_ASTERISK;
3249	goto CHECK_QUANTIFIER;
3250
3251	case CHAR_PLUS:
3252	meta_quantifier = META_PLUS;
3253	goto CHECK_QUANTIFIER;
3254
3255	case CHAR_QUESTION_MARK:
3256	meta_quantifier = META_QUERY;
3257	goto CHECK_QUANTIFIER;
3258
3259
3260	/ ---- Potential {n,m} quantifier ---- /
3261
3262	case CHAR_LEFT_CURLY_BRACKET:
3263	if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3264	&errorcode))
3265	{
3266	if (errorcode != `0`) goto FAILED; / Error in quantifier. /
3267	PARSED_LITERAL(c, parsed_pattern); / Not a quantifier /
3268	break; / No more quantifier processing /
3269	}
3270	meta_quantifier = META_MINMAX;
3271	/ Fall through /
3272
3273
3274	/ ---- Quantifier post-processing ---- /
3275
3276	/ Check that a quantifier is allowed after the previous item. /
3277
3278	CHECK_QUANTIFIER:
3279	if (!prev_okquantifier)
3280	{
3281	errorcode = ERR9;
3282	goto FAILED_BACK;
3283	}
3284
3285	/ Most (VERB)s are not allowed to be quantified, but an ungreedy
3286	quantifier can be useful for (ACCEPT) - meaning "succeed on backtrack", a*
3287	sort of negated (COMMIT). We therefore allow (ACCEPT) to be quantified by
3288	wrapping it in non-capturing brackets, but we have to allow for a preceding
3289	(MARK) for when (ACCEPT) has an argument. /*
3290
3291	if (parsed_pattern[-`1`] == META_ACCEPT)
3292	{
3293	uint32_t *p;
3294	for (p = parsed_pattern - `1`; p >= verbstartptr; p--) p[`1`] = p[`0`];
3295	*verbstartptr = META_NOCAPTURE;
3296	parsed_pattern[`1`] = META_KET;
3297	parsed_pattern += `2`;
3298	}
3299
3300	/ Now we can put the quantifier into the parsed pattern vector. At this*
3301	stage, we have only the basic quantifier. The check for a following + or ?
3302	modifier happens at the top of the loop, after any intervening comments
3303	have been removed. /*
3304
3305	*parsed_pattern++ = meta_quantifier;
3306	if (c == CHAR_LEFT_CURLY_BRACKET)
3307	{
3308	*parsed_pattern++ = min_repeat;
3309	*parsed_pattern++ = max_repeat;
3310	}
3311	break;
3312
3313
3314	/ ---- Character class ---- /
3315
3316	case CHAR_LEFT_SQUARE_BRACKET:
3317	okquantifier = TRUE;
3318
3319	/ In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is*
3320	used for "start of word" and "end of word". As these are otherwise illegal
3321	sequences, we don't break anything by recognizing them. They are replaced
3322	by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3323	erroneous and are handled by the normal code below. /*
3324
3325	if (ptrend - ptr >= `6` &&
3326	(PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, `6`) == `0` \|\|
3327	PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, `6`) == `0`))
3328	{
3329	*parsed_pattern++ = META_ESCAPE + ESC_b;
3330
3331	if (ptr[`2`] == CHAR_LESS_THAN_SIGN)
3332	{
3333	*parsed_pattern++ = META_LOOKAHEAD;
3334	}
3335	else
3336	{
3337	*parsed_pattern++ = META_LOOKBEHIND;
3338	*has_lookbehind = TRUE;
3339
3340	/ The offset is used only for the "non-fixed length" error; this won't*
3341	occur here, so just store zero. /*
3342
3343	PUTOFFSET((PCRE2_SIZE)`0`, parsed_pattern);
3344	}
3345
3346	if ((options & PCRE2_UCP) == `0`)
3347	*parsed_pattern++ = META_ESCAPE + ESC_w;
3348	else
3349	{
3350	*parsed_pattern++ = META_ESCAPE + ESC_p;
3351	*parsed_pattern++ = PT_WORD << `16`;
3352	}
3353	*parsed_pattern++ = META_KET;
3354	ptr += `6`;
3355	break;
3356	}
3357
3358	/ PCRE supports POSIX class stuff inside a class. Perl gives an error if*
3359	they are encountered at the top level, so we'll do that too. /*
3360
3361	if (ptr < ptrend && (ptr == CHAR_COLON \|\| ptr == CHAR_DOT \|\|
3362	*ptr == CHAR_EQUALS_SIGN) &&
3363	check_posix_syntax(ptr, ptrend, &tempptr))
3364	{
3365	errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3366	goto FAILED;
3367	}
3368
3369	/ Process a regular character class. If the first character is '^', set*
3370	the negation flag. If the first few characters (either before or after ^)
3371	are \Q\E or \E or space or tab in extended-more mode, we skip them too.
3372	This makes for compatibility with Perl. /*
3373
3374	negate_class = FALSE;
3375	while (ptr < ptrend)
3376	{
3377	GETCHARINCTEST(c, ptr);
3378	if (c == CHAR_BACKSLASH)
3379	{
3380	if (ptr < ptrend && *ptr == CHAR_E) ptr++;
3381	else if (ptrend - ptr >= `3` &&
3382	PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, `3`) == `0`)
3383	ptr += `3`;
3384	else
3385	break;
3386	}
3387	else if ((options & PCRE2_EXTENDED_MORE) != `0` &&
3388	(c == CHAR_SPACE \|\| c == CHAR_HT)) / Note: just these two /
3389	continue;
3390	else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3391	negate_class = TRUE;
3392	else break;
3393	}
3394
3395	/ Now the real contents of the class; c has the first "real" character.*
3396	Empty classes are permitted only if the option is set. /*
3397
3398	if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3399	(cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != `0`)
3400	{
3401	*parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
3402	break; / End of class processing /
3403	}
3404
3405	/ Process a non-empty class. /
3406
3407	*parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
3408	class_range_state = RANGE_NO;
3409
3410	/ In an EBCDIC environment, Perl treats alphabetic ranges specially*
3411	because there are holes in the encoding, and simply using the range A-Z
3412	(for example) would include the characters in the holes. This applies only
3413	to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3414	in this respect. In order to accommodate this, we keep track of whether
3415	character values are literal or not, and a state variable for handling
3416	ranges. /*
3417
3418	/ Loop for the contents of the class /
3419
3420	for (;;)
3421	{
3422	BOOL char_is_literal = TRUE;
3423
3424	/ Inside \Q...\E everything is literal except \E /
3425
3426	if (inescq)
3427	{
3428	if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3429	{
3430	inescq = FALSE; / Reset literal state /
3431	ptr++; / Skip the 'E' /
3432	goto CLASS_CONTINUE;
3433	}
3434	goto CLASS_LITERAL;
3435	}
3436
3437	/ Skip over space and tab (only) in extended-more mode. /
3438
3439	if ((options & PCRE2_EXTENDED_MORE) != `0` &&
3440	(c == CHAR_SPACE \|\| c == CHAR_HT))
3441	goto CLASS_CONTINUE;
3442
3443	/ Handle POSIX class names. Perl allows a negation extension of the*
3444	form [:^name:]. A square bracket that doesn't match the syntax is
3445	treated as a literal. We also recognize the POSIX constructions
3446	[.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3447	5.6 and 5.8 do. /*
3448
3449	if (c == CHAR_LEFT_SQUARE_BRACKET &&
3450	ptrend - ptr >= `3` &&
3451	(ptr == CHAR_COLON \|\| ptr == CHAR_DOT \|\|
3452	*ptr == CHAR_EQUALS_SIGN) &&
3453	check_posix_syntax(ptr, ptrend, &tempptr))
3454	{
3455	BOOL posix_negate = FALSE;
3456	int posix_class;
3457
3458	/ Perl treats a hyphen before a POSIX class as a literal, not the*
3459	start of a range. However, it gives a warning in its warning mode. PCRE
3460	does not have a warning mode, so we give an error, because this is
3461	likely an error on the user's part. /*
3462
3463	if (class_range_state == RANGE_STARTED)
3464	{
3465	errorcode = ERR50;
3466	goto FAILED;
3467	}
3468
3469	if (*ptr != CHAR_COLON)
3470	{
3471	errorcode = ERR13;
3472	goto FAILED_BACK;
3473	}
3474
3475	if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
3476	{
3477	posix_negate = TRUE;
3478	ptr++;
3479	}
3480
3481	posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3482	if (posix_class < `0`)
3483	{
3484	errorcode = ERR30;
3485	goto FAILED;
3486	}
3487	ptr = tempptr + `2`;
3488
3489	/ Perl treats a hyphen after a POSIX class as a literal, not the*
3490	start of a range. However, it gives a warning in its warning mode
3491	unless the hyphen is the last character in the class. PCRE does not
3492	have a warning mode, so we give an error, because this is likely an
3493	error on the user's part. /*
3494
3495	if (ptr < ptrend - `1` && *ptr == CHAR_MINUS &&
3496	ptr[`1`] != CHAR_RIGHT_SQUARE_BRACKET)
3497	{
3498	errorcode = ERR50;
3499	goto FAILED;
3500	}
3501
3502	/ Set "a hyphen is not the start of a range" for the -] case, and also*
3503	in case the POSIX class is followed by \E or \Q\E (possibly repeated -
3504	fuzzers do that kind of thing) and then* a hyphen. This causes that*
3505	hyphen to be treated as a literal. I don't think it's worth setting up
3506	special apparatus to do otherwise. /*
3507
3508	class_range_state = RANGE_NO;
3509
3510	/ When PCRE2_UCP is set, some of the POSIX classes are converted to*
3511	use Unicode properties \p or \P or, in one case, \h or \H. The
3512	substitutes table has two values per class, containing the type and
3513	value of a \p or \P item. The special cases are specified with a
3514	negative type: a non-zero value causes \h or \H to be used, and a zero
3515	value falls through to behave like a non-UCP POSIX class. /*
3516
3517	#ifdef SUPPORT_UNICODE
3518	if ((options & PCRE2_UCP) != `0`)
3519	{
3520	int ptype = posix_substitutes[`2`*posix_class];
3521	int pvalue = posix_substitutes[`2`*posix_class + `1`];
3522	if (ptype >= `0`)
3523	{
3524	*parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
3525	*parsed_pattern++ = (ptype << `16`) \| pvalue;
3526	goto CLASS_CONTINUE;
3527	}
3528
3529	if (pvalue != `0`)
3530	{
3531	*parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
3532	goto CLASS_CONTINUE;
3533	}
3534
3535	/ Fall through /
3536	}
3537	#endif /* SUPPORT_UNICODE */
3538
3539	/ Non-UCP POSIX class /
3540
3541	*parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
3542	*parsed_pattern++ = posix_class;
3543	}
3544
3545	/ Handle potential start of range /
3546
3547	else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
3548	{
3549	*parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
3550	META_RANGE_LITERAL : META_RANGE_ESCAPED;
3551	class_range_state = RANGE_STARTED;
3552	}
3553
3554	/ Handle a literal character /
3555
3556	else if (c != CHAR_BACKSLASH)
3557	{
3558	CLASS_LITERAL:
3559	if (class_range_state == RANGE_STARTED)
3560	{
3561	if (c == parsed_pattern[-`2`]) / Optimize one-char range /
3562	parsed_pattern--;
3563	else if (parsed_pattern[-`2`] > c) / Check range is in order /
3564	{
3565	errorcode = ERR8;
3566	goto FAILED_BACK;
3567	}
3568	else
3569	{
3570	if (!char_is_literal && parsed_pattern[-`1`] == META_RANGE_LITERAL)
3571	parsed_pattern[-`1`] = META_RANGE_ESCAPED;
3572	PARSED_LITERAL(c, parsed_pattern);
3573	}
3574	class_range_state = RANGE_NO;
3575	}
3576	else / Potential start of range /
3577	{
3578	class_range_state = char_is_literal?
3579	RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
3580	PARSED_LITERAL(c, parsed_pattern);
3581	}
3582	}
3583
3584	/ Handle escapes in a class /
3585
3586	else
3587	{
3588	tempptr = ptr;
3589	escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3590	cb->cx->extra_options, TRUE, cb);
3591
3592	if (errorcode != `0`)
3593	{
3594	if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == `0`)
3595	goto FAILED;
3596	ptr = tempptr;
3597	if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3598	{
3599	GETCHARINCTEST(c, ptr); / Get character value, increment pointer /
3600	}
3601	escape = `0`; / Treat as literal character /
3602	}
3603
3604	switch(escape)
3605	{
3606	case `0`: / Escaped character code point is in c /
3607	char_is_literal = FALSE;
3608	goto CLASS_LITERAL;
3609
3610	case ESC_b:
3611	c = CHAR_BS; / \b is backspace in a class /
3612	char_is_literal = FALSE;
3613	goto CLASS_LITERAL;
3614
3615	case ESC_Q:
3616	inescq = TRUE; / Enter literal mode /
3617	goto CLASS_CONTINUE;
3618
3619	case ESC_E: / Ignore orphan \E /
3620	goto CLASS_CONTINUE;
3621
3622	case ESC_B: / Always an error in a class /
3623	case ESC_R:
3624	case ESC_X:
3625	errorcode = ERR7;
3626	ptr--;
3627	goto FAILED;
3628	}
3629
3630	/ The second part of a range can be a single-character escape*
3631	sequence (detected above), but not any of the other escapes. Perl
3632	treats a hyphen as a literal in such circumstances. However, in Perl's
3633	warning mode, a warning is given, so PCRE now faults it, as it is
3634	almost certainly a mistake on the user's part. /*
3635
3636	if (class_range_state == RANGE_STARTED)
3637	{
3638	errorcode = ERR50;
3639	goto FAILED; / Not CLASS_ESCAPE_FAILED; always an error /
3640	}
3641
3642	/ Of the remaining escapes, only those that define characters are*
3643	allowed in a class. None may start a range. /*
3644
3645	class_range_state = RANGE_NO;
3646	switch(escape)
3647	{
3648	case ESC_N:
3649	errorcode = ERR71;
3650	goto FAILED;
3651
3652	case ESC_H:
3653	case ESC_h:
3654	case ESC_V:
3655	case ESC_v:
3656	*parsed_pattern++ = META_ESCAPE + escape;
3657	break;
3658
3659	/ These escapes are converted to Unicode property tests when*
3660	PCRE2_UCP is set. /*
3661
3662	case ESC_d:
3663	case ESC_D:
3664	case ESC_s:
3665	case ESC_S:
3666	case ESC_w:
3667	case ESC_W:
3668	if ((options & PCRE2_UCP) == `0`)
3669	{
3670	*parsed_pattern++ = META_ESCAPE + escape;
3671	}
3672	else
3673	{
3674	*parsed_pattern++ = META_ESCAPE +
3675	((escape == ESC_d \|\| escape == ESC_s \|\| escape == ESC_w)?
3676	ESC_p : ESC_P);
3677	switch(escape)
3678	{
3679	case ESC_d:
3680	case ESC_D:
3681	*parsed_pattern++ = (PT_PC << `16`) \| ucp_Nd;
3682	break;
3683
3684	case ESC_s:
3685	case ESC_S:
3686	*parsed_pattern++ = PT_SPACE << `16`;
3687	break;
3688
3689	case ESC_w:
3690	case ESC_W:
3691	*parsed_pattern++ = PT_WORD << `16`;
3692	break;
3693	}
3694	}
3695	break;
3696
3697	/ Explicit Unicode property matching /
3698
3699	case ESC_P:
3700	case ESC_p:
3701	#ifdef SUPPORT_UNICODE
3702	{
3703	BOOL negated;
3704	uint16_t ptype = `0`, pdata = `0`;
3705	if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3706	goto FAILED;
3707	if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3708	*parsed_pattern++ = META_ESCAPE + escape;
3709	*parsed_pattern++ = (ptype << `16`) \| pdata;
3710	}
3711	#else
3712	errorcode = ERR45;
3713	goto FAILED;
3714	#endif
3715	break; / End \P and \p /
3716
3717	default: / All others are not allowed in a class /
3718	errorcode = ERR7;
3719	ptr--;
3720	goto FAILED;
3721	}
3722
3723	/* Perl gives a warning unless a following hyphen is the last character
3724	in the class. PCRE throws an error. */
3725
3726	if (ptr < ptrend - `1` && *ptr == CHAR_MINUS &&
3727	ptr[`1`] != CHAR_RIGHT_SQUARE_BRACKET)
3728	{
3729	errorcode = ERR50;
3730	goto FAILED;
3731	}
3732	}
3733
3734	/* Proceed to next thing in the class. */
3735
3736	CLASS_CONTINUE:
3737	if (ptr >= ptrend)
3738	{
3739	errorcode = ERR6; / Missing terminating ']' /
3740	goto FAILED;
3741	}
3742	GETCHARINCTEST(c, ptr);
3743	if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
3744	} / End of class-processing loop /
3745
3746	/* -] at the end of a class is a literal '-' */
3747
3748	if (class_range_state == RANGE_STARTED)
3749	{
3750	parsed_pattern[-`1`] = CHAR_MINUS;
3751	class_range_state = RANGE_NO;
3752	}
3753
3754	*parsed_pattern++ = META_CLASS_END;
3755	break; / End of character class /
3756
3757
3758	/* ---- Opening parenthesis ---- */
3759
3760	case CHAR_LEFT_PARENTHESIS:
3761	if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3762
3763	/* If ( is not followed by ? it is either a capture or a special verb or an
3764	alpha assertion or a positive non-atomic lookahead. */
3765
3766	if (*ptr != CHAR_QUESTION_MARK)
3767	{
3768	const char *vn;
3769
3770	/* Handle capturing brackets (or non-capturing if auto-capture is turned
3771	off). */
3772
3773	if (*ptr != CHAR_ASTERISK)
3774	{
3775	nest_depth++;
3776	if ((options & PCRE2_NO_AUTO_CAPTURE) == `0`)
3777	{
3778	if (cb->bracount >= MAX_GROUP_NUMBER)
3779	{
3780	errorcode = ERR97;
3781	goto FAILED;
3782	}
3783	cb->bracount++;
3784	*parsed_pattern++ = META_CAPTURE \| cb->bracount;
3785	}
3786	else *parsed_pattern++ = META_NOCAPTURE;
3787	}
3788
3789	/* Do nothing for (* followed by end of pattern or ) so it gives a "bad
3790	quantifier" error rather than "(*MARK) must have an argument". */
3791
3792	else if (ptrend - ptr <= `1` \|\| (c = ptr[`1`]) == CHAR_RIGHT_PARENTHESIS)
3793	break;
3794
3795	/* Handle "alpha assertions" such as (*pla:...). Most of these are
3796	synonyms for the historical symbolic assertions, but the script run and
3797	non-atomic lookaround ones are new. They are distinguished by starting
3798	with a lower case letter. Checking both ends of the alphabet makes this
3799	work in all character codes. */
3800
3801	else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != `0`)
3802	{
3803	uint32_t meta;
3804
3805	vn = alasnames;
3806	if (!read_name(&ptr, ptrend, utf, `0`, &offset, &name, &namelen,
3807	&errorcode, cb)) goto FAILED;
3808	if (ptr >= ptrend \|\| *ptr != CHAR_COLON)
3809	{
3810	errorcode = ERR95; / Malformed /
3811	goto FAILED;
3812	}
3813
3814	/* Scan the table of alpha assertion names */
3815
3816	for (i = `0`; i < alascount; i++)
3817	{
3818	if (namelen == alasmeta[i].len &&
3819	PRIV(strncmp_c8)(name, vn, namelen) == `0`)
3820	break;
3821	vn += alasmeta[i].len + `1`;
3822	}
3823
3824	if (i >= alascount)
3825	{
3826	errorcode = ERR95; / Alpha assertion not recognized /
3827	goto FAILED;
3828	}
3829
3830	/* Check for expecting an assertion condition. If so, only atomic
3831	lookaround assertions are valid. */
3832
3833	meta = alasmeta[i].meta;
3834	if (prev_expect_cond_assert > `0` &&
3835	(meta < META_LOOKAHEAD \|\| meta > META_LOOKBEHINDNOT))
3836	{
3837	errorcode = (meta == META_LOOKAHEAD_NA \|\| meta == META_LOOKBEHIND_NA)?
3838	ERR98 : ERR28; / (Atomic) assertion expected /
3839	goto FAILED;
3840	}
3841
3842	/* The lookaround alphabetic synonyms can mostly be handled by jumping
3843	to the code that handles the traditional symbolic forms. */
3844
3845	switch(meta)
3846	{
3847	default:
3848	errorcode = ERR89; / Unknown code; should never occur because /
3849	goto FAILED; / the meta values come from a table above. /
3850
3851	case META_ATOMIC:
3852	goto ATOMIC_GROUP;
3853
3854	case META_LOOKAHEAD:
3855	goto POSITIVE_LOOK_AHEAD;
3856
3857	case META_LOOKAHEAD_NA:
3858	goto POSITIVE_NONATOMIC_LOOK_AHEAD;
3859
3860	case META_LOOKAHEADNOT:
3861	goto NEGATIVE_LOOK_AHEAD;
3862
3863	case META_LOOKBEHIND:
3864	case META_LOOKBEHINDNOT:
3865	case META_LOOKBEHIND_NA:
3866	*parsed_pattern++ = meta;
3867	ptr--;
3868	goto POST_LOOKBEHIND;
3869
3870	/* The script run facilities are handled here. Unicode support is
3871	required (give an error if not, as this is a security issue). Always
3872	record a META_SCRIPT_RUN item. Then, for the atomic version, insert
3873	META_ATOMIC and remember that we need two META_KETs at the end. */
3874
3875	case META_SCRIPT_RUN:
3876	case META_ATOMIC_SCRIPT_RUN:
3877	#ifdef SUPPORT_UNICODE
3878	*parsed_pattern++ = META_SCRIPT_RUN;
3879	nest_depth++;
3880	ptr++;
3881	if (meta == META_ATOMIC_SCRIPT_RUN)
3882	{
3883	*parsed_pattern++ = META_ATOMIC;
3884	if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
3885	else if (++top_nest >= end_nests)
3886	{
3887	errorcode = ERR84;
3888	goto FAILED;
3889	}
3890	top_nest->nest_depth = nest_depth;
3891	top_nest->flags = NSF_ATOMICSR;
3892	top_nest->options = options & PARSE_TRACKED_OPTIONS;
3893	}
3894	break;
3895	#else /* SUPPORT_UNICODE */
3896	errorcode = ERR96;
3897	goto FAILED;
3898	#endif
3899	}
3900	}
3901
3902
3903	/* ---- Handle (*VERB) and (*VERB:NAME) ---- */
3904
3905	else
3906	{
3907	vn = verbnames;
3908	if (!read_name(&ptr, ptrend, utf, `0`, &offset, &name, &namelen,
3909	&errorcode, cb)) goto FAILED;
3910	if (ptr >= ptrend \|\| (*ptr != CHAR_COLON &&
3911	*ptr != CHAR_RIGHT_PARENTHESIS))
3912	{
3913	errorcode = ERR60; / Malformed /
3914	goto FAILED;
3915	}
3916
3917	/* Scan the table of verb names */
3918
3919	for (i = `0`; i < verbcount; i++)
3920	{
3921	if (namelen == verbs[i].len &&
3922	PRIV(strncmp_c8)(name, vn, namelen) == `0`)
3923	break;
3924	vn += verbs[i].len + `1`;
3925	}
3926
3927	if (i >= verbcount)
3928	{
3929	errorcode = ERR60; / Verb not recognized /
3930	goto FAILED;
3931	}
3932
3933	/* An empty argument is treated as no argument. */
3934
3935	if (*ptr == CHAR_COLON && ptr + `1` < ptrend &&
3936	ptr[`1`] == CHAR_RIGHT_PARENTHESIS)
3937	ptr++; / Advance to the closing parens /
3938
3939	/* Check for mandatory non-empty argument; this is (*MARK) */
3940
3941	if (verbs[i].has_arg > `0` && *ptr != CHAR_COLON)
3942	{
3943	errorcode = ERR66;
3944	goto FAILED;
3945	}
3946
3947	/* Remember where this verb, possibly with a preceding (*MARK), starts,
3948	for handling quantified (*ACCEPT). */
3949
3950	verbstartptr = parsed_pattern;
3951	okquantifier = (verbs[i].meta == META_ACCEPT);
3952
3953	/* It appears that Perl allows any characters whatsoever, other than a
3954	closing parenthesis, to appear in arguments ("names"), so we no longer
3955	insist on letters, digits, and underscores. Perl does not, however, do
3956	any interpretation within arguments, and has no means of including a
3957	closing parenthesis. PCRE supports escape processing but only when it
3958	is requested by an option. We set inverbname TRUE here, and let the
3959	main loop take care of this so that escape and \x processing is done by
3960	the main code above. */
3961
3962	if (ptr++ == CHAR_COLON) /* Skip past : or ) /
3963	{
3964	/* Some optional arguments can be treated as a preceding (*MARK) */
3965
3966	if (verbs[i].has_arg < `0`)
3967	{
3968	add_after_mark = verbs[i].meta;
3969	*parsed_pattern++ = META_MARK;
3970	}
3971
3972	/* The remaining verbs with arguments (except *MARK) need a different
3973	opcode. */
3974
3975	else
3976	{
3977	*parsed_pattern++ = verbs[i].meta +
3978	((verbs[i].meta != META_MARK)? `0x00010000u`:`0`);
3979	}
3980
3981	/* Set up for reading the name in the main loop. */
3982
3983	verblengthptr = parsed_pattern++;
3984	verbnamestart = ptr;
3985	inverbname = TRUE;
3986	}
3987	else / No verb "name" argument /
3988	{
3989	*parsed_pattern++ = verbs[i].meta;
3990	}
3991	} / End of (VERB) handling /*
3992	break; / Done with this parenthesis /
3993	} / End of groups that don't start with (? /
3994
3995
3996	/* ---- Items starting (? ---- */
3997
3998	/* The type of item is determined by what follows (?. Handle (?| and option
3999	changes under "default" because both need a new block on the nest stack.
4000	Comments starting with (?# are handled above. Note that there is some
4001	ambiguity about the sequence (?- because if a digit follows it's a relative
4002	recursion or subroutine call whereas otherwise it's an option unsetting. */
4003
4004	if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4005
4006	switch(*ptr)
4007	{
4008	default:
4009	if (*ptr == CHAR_MINUS && ptrend - ptr > `1` && IS_DIGIT(ptr[`1`]))
4010	goto RECURSION_BYNUMBER; / The + case is handled by CHAR_PLUS /
4011
4012	/* We now have either (?| or a (possibly empty) option setting,
4013	optionally followed by a non-capturing group. */
4014
4015	nest_depth++;
4016	if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4017	else if (++top_nest >= end_nests)
4018	{
4019	errorcode = ERR84;
4020	goto FAILED;
4021	}
4022	top_nest->nest_depth = nest_depth;
4023	top_nest->flags = `0`;
4024	top_nest->options = options & PARSE_TRACKED_OPTIONS;
4025
4026	/* Start of non-capturing group that resets the capture count for each
4027	branch. */
4028
4029	if (*ptr == CHAR_VERTICAL_LINE)
4030	{
4031	top_nest->reset_group = (uint16_t)cb->bracount;
4032	top_nest->max_group = (uint16_t)cb->bracount;
4033	top_nest->flags \|= NSF_RESET;
4034	cb->external_flags \|= PCRE2_DUPCAPUSED;
4035	*parsed_pattern++ = META_NOCAPTURE;
4036	ptr++;
4037	}
4038
4039	/* Scan for options imnsxJU to be set or unset. */
4040
4041	else
4042	{
4043	BOOL hyphenok = TRUE;
4044	uint32_t oldoptions = options;
4045
4046	top_nest->reset_group = `0`;
4047	top_nest->max_group = `0`;
4048	set = unset = `0`;
4049	optset = &set;
4050
4051	/* ^ at the start unsets imnsx and disables the subsequent use of - */
4052
4053	if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
4054	{
4055	options &= ~(PCRE2_CASELESS\|PCRE2_MULTILINE\|PCRE2_NO_AUTO_CAPTURE\|
4056	PCRE2_DOTALL\|PCRE2_EXTENDED\|PCRE2_EXTENDED_MORE);
4057	hyphenok = FALSE;
4058	ptr++;
4059	}
4060
4061	while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
4062	*ptr != CHAR_COLON)
4063	{
4064	switch (*ptr++)
4065	{
4066	case CHAR_MINUS:
4067	if (!hyphenok)
4068	{
4069	errorcode = ERR94;
4070	ptr--; / Correct the offset /
4071	goto FAILED;
4072	}
4073	optset = &unset;
4074	hyphenok = FALSE;
4075	break;
4076
4077	case CHAR_J: / Record that it changed in the external options /
4078	*optset \|= PCRE2_DUPNAMES;
4079	cb->external_flags \|= PCRE2_JCHANGED;
4080	break;
4081
4082	case CHAR_i: optset \|= PCRE2_CASELESS; break*;
4083	case CHAR_m: optset \|= PCRE2_MULTILINE; break*;
4084	case CHAR_n: optset \|= PCRE2_NO_AUTO_CAPTURE; break*;
4085	case CHAR_s: optset \|= PCRE2_DOTALL; break*;
4086	case CHAR_U: optset \|= PCRE2_UNGREEDY; break*;
4087
4088	/* If x appears twice it sets the extended extended option. */
4089
4090	case CHAR_x:
4091	*optset \|= PCRE2_EXTENDED;
4092	if (ptr < ptrend && *ptr == CHAR_x)
4093	{
4094	*optset \|= PCRE2_EXTENDED_MORE;
4095	ptr++;
4096	}
4097	break;
4098
4099	default:
4100	errorcode = ERR11;
4101	ptr--; / Correct the offset /
4102	goto FAILED;
4103	}
4104	}
4105
4106	/* If we are setting extended without extended-more, ensure that any
4107	existing extended-more gets unset. Also, unsetting extended must also
4108	unset extended-more. */
4109
4110	if ((set & (PCRE2_EXTENDED\|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED \|\|
4111	(unset & PCRE2_EXTENDED) != `0`)
4112	unset \|= PCRE2_EXTENDED_MORE;
4113
4114	options = (options \| set) & (~unset);
4115
4116	/* If the options ended with ')' this is not the start of a nested
4117	group with option changes, so the options change at this level.
4118	In this case, if the previous level set up a nest block, discard the
4119	one we have just created. Otherwise adjust it for the previous level.
4120	If the options ended with ':' we are starting a non-capturing group,
4121	possibly with an options setting. */
4122
4123	if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4124	if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
4125	{
4126	nest_depth--; / This is not a nested group after all. /
4127	if (top_nest > (nest_save *)(cb->start_workspace) &&
4128	(top_nest-`1`)->nest_depth == nest_depth) top_nest--;
4129	else top_nest->nest_depth = nest_depth;
4130	}
4131	else *parsed_pattern++ = META_NOCAPTURE;
4132
4133	/* If nothing changed, no need to record. */
4134
4135	if (options != oldoptions)
4136	{
4137	*parsed_pattern++ = META_OPTIONS;
4138	*parsed_pattern++ = options;
4139	}
4140	} / End options processing /
4141	break; / End default case after (? /
4142
4143
4144	/* ---- Python syntax support ---- */
4145
4146	case CHAR_P:
4147	if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4148
4149	/* (?P<name> is the same as (?<name>, which defines a named group. */
4150
4151	if (*ptr == CHAR_LESS_THAN_SIGN)
4152	{
4153	terminator = CHAR_GREATER_THAN_SIGN;
4154	goto DEFINE_NAME;
4155	}
4156
4157	/* (?P>name) is the same as (?&name), which is a recursion or subroutine
4158	call. */
4159
4160	if (ptr == CHAR_GREATER_THAN_SIGN) goto* RECURSE_BY_NAME;
4161
4162	/* (?P=name) is the same as \k<name>, a back reference by name. Anything
4163	else after (?P is an error. */
4164
4165	if (*ptr != CHAR_EQUALS_SIGN)
4166	{
4167	errorcode = ERR41;
4168	goto FAILED;
4169	}
4170	if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4171	&namelen, &errorcode, cb)) goto FAILED;
4172	*parsed_pattern++ = META_BACKREF_BYNAME;
4173	*parsed_pattern++ = namelen;
4174	PUTOFFSET(offset, parsed_pattern);
4175	okquantifier = TRUE;
4176	break; / End of (?P processing /
4177
4178
4179	/* ---- Recursion/subroutine calls by number ---- */
4180
4181	case CHAR_R:
4182	i = `0`; / (?R) == (?R0) /
4183	ptr++;
4184	if (ptr >= ptrend \|\| *ptr != CHAR_RIGHT_PARENTHESIS)
4185	{
4186	errorcode = ERR58;
4187	goto FAILED;
4188	}
4189	goto SET_RECURSION;
4190
4191	/* An item starting (?- followed by a digit comes here via the "default"
4192	case because (?- followed by a non-digit is an options setting. */
4193
4194	case CHAR_PLUS:
4195	if (ptrend - ptr < `2` \|\| !IS_DIGIT(ptr[`1`]))
4196	{
4197	errorcode = ERR29; / Missing number /
4198	goto FAILED;
4199	}
4200	/* Fall through */
4201
4202	case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4203	case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4204	RECURSION_BYNUMBER:
4205	if (!read_number(&ptr, ptrend,
4206	(IS_DIGIT(ptr))? -`1`:(int)(cb->bracount), /* + and - are relative /
4207	MAX_GROUP_NUMBER, ERR61,
4208	&i, &errorcode)) goto FAILED;
4209	if (i < `0`) / NB (?0) is permitted /
4210	{
4211	errorcode = ERR15; / Unknown group /
4212	goto FAILED_BACK;
4213	}
4214	if (ptr >= ptrend \|\| *ptr != CHAR_RIGHT_PARENTHESIS)
4215	goto UNCLOSED_PARENTHESIS;
4216
4217	SET_RECURSION:
4218	*parsed_pattern++ = META_RECURSE \| (uint32_t)i;
4219	offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4220	ptr++;
4221	PUTOFFSET(offset, parsed_pattern);
4222	okquantifier = TRUE;
4223	break; / End of recursive call by number handling /
4224
4225
4226	/* ---- Recursion/subroutine calls by name ---- */
4227
4228	case CHAR_AMPERSAND:
4229	RECURSE_BY_NAME:
4230	if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4231	&namelen, &errorcode, cb)) goto FAILED;
4232	*parsed_pattern++ = META_RECURSE_BYNAME;
4233	*parsed_pattern++ = namelen;
4234	PUTOFFSET(offset, parsed_pattern);
4235	okquantifier = TRUE;
4236	break;
4237
4238	/* ---- Callout with numerical or string argument ---- */
4239
4240	case CHAR_C:
4241	if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4242
4243	/* If the previous item was a condition starting (?(? an assertion,
4244	optionally preceded by a callout, is expected. This is checked later on,
4245	during actual compilation. However we need to identify this kind of
4246	assertion in this pass because it must not be qualified. The value of
4247	expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4248	for a callout - still leaving a positive value that identifies the
4249	assertion. Multiple callouts or any other items will make it zero or
4250	less, which doesn't matter because they will cause an error later. */
4251
4252	expect_cond_assert = prev_expect_cond_assert - `1`;
4253
4254	/* If previous_callout is not NULL, it means this follows a previous
4255	callout. If it was a manual callout, do nothing; this means its "length
4256	of next pattern item" field will remain zero. If it was an automatic
4257	callout, abolish it. */
4258
4259	if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != `0` &&
4260	previous_callout == parsed_pattern - `4` &&
4261	parsed_pattern[-`1`] == `255`)
4262	parsed_pattern = previous_callout;
4263
4264	/* Save for updating next pattern item length, and skip one item before
4265	completing. */
4266
4267	previous_callout = parsed_pattern;
4268	after_manual_callout = `1`;
4269
4270	/* Handle a string argument; specific delimiter is required. */
4271
4272	if (ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(ptr))
4273	{
4274	PCRE2_SIZE calloutlength;
4275	PCRE2_SPTR startptr = ptr;
4276
4277	delimiter = `0`;
4278	for (i = `0`; PRIV(callout_start_delims)[i] != `0`; i++)
4279	{
4280	if (*ptr == PRIV(callout_start_delims)[i])
4281	{
4282	delimiter = PRIV(callout_end_delims)[i];
4283	break;
4284	}
4285	}
4286	if (delimiter == `0`)
4287	{
4288	errorcode = ERR82;
4289	goto FAILED;
4290	}
4291
4292	*parsed_pattern = META_CALLOUT_STRING;
4293	parsed_pattern += `3`; / Skip pattern info /
4294
4295	for (;;)
4296	{
4297	if (++ptr >= ptrend)
4298	{
4299	errorcode = ERR81;
4300	ptr = startptr; / To give a more useful message /
4301	goto FAILED;
4302	}
4303	if (ptr == delimiter && (++ptr >= ptrend \|\| ptr != delimiter))
4304	break;
4305	}
4306
4307	calloutlength = (PCRE2_SIZE)(ptr - startptr);
4308	if (calloutlength > UINT32_MAX)
4309	{
4310	errorcode = ERR72;
4311	goto FAILED;
4312	}
4313	*parsed_pattern++ = (uint32_t)calloutlength;
4314	offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
4315	PUTOFFSET(offset, parsed_pattern);
4316	}
4317
4318	/* Handle a callout with an optional numerical argument, which must be
4319	less than or equal to 255. A missing argument gives 0. */
4320
4321	else
4322	{
4323	int n = `0`;
4324	parsed_pattern = META_CALLOUT_NUMBER; /* Numerical callout /
4325	parsed_pattern += `3`; / Skip pattern info /
4326	while (ptr < ptrend && IS_DIGIT(*ptr))
4327	{
4328	n = n * `10` + *ptr++ - CHAR_0;
4329	if (n > `255`)
4330	{
4331	errorcode = ERR38;
4332	goto FAILED;
4333	}
4334	}
4335	*parsed_pattern++ = n;
4336	}
4337
4338	/* Both formats must have a closing parenthesis */
4339
4340	if (ptr >= ptrend \|\| *ptr != CHAR_RIGHT_PARENTHESIS)
4341	{
4342	errorcode = ERR39;
4343	goto FAILED;
4344	}
4345	ptr++;
4346
4347	/* Remember the offset to the next item in the pattern, and set a default
4348	length. This should get updated after the next item is read. */
4349
4350	previous_callout[`1`] = (uint32_t)(ptr - cb->start_pattern);
4351	previous_callout[`2`] = `0`;
4352	break; / End callout /
4353
4354
4355	/* ---- Conditional group ---- */
4356
4357	/* A condition can be an assertion, a number (referring to a numbered
4358	group's having been set), a name (referring to a named group), or 'R',
4359	referring to overall recursion. R<digits> and R&name are also permitted
4360	for recursion state tests. Numbers may be preceded by + or - to specify a
4361	relative group number.
4362
4363	There are several syntaxes for testing a named group: (?(name)) is used
4364	by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4365
4366	There are two unfortunate ambiguities. 'R' can be the recursive thing or
4367	the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
4368	the Perl DEFINE feature or the Python named test. We look for a name
4369	first; if not found, we try the other case.
4370
4371	For compatibility with auto-callouts, we allow a callout to be specified
4372	before a condition that is an assertion. */
4373
4374	case CHAR_LEFT_PARENTHESIS:
4375	if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4376	nest_depth++;
4377
4378	/* If the next character is ? or * there must be an assertion next
4379	(optionally preceded by a callout). We do not check this here, but
4380	instead we set expect_cond_assert to 2. If this is still greater than
4381	zero (callouts decrement it) when the next assertion is read, it will be
4382	marked as a condition that must not be repeated. A value greater than
4383	zero also causes checking that an assertion (possibly with callout)
4384	follows. */
4385
4386	if (ptr == CHAR_QUESTION_MARK \|\| ptr == CHAR_ASTERISK)
4387	{
4388	*parsed_pattern++ = META_COND_ASSERT;
4389	ptr--; / Pull pointer back to the opening parenthesis. /
4390	expect_cond_assert = `2`;
4391	break; / End of conditional /
4392	}
4393
4394	/* Handle (?([+-]number)... */
4395
4396	if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
4397	&errorcode))
4398	{
4399	if (i <= `0`)
4400	{
4401	errorcode = ERR15;
4402	goto FAILED;
4403	}
4404	*parsed_pattern++ = META_COND_NUMBER;
4405	offset = (PCRE2_SIZE)(ptr - cb->start_pattern - `2`);
4406	PUTOFFSET(offset, parsed_pattern);
4407	*parsed_pattern++ = i;
4408	}
4409	else if (errorcode != `0`) goto FAILED; / Number too big /
4410
4411	/* No number found. Handle the special case (?(VERSION[>]=n.m)... */
4412
4413	else if (ptrend - ptr >= `10` &&
4414	PRIV(strncmp_c8)(ptr, STRING_VERSION, `7`) == `0` &&
4415	ptr[`7`] != CHAR_RIGHT_PARENTHESIS)
4416	{
4417	uint32_t ge = `0`;
4418	int major = `0`;
4419	int minor = `0`;
4420
4421	ptr += `7`;
4422	if (*ptr == CHAR_GREATER_THAN_SIGN)
4423	{
4424	ge = `1`;
4425	ptr++;
4426	}
4427
4428	/* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
4429	references its argument twice. */
4430
4431	if (ptr != CHAR_EQUALS_SIGN \|\| (ptr++, !IS_DIGIT(ptr)))
4432	goto BAD_VERSION_CONDITION;
4433
4434	if (!read_number(&ptr, ptrend, -`1`, `1000`, ERR79, &major, &errorcode))
4435	goto FAILED;
4436
4437	if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4438	if (*ptr == CHAR_DOT)
4439	{
4440	if (++ptr >= ptrend \|\| !IS_DIGIT(ptr)) goto* BAD_VERSION_CONDITION;
4441	minor = (ptr++ - CHAR_0) `10`;
4442	if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4443	if (IS_DIGIT(ptr)) minor += ptr++ - CHAR_0;
4444	if (ptr >= ptrend \|\| *ptr != CHAR_RIGHT_PARENTHESIS)
4445	goto BAD_VERSION_CONDITION;
4446	}
4447
4448	*parsed_pattern++ = META_COND_VERSION;
4449	*parsed_pattern++ = ge;
4450	*parsed_pattern++ = major;
4451	*parsed_pattern++ = minor;
4452	}
4453
4454	/* All the remaining cases now require us to read a name. We cannot at
4455	this stage distinguish ambiguous cases such as (?(R12) which might be a
4456	recursion test by number or a name, because the named groups have not yet
4457	all been identified. Those cases are treated as names, but given a
4458	different META code. */
4459
4460	else
4461	{
4462	BOOL was_r_ampersand = FALSE;
4463
4464	if (*ptr == CHAR_R && ptrend - ptr > `1` && ptr[`1`] == CHAR_AMPERSAND)
4465	{
4466	terminator = CHAR_RIGHT_PARENTHESIS;
4467	was_r_ampersand = TRUE;
4468	ptr++;
4469	}
4470	else if (*ptr == CHAR_LESS_THAN_SIGN)
4471	terminator = CHAR_GREATER_THAN_SIGN;
4472	else if (*ptr == CHAR_APOSTROPHE)
4473	terminator = CHAR_APOSTROPHE;
4474	else
4475	{
4476	terminator = CHAR_RIGHT_PARENTHESIS;
4477	ptr--; / Point to char before name /
4478	}
4479	if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4480	&errorcode, cb)) goto FAILED;
4481
4482	/* Handle (?(R&name) */
4483
4484	if (was_r_ampersand)
4485	{
4486	*parsed_pattern = META_COND_RNAME;
4487	ptr--; / Back to closing parens /
4488	}
4489
4490	/* Handle (?(name). If the name is "DEFINE" we identify it with a
4491	special code. Likewise if the name consists of R followed only by
4492	digits. Otherwise, handle it like a quoted name. */
4493
4494	else if (terminator == CHAR_RIGHT_PARENTHESIS)
4495	{
4496	if (namelen == `6` && PRIV(strncmp_c8)(name, STRING_DEFINE, `6`) == `0`)
4497	*parsed_pattern = META_COND_DEFINE;
4498	else
4499	{
4500	for (i = `1`; i < (int)namelen; i++)
4501	if (!IS_DIGIT(name[i])) break;
4502	parsed_pattern = (name == CHAR_R && i >= (int)namelen)?
4503	META_COND_RNUMBER : META_COND_NAME;
4504	}
4505	ptr--; / Back to closing parens /
4506	}
4507
4508	/* Handle (?('name') or (?(<name>) */
4509
4510	else *parsed_pattern = META_COND_NAME;
4511
4512	/* All these cases except DEFINE end with the name length and offset;
4513	DEFINE just has an offset (for the "too many branches" error). */
4514
4515	if (parsed_pattern++ != META_COND_DEFINE) parsed_pattern++ = namelen;
4516	PUTOFFSET(offset, parsed_pattern);
4517	} / End cases that read a name /
4518
4519	/* Check the closing parenthesis of the condition */
4520
4521	if (ptr >= ptrend \|\| *ptr != CHAR_RIGHT_PARENTHESIS)
4522	{
4523	errorcode = ERR24;
4524	goto FAILED;
4525	}
4526	ptr++;
4527	break; / End of condition processing /
4528
4529
4530	/* ---- Atomic group ---- */
4531
4532	case CHAR_GREATER_THAN_SIGN:
4533	ATOMIC_GROUP: / Come from (atomic: /*
4534	*parsed_pattern++ = META_ATOMIC;
4535	nest_depth++;
4536	ptr++;
4537	break;
4538
4539
4540	/* ---- Lookahead assertions ---- */
4541
4542	case CHAR_EQUALS_SIGN:
4543	POSITIVE_LOOK_AHEAD: / Come from (pla: /*
4544	*parsed_pattern++ = META_LOOKAHEAD;
4545	ptr++;
4546	goto POST_ASSERTION;
4547
4548	case CHAR_ASTERISK:
4549	POSITIVE_NONATOMIC_LOOK_AHEAD: / Come from (?* /
4550	*parsed_pattern++ = META_LOOKAHEAD_NA;
4551	ptr++;
4552	goto POST_ASSERTION;
4553
4554	case CHAR_EXCLAMATION_MARK:
4555	NEGATIVE_LOOK_AHEAD: / Come from (nla: /*
4556	*parsed_pattern++ = META_LOOKAHEADNOT;
4557	ptr++;
4558	goto POST_ASSERTION;
4559
4560
4561	/* ---- Lookbehind assertions ---- */
4562
4563	/* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
4564	is the start of the name of a capturing group. */
4565
4566	case CHAR_LESS_THAN_SIGN:
4567	if (ptrend - ptr <= `1` \|\|
4568	(ptr[`1`] != CHAR_EQUALS_SIGN &&
4569	ptr[`1`] != CHAR_EXCLAMATION_MARK &&
4570	ptr[`1`] != CHAR_ASTERISK))
4571	{
4572	terminator = CHAR_GREATER_THAN_SIGN;
4573	goto DEFINE_NAME;
4574	}
4575	*parsed_pattern++ = (ptr[`1`] == CHAR_EQUALS_SIGN)?
4576	META_LOOKBEHIND : (ptr[`1`] == CHAR_EXCLAMATION_MARK)?
4577	META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
4578
4579	POST_LOOKBEHIND: / Come from (plb: (naplb: and (nlb: /*
4580	*has_lookbehind = TRUE;
4581	offset = (PCRE2_SIZE)(ptr - cb->start_pattern - `2`);
4582	PUTOFFSET(offset, parsed_pattern);
4583	ptr += `2`;
4584	/* Fall through */
4585
4586	/* If the previous item was a condition starting (?(? an assertion,
4587	optionally preceded by a callout, is expected. This is checked later on,
4588	during actual compilation. However we need to identify this kind of
4589	assertion in this pass because it must not be qualified. The value of
4590	expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4591	for a callout - still leaving a positive value that identifies the
4592	assertion. Multiple callouts or any other items will make it zero or
4593	less, which doesn't matter because they will cause an error later. */
4594
4595	POST_ASSERTION:
4596	nest_depth++;
4597	if (prev_expect_cond_assert > `0`)
4598	{
4599	if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4600	else if (++top_nest >= end_nests)
4601	{
4602	errorcode = ERR84;
4603	goto FAILED;
4604	}
4605	top_nest->nest_depth = nest_depth;
4606	top_nest->flags = NSF_CONDASSERT;
4607	top_nest->options = options & PARSE_TRACKED_OPTIONS;
4608	}
4609	break;
4610
4611
4612	/* ---- Define a named group ---- */
4613
4614	/* A named group may be defined as (?'name') or (?<name>). In the latter
4615	case we jump to DEFINE_NAME from the disambiguation of (?< above with the
4616	terminator set to '>'. */
4617
4618	case CHAR_APOSTROPHE:
4619	terminator = CHAR_APOSTROPHE; / Terminator /
4620
4621	DEFINE_NAME:
4622	if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4623	&errorcode, cb)) goto FAILED;
4624
4625	/* We have a name for this capturing group. It is also assigned a number,
4626	which is its primary means of identification. */
4627
4628	if (cb->bracount >= MAX_GROUP_NUMBER)
4629	{
4630	errorcode = ERR97;
4631	goto FAILED;
4632	}
4633	cb->bracount++;
4634	*parsed_pattern++ = META_CAPTURE \| cb->bracount;
4635	nest_depth++;
4636
4637	/* Check not too many names */
4638
4639	if (cb->names_found >= MAX_NAME_COUNT)
4640	{
4641	errorcode = ERR49;
4642	goto FAILED;
4643	}
4644
4645	/* Adjust the entry size to accommodate the longest name found. */
4646
4647	if (namelen + IMM2_SIZE + `1` > cb->name_entry_size)
4648	cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + `1`);
4649
4650	/* Scan the list to check for duplicates. For duplicate names, if the
4651	number is the same, break the loop, which causes the name to be
4652	discarded; otherwise, if DUPNAMES is not set, give an error.
4653	If it is set, allow the name with a different number, but continue
4654	scanning in case this is a duplicate with the same number. For
4655	non-duplicate names, give an error if the number is duplicated. */
4656
4657	isdupname = FALSE;
4658	ng = cb->named_groups;
4659	for (i = `0`; i < cb->names_found; i++, ng++)
4660	{
4661	if (namelen == ng->length &&
4662	PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == `0`)
4663	{
4664	if (ng->number == cb->bracount) break;
4665	if ((options & PCRE2_DUPNAMES) == `0`)
4666	{
4667	errorcode = ERR43;
4668	goto FAILED;
4669	}
4670	isdupname = ng->isdup = TRUE; / Mark as a duplicate /
4671	cb->dupnames = TRUE; / Duplicate names exist /
4672	}
4673	else if (ng->number == cb->bracount)
4674	{
4675	errorcode = ERR65;
4676	goto FAILED;
4677	}
4678	}
4679
4680	if (i < cb->names_found) break; / Ignore duplicate with same number /
4681
4682	/* Increase the list size if necessary */
4683
4684	if (cb->names_found >= cb->named_group_list_size)
4685	{
4686	uint32_t newsize = cb->named_group_list_size * `2`;
4687	named_group *newspace =
4688	cb->cx->memctl.malloc(newsize * sizeof(named_group),
4689	cb->cx->memctl.memory_data);
4690	if (newspace == NULL)
4691	{
4692	errorcode = ERR21;
4693	goto FAILED;
4694	}
4695
4696	memcpy(newspace, cb->named_groups,
4697	cb->named_group_list_size * sizeof(named_group));
4698	if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
4699	cb->cx->memctl.free((void *)cb->named_groups,
4700	cb->cx->memctl.memory_data);
4701	cb->named_groups = newspace;
4702	cb->named_group_list_size = newsize;
4703	}
4704
4705	/* Add this name to the list */
4706
4707	cb->named_groups[cb->names_found].name = name;
4708	cb->named_groups[cb->names_found].length = (uint16_t)namelen;
4709	cb->named_groups[cb->names_found].number = cb->bracount;
4710	cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
4711	cb->names_found++;
4712	break;
4713	} / End of (? switch /
4714	break; / End of ( handling /
4715
4716
4717	/* ---- Branch terminators ---- */
4718
4719	/* Alternation: reset the capture count if we are in a (?| group. */
4720
4721	case CHAR_VERTICAL_LINE:
4722	if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
4723	(top_nest->flags & NSF_RESET) != `0`)
4724	{
4725	if (cb->bracount > top_nest->max_group)
4726	top_nest->max_group = (uint16_t)cb->bracount;
4727	cb->bracount = top_nest->reset_group;
4728	}
4729	*parsed_pattern++ = META_ALT;
4730	break;
4731
4732	/* End of group; reset the capture count to the maximum if we are in a (?|
4733	group and/or reset the options that are tracked during parsing. Disallow
4734	quantifier for a condition that is an assertion. */
4735
4736	case CHAR_RIGHT_PARENTHESIS:
4737	okquantifier = TRUE;
4738	if (top_nest != NULL && top_nest->nest_depth == nest_depth)
4739	{
4740	options = (options & ~PARSE_TRACKED_OPTIONS) \| top_nest->options;
4741	if ((top_nest->flags & NSF_RESET) != `0` &&
4742	top_nest->max_group > cb->bracount)
4743	cb->bracount = top_nest->max_group;
4744	if ((top_nest->flags & NSF_CONDASSERT) != `0`)
4745	okquantifier = FALSE;
4746
4747	if ((top_nest->flags & NSF_ATOMICSR) != `0`)
4748	{
4749	*parsed_pattern++ = META_KET;
4750	}
4751
4752	if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
4753	else top_nest--;
4754	}
4755	if (nest_depth == `0`) / Unmatched closing parenthesis /
4756	{
4757	errorcode = ERR22;
4758	goto FAILED_BACK;
4759	}
4760	nest_depth--;
4761	*parsed_pattern++ = META_KET;
4762	break;
4763	} / End of switch on pattern character /
4764	} / End of main character scan loop /
4765
4766	/ End of pattern reached. Check for missing ) at the end of a verb name. /
4767
4768	if (inverbname && ptr >= ptrend)
4769	{
4770	errorcode = ERR60;
4771	goto FAILED;
4772	}
4773
4774	/ Manage callout for the final item /
4775
4776	PARSED_END:
4777	parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
4778	parsed_pattern, cb);
4779
4780	/ Insert trailing items for word and line matching (features provided for the*
4781	benefit of pcre2grep). /*
4782
4783	if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != `0`)
4784	{
4785	*parsed_pattern++ = META_KET;
4786	*parsed_pattern++ = META_DOLLAR;
4787	}
4788	else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != `0`)
4789	{
4790	*parsed_pattern++ = META_KET;
4791	*parsed_pattern++ = META_ESCAPE + ESC_b;
4792	}
4793
4794	/ Terminate the parsed pattern, then return success if all groups are closed.*
4795	Otherwise we have unclosed parentheses. /*
4796
4797	if (parsed_pattern >= parsed_pattern_end)
4798	{
4799	errorcode = ERR63; / Internal error (parsed pattern overflow) /
4800	goto FAILED;
4801	}
4802
4803	*parsed_pattern = META_END;
4804	if (nest_depth == `0`) return `0`;
4805
4806	UNCLOSED_PARENTHESIS:
4807	errorcode = ERR14;
4808
4809	/ Come here for all failures. /
4810
4811	FAILED:
4812	cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4813	return errorcode;
4814
4815	/ Some errors need to indicate the previous character. /
4816
4817	FAILED_BACK:
4818	ptr--;
4819	goto FAILED;
4820
4821	/ This failure happens several times. /
4822
4823	BAD_VERSION_CONDITION:
4824	errorcode = ERR79;
4825	goto FAILED;
4826	}
4827
4828
4829
4830	/*************************************************
4831	* Find first significant opcode *
4832	*************************************************/
4833
4834	/ This is called by several functions that scan a compiled expression looking*
4835	for a fixed first character, or an anchoring opcode etc. It skips over things
4836	that do not influence this. For some calls, it makes sense to skip negative
4837	forward and all backward assertions, and also the \b assertion; for others it
4838	does not.
4839
4840	Arguments:
4841	code pointer to the start of the group
4842	skipassert TRUE if certain assertions are to be skipped
4843
4844	Returns: pointer to the first significant opcode
4845	*/
4846
4847	static const PCRE2_UCHAR*
4848	first_significant_code(PCRE2_SPTR code, BOOL skipassert)
4849	{
4850	for (;;)
4851	{
4852	switch ((int)*code)
4853	{
4854	case OP_ASSERT_NOT:
4855	case OP_ASSERTBACK:
4856	case OP_ASSERTBACK_NOT:
4857	case OP_ASSERTBACK_NA:
4858	if (!skipassert) return code;
4859	do code += GET(code, `1`); while (*code == OP_ALT);
4860	code += PRIV(OP_lengths)[*code];
4861	break;
4862
4863	case OP_WORD_BOUNDARY:
4864	case OP_NOT_WORD_BOUNDARY:
4865	if (!skipassert) return code;
4866	/ Fall through /
4867
4868	case OP_CALLOUT:
4869	case OP_CREF:
4870	case OP_DNCREF:
4871	case OP_RREF:
4872	case OP_DNRREF:
4873	case OP_FALSE:
4874	case OP_TRUE:
4875	code += PRIV(OP_lengths)[*code];
4876	break;
4877
4878	case OP_CALLOUT_STR:
4879	code += GET(code, `1` + `2`*LINK_SIZE);
4880	break;
4881
4882	case OP_SKIPZERO:
4883	code += `2` + GET(code, `2`) + LINK_SIZE;
4884	break;
4885
4886	case OP_COND:
4887	case OP_SCOND:
4888	if (code[`1`+LINK_SIZE] != OP_FALSE \|\| / Not DEFINE /
4889	code[GET(code, `1`)] != OP_KET) / More than one branch /
4890	return code;
4891	code += GET(code, `1`) + `1` + LINK_SIZE;
4892	break;
4893
4894	case OP_MARK:
4895	case OP_COMMIT_ARG:
4896	case OP_PRUNE_ARG:
4897	case OP_SKIP_ARG:
4898	case OP_THEN_ARG:
4899	code += code[`1`] + PRIV(OP_lengths)[*code];
4900	break;
4901
4902	default:
4903	return code;
4904	}
4905	}
4906	/ Control never reaches here /
4907	}
4908
4909
4910
#ifdef SUPPORT_UNICODE
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range in UCP mode. It
searches up the characters, looking for ranges of characters in the "other"
case. Each call returns the next one, updating the start address. A character
with multiple other cases is returned on its own with a special return value.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        -1 when no more
               0 when a range is returned
              >0 the CASESET offset for char with multiple other cases
                in this case, ocptr contains the original
*/

static int
get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
  uint32_t *odptr)
{
uint32_t c, othercase, next;
unsigned int co;

/* Find the first character that has an other case. If it has multiple other
cases, return its case offset value. */

for (c = *cptr; c <= d; c++)
  {
  if ((co = UCD_CASESET(c)) != 0)
    {
    *ocptr = c++;   /* Character that has the set */
    *cptr = c;      /* Rest of input range */
    return (int)co;
    }
  if ((othercase = UCD_OTHERCASE(c)) != c) break;
  }

if (c > d) return -1;  /* Reached end of range */

/* Found a character that has a single other case. Search for the end of the
range, which is either the end of the input range, or a character that has zero
or more than one other cases. */

*ocptr = othercase;
next = othercase + 1;

/* The othercase range continues only while each successive input character
maps to the next consecutive othercase value. */

for (++c; c <= d; c++)
  {
  if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
  next++;
  }

*odptr = next - 1;     /* End of othercase range */
*cptr = c;             /* Rest of input range */
return 0;
}
#endif  /* SUPPORT_UNICODE */
4974
4975
4976
4977	/*************************************************
4978	* Add a character or range to a class (internal) *
4979	*************************************************/
4980
4981	/ This function packages up the logic of adding a character or range of*
4982	characters to a class. The character values in the arguments will be within the
4983	valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4984	called only from within the "add to class" group of functions, some of which
4985	are recursive and mutually recursive. The external entry point is
4986	add_to_class().
4987
4988	Arguments:
4989	classbits the bit map for characters < 256
4990	uchardptr points to the pointer for extra data
4991	options the options word
4992	cb compile data
4993	start start of range character
4994	end end of range character
4995
4996	Returns: the number of < 256 characters added
4997	the pointer to extra data is updated
4998	*/
4999
5000	static unsigned int
5001	add_to_class_internal(uint8_t classbits, PCRE2_UCHAR *uchardptr,
5002	uint32_t options, compile_block *cb, uint32_t start, uint32_t end)
5003	{
5004	uint32_t c;
5005	uint32_t classbits_end = (end <= `0xff` ? end : `0xff`);
5006	unsigned int n8 = `0`;
5007
5008	/ If caseless matching is required, scan the range and process alternate*
5009	cases. In Unicode, there are 8-bit characters that have alternate cases that
5010	are greater than 255 and vice-versa. Sometimes we can just extend the original
5011	range. /*
5012
5013	if ((options & PCRE2_CASELESS) != `0`)
5014	{
5015	#ifdef SUPPORT_UNICODE
5016	if ((options & (PCRE2_UTF\|PCRE2_UCP)) != `0`)
5017	{
5018	int rc;
5019	uint32_t oc, od;
5020
5021	options &= ~PCRE2_CASELESS; / Remove for recursive calls /
5022	c = start;
5023
5024	while ((rc = get_othercase_range(&c, end, &oc, &od)) >= `0`)
5025	{
5026	/ Handle a single character that has more than one other case. /
5027
5028	if (rc > `0`) n8 += add_list_to_class_internal(classbits, uchardptr, options, cb,
5029	PRIV(ucd_caseless_sets) + rc, oc);
5030
5031	/ Do nothing if the other case range is within the original range. /
5032
5033	else if (oc >= cb->class_range_start && od <= cb->class_range_end) continue;
5034
5035	/ Extend the original range if there is overlap, noting that if oc < c, we*
5036	can't have od > end because a subrange is always shorter than the basic
5037	range. Otherwise, use a recursive call to add the additional range. /*
5038
5039	else if (oc < start && od >= start - `1`) start = oc; / Extend downwards /
5040	else if (od > end && oc <= end + `1`)
5041	{
5042	end = od; / Extend upwards /
5043	if (end > classbits_end) classbits_end = (end <= `0xff` ? end : `0xff`);
5044	}
5045	else n8 += add_to_class_internal(classbits, uchardptr, options, cb, oc, od);
5046	}
5047	}
5048	else
5049	#endif /* SUPPORT_UNICODE */
5050
5051	/ Not UTF mode /
5052
5053	for (c = start; c <= classbits_end; c++)
5054	{
5055	SETBIT(classbits, cb->fcc[c]);
5056	n8++;
5057	}
5058	}
5059
5060	/ Now handle the originally supplied range. Adjust the final value according*
5061	to the bit length - this means that the same lists of (e.g.) horizontal spaces
5062	can be used in all cases. /*
5063
5064	if ((options & PCRE2_UTF) == `0` && end > MAX_NON_UTF_CHAR)
5065	end = MAX_NON_UTF_CHAR;
5066
5067	if (start > cb->class_range_start && end < cb->class_range_end) return n8;
5068
5069	/ Use the bitmap for characters < 256. Otherwise use extra data./
5070
5071	for (c = start; c <= classbits_end; c++)
5072	{
5073	/ Regardless of start, c will always be <= 255. /
5074	SETBIT(classbits, c);
5075	n8++;
5076	}
5077
5078	#ifdef SUPPORT_WIDE_CHARS
5079	if (start <= `0xff`) start = `0xff` + `1`;
5080
5081	if (end >= start)
5082	{
5083	PCRE2_UCHAR uchardata = uchardptr;
5084
5085	#ifdef SUPPORT_UNICODE
5086	if ((options & PCRE2_UTF) != `0`)
5087	{
5088	if (start < end)
5089	{
5090	*uchardata++ = XCL_RANGE;
5091	uchardata += PRIV(ord2utf)(start, uchardata);
5092	uchardata += PRIV(ord2utf)(end, uchardata);
5093	}
5094	else if (start == end)
5095	{
5096	*uchardata++ = XCL_SINGLE;
5097	uchardata += PRIV(ord2utf)(start, uchardata);
5098	}
5099	}
5100	else
5101	#endif /* SUPPORT_UNICODE */
5102
5103	/ Without UTF support, character values are constrained by the bit length,*
5104	and can only be > 256 for 16-bit and 32-bit libraries. /*
5105
5106	#if PCRE2_CODE_UNIT_WIDTH == 8
5107	{}
5108	#else
5109	if (start < end)
5110	{
5111	*uchardata++ = XCL_RANGE;
5112	*uchardata++ = start;
5113	*uchardata++ = end;
5114	}
5115	else if (start == end)
5116	{
5117	*uchardata++ = XCL_SINGLE;
5118	*uchardata++ = start;
5119	}
5120	#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
5121	uchardptr = uchardata; /* Updata extra data pointer /
5122	}
5123	#else /* SUPPORT_WIDE_CHARS */
5124	(void)uchardptr; / Avoid compiler warning /
5125	#endif /* SUPPORT_WIDE_CHARS */
5126
5127	return n8; / Number of 8-bit characters /
5128	}
5129
5130
5131
#ifdef SUPPORT_UNICODE
/*************************************************
* Add a list of characters to a class (internal) *
*************************************************/

/* This function is used for adding a list of case-equivalent characters to a
class when in UTF mode. This function is called only from within
add_to_class_internal(), with which it is mutually recursive.

Arguments:
  classbits     the bit map for characters < 256
  uchardptr     points to the pointer for extra data
  options       the options word
  cb            contains pointers to tables etc.
  p             points to row of 32-bit values, terminated by NOTACHAR
  except        character to omit; this is used when adding lists of
                  case-equivalent characters to avoid including the one we
                  already know about

Returns:        the number of < 256 characters added
                the pointer to extra data is updated
*/

static unsigned int
add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
  uint32_t options, compile_block *cb, const uint32_t *p, unsigned int except)
{
unsigned int n8 = 0;
while (p[0] < NOTACHAR)
  {
  unsigned int n = 0;
  if (p[0] != except)
    {
    /* Gather a run of consecutive values so it is added as one range. */
    while(p[n+1] == p[0] + n + 1) n++;
    n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
    }
  p += n + 1;
  }
return n8;
}
#endif
5173
5174
5175
5176	/*************************************************
5177	* External entry point for add range to class *
5178	*************************************************/
5179
5180	/ This function sets the overall range so that the internal functions can try*
5181	to avoid duplication when handling case-independence.
5182
5183	Arguments:
5184	classbits the bit map for characters < 256
5185	uchardptr points to the pointer for extra data
5186	options the options word
5187	cb compile data
5188	start start of range character
5189	end end of range character
5190
5191	Returns: the number of < 256 characters added
5192	the pointer to extra data is updated
5193	*/
5194
5195	static unsigned int
5196	add_to_class(uint8_t classbits, PCRE2_UCHAR *uchardptr, uint32_t options,
5197	compile_block *cb, uint32_t start, uint32_t end)
5198	{
5199	cb->class_range_start = start;
5200	cb->class_range_end = end;
5201	return add_to_class_internal(classbits, uchardptr, options, cb, start, end);
5202	}
5203
5204
5205	/*************************************************
5206	* External entry point for add list to class *
5207	*************************************************/
5208
5209	/ This function is used for adding a list of horizontal or vertical whitespace*
5210	characters to a class. The list must be in order so that ranges of characters
5211	can be detected and handled appropriately. This function sets the overall range
5212	so that the internal functions can try to avoid duplication when handling
5213	case-independence.
5214
5215	Arguments:
5216	classbits the bit map for characters < 256
5217	uchardptr points to the pointer for extra data
5218	options the options word
5219	cb contains pointers to tables etc.
5220	p points to row of 32-bit values, terminated by NOTACHAR
5221	except character to omit; this is used when adding lists of
5222	case-equivalent characters to avoid including the one we
5223	already know about
5224
5225	Returns: the number of < 256 characters added
5226	the pointer to extra data is updated
5227	*/
5228
5229	static unsigned int
5230	add_list_to_class(uint8_t classbits, PCRE2_UCHAR *uchardptr, uint32_t options,
5231	compile_block cb, const* uint32_t p, unsigned* int except)
5232	{
5233	unsigned int n8 = `0`;
5234	while (p[`0`] < NOTACHAR)
5235	{
5236	unsigned int n = `0`;
5237	if (p[`0`] != except)
5238	{
5239	while(p[n+`1`] == p[`0`] + n + `1`) n++;
5240	cb->class_range_start = p[`0`];
5241	cb->class_range_end = p[n];
5242	n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[`0`], p[n]);
5243	}
5244	p += n + `1`;
5245	}
5246	return n8;
5247	}
5248
5249
5250
5251	/*************************************************
5252	* Add characters not in a list to a class *
5253	*************************************************/
5254
5255	/ This function is used for adding the complement of a list of horizontal or*
5256	vertical whitespace to a class. The list must be in order.
5257
5258	Arguments:
5259	classbits the bit map for characters < 256
5260	uchardptr points to the pointer for extra data
5261	options the options word
5262	cb contains pointers to tables etc.
5263	p points to row of 32-bit values, terminated by NOTACHAR
5264
5265	Returns: the number of < 256 characters added
5266	the pointer to extra data is updated
5267	*/
5268
5269	static unsigned int
5270	add_not_list_to_class(uint8_t classbits, PCRE2_UCHAR *uchardptr,
5271	uint32_t options, compile_block cb, const* uint32_t *p)
5272	{
5273	BOOL utf = (options & PCRE2_UTF) != `0`;
5274	unsigned int n8 = `0`;
5275	if (p[`0`] > `0`)
5276	n8 += add_to_class(classbits, uchardptr, options, cb, `0`, p[`0`] - `1`);
5277	while (p[`0`] < NOTACHAR)
5278	{
5279	while (p[`1`] == p[`0`] + `1`) p++;
5280	n8 += add_to_class(classbits, uchardptr, options, cb, p[`0`] + `1`,
5281	(p[`1`] == NOTACHAR) ? (utf ? `0x10ffffu` : `0xffffffffu`) : p[`1`] - `1`);
5282	p++;
5283	}
5284	return n8;
5285	}
5286
5287
5288
5289	/*************************************************
5290	* Find details of duplicate group names *
5291	*************************************************/
5292
5293	/ This is called from compile_branch() when it needs to know the index and*
5294	count of duplicates in the names table when processing named backreferences,
5295	either directly, or as conditions.
5296
5297	Arguments:
5298	name points to the name
5299	length the length of the name
5300	indexptr where to put the index
5301	countptr where to put the count of duplicates
5302	errorcodeptr where to put an error code
5303	cb the compile block
5304
5305	Returns: TRUE if OK, FALSE if not, error code set
5306	*/
5307
5308	static BOOL
5309	find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
5310	int countptr, int* errorcodeptr, compile_block cb)
5311	{
5312	uint32_t i, groupnumber;
5313	int count;
5314	PCRE2_UCHAR *slot = cb->name_table;
5315
5316	/ Find the first entry in the table /
5317
5318	for (i = `0`; i < cb->names_found; i++)
5319	{
5320	if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == `0` &&
5321	slot[IMM2_SIZE+length] == `0`) break;
5322	slot += cb->name_entry_size;
5323	}
5324
5325	/ This should not occur, because this function is called only when we know we*
5326	have duplicate names. Give an internal error. /*
5327
5328	if (i >= cb->names_found)
5329	{
5330	*errorcodeptr = ERR53;
5331	cb->erroroffset = name - cb->start_pattern;
5332	return FALSE;
5333	}
5334
5335	/ Record the index and then see how many duplicates there are, updating the*
5336	backref map and maximum back reference as we do. /*
5337
5338	*indexptr = i;
5339	count = `0`;
5340
5341	for (;;)
5342	{
5343	count++;
5344	groupnumber = GET2(slot,`0`);
5345	cb->backref_map \|= (groupnumber < `32`)? (`1u` << groupnumber) : `1`;
5346	if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
5347	if (++i >= cb->names_found) break;
5348	slot += cb->name_entry_size;
5349	if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != `0` \|\|
5350	(slot+IMM2_SIZE)[length] != `0`) break;
5351	}
5352
5353	*countptr = count;
5354	return TRUE;
5355	}
5356
5357
5358
5359	/*************************************************
5360	* Compile one branch *
5361	*************************************************/
5362
5363	/ Scan the parsed pattern, compiling it into the a vector of PCRE2_UCHAR. If*
5364	the options are changed during the branch, the pointer is used to change the
5365	external options bits. This function is used during the pre-compile phase when
5366	we are trying to find out the amount of memory needed, as well as during the
5367	real compile phase. The value of lengthptr distinguishes the two phases.
5368
5369	Arguments:
5370	optionsptr pointer to the option bits
5371	codeptr points to the pointer to the current code point
5372	pptrptr points to the current parsed pattern pointer
5373	errorcodeptr points to error code variable
5374	firstcuptr place to put the first required code unit
5375	firstcuflagsptr place to put the first code unit flags
5376	reqcuptr place to put the last required code unit
5377	reqcuflagsptr place to put the last required code unit flags
5378	bcptr points to current branch chain
5379	cb contains pointers to tables etc.
5380	lengthptr NULL during the real compile phase
5381	points to length accumulator during pre-compile phase
5382
5383	Returns: 0 There's been an error, errorcodeptr is non-zero*
5384	+1 Success, this branch must match at least one character
5385	-1 Success, this branch may match an empty string
5386	*/
5387
5388	static int
5389	compile_branch(uint32_t optionsptr, PCRE2_UCHAR codeptr, uint32_t *pptrptr,
5390	int errorcodeptr, uint32_t firstcuptr, uint32_t *firstcuflagsptr,
5391	uint32_t reqcuptr, uint32_t reqcuflagsptr, branch_chain *bcptr,
5392	compile_block cb, PCRE2_SIZE lengthptr)
5393	{
5394	int bravalue = `0`;
5395	int okreturn = -`1`;
5396	int group_return = `0`;
5397	uint32_t repeat_min = `0`, repeat_max = `0`; / To please picky compilers /
5398	uint32_t greedy_default, greedy_non_default;
5399	uint32_t repeat_type, op_type;
5400	uint32_t options = optionsptr; /* May change dynamically /
5401	uint32_t firstcu, reqcu;
5402	uint32_t zeroreqcu, zerofirstcu;
5403	uint32_t escape;
5404	uint32_t pptr = pptrptr;
5405	uint32_t meta, meta_arg;
5406	uint32_t firstcuflags, reqcuflags;
5407	uint32_t zeroreqcuflags, zerofirstcuflags;
5408	uint32_t req_caseopt, reqvary, tempreqvary;
5409	PCRE2_SIZE offset = `0`;
5410	PCRE2_SIZE length_prevgroup = `0`;
5411	PCRE2_UCHAR code = codeptr;
5412	PCRE2_UCHAR *last_code = code;
5413	PCRE2_UCHAR *orig_code = code;
5414	PCRE2_UCHAR *tempcode;
5415	PCRE2_UCHAR *previous = NULL;
5416	PCRE2_UCHAR op_previous;
5417	BOOL groupsetfirstcu = FALSE;
5418	BOOL had_accept = FALSE;
5419	BOOL matched_char = FALSE;
5420	BOOL previous_matched_char = FALSE;
5421	BOOL reset_caseful = FALSE;
5422	const uint8_t *cbits = cb->cbits;
5423	uint8_t classbits[`32`];
5424
5425	/ We can fish out the UTF setting once and for all into a BOOL, but we must*
5426	not do this for other options (e.g. PCRE2_EXTENDED) because they may change
5427	dynamically as we process the pattern. /*
5428
5429	#ifdef SUPPORT_UNICODE
5430	BOOL utf = (options & PCRE2_UTF) != `0`;
5431	BOOL ucp = (options & PCRE2_UCP) != `0`;
5432	#else /* No Unicode support */
5433	BOOL utf = FALSE;
5434	#endif
5435
5436	/ Helper variables for OP_XCLASS opcode (for characters > 255). We define*
5437	class_uchardata always so that it can be passed to add_to_class() always,
5438	though it will not be used in non-UTF 8-bit cases. This avoids having to supply
5439	alternative calls for the different cases. /*
5440
5441	PCRE2_UCHAR *class_uchardata;
5442	#ifdef SUPPORT_WIDE_CHARS
5443	BOOL xclass;
5444	PCRE2_UCHAR *class_uchardata_base;
5445	#endif
5446
5447	/ Set up the default and non-default settings for greediness /
5448
5449	greedy_default = ((options & PCRE2_UNGREEDY) != `0`);
5450	greedy_non_default = greedy_default ^ `1`;
5451
5452	/ Initialize no first unit, no required unit. REQ_UNSET means "no char*
5453	matching encountered yet". It gets changed to REQ_NONE if we hit something that
5454	matches a non-fixed first unit; reqcu just remains unset if we never find one.
5455
5456	When we hit a repeat whose minimum is zero, we may have to adjust these values
5457	to take the zero repeat into account. This is implemented by setting them to
5458	zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
5459	item types that can be repeated set these backoff variables appropriately. /*
5460
5461	firstcu = reqcu = zerofirstcu = zeroreqcu = `0`;
5462	firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
5463
5464	/ The variable req_caseopt contains either the REQ_CASELESS bit or zero,*
5465	according to the current setting of the caseless flag. The REQ_CASELESS value
5466	leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables
5467	to record the case status of the value. This is used only for ASCII characters.
5468	*/
5469
5470	req_caseopt = ((options & PCRE2_CASELESS) != `0`)? REQ_CASELESS : `0`;
5471
5472	/ Switch on next META item until the end of the branch /
5473
5474	for (;; pptr++)
5475	{
5476	#ifdef SUPPORT_WIDE_CHARS
5477	BOOL xclass_has_prop;
5478	#endif
5479	BOOL negate_class;
5480	BOOL should_flip_negation;
5481	BOOL match_all_or_no_wide_chars;
5482	BOOL possessive_quantifier;
5483	BOOL note_group_empty;
5484	int class_has_8bitchar;
5485	uint32_t mclength;
5486	uint32_t skipunits;
5487	uint32_t subreqcu, subfirstcu;
5488	uint32_t groupnumber;
5489	uint32_t verbarglen, verbculen;
5490	uint32_t subreqcuflags, subfirstcuflags;
5491	open_capitem *oc;
5492	PCRE2_UCHAR mcbuffer[`8`];
5493
5494	/ Get next META item in the pattern and its potential argument. /
5495
5496	meta = META_CODE(*pptr);
5497	meta_arg = META_DATA(*pptr);
5498
5499	/ If we are in the pre-compile phase, accumulate the length used for the*
5500	previous cycle of this loop, unless the next item is a quantifier. /*
5501
5502	if (lengthptr != NULL)
5503	{
5504	if (code > cb->start_workspace + cb->workspace_size -
5505	WORK_SIZE_SAFETY_MARGIN) / Check for overrun /
5506	{
5507	*errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
5508	ERR52 : ERR86;
5509	return `0`;
5510	}
5511
5512	/ There is at least one situation where code goes backwards: this is the*
5513	case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
5514	is processed, the whole class is eliminated. However, it is created first,
5515	so we have to allow memory for it. Therefore, don't ever reduce the length
5516	at this point. /*
5517
5518	if (code < last_code) code = last_code;
5519
5520	/ If the next thing is not a quantifier, we add the length of the previous*
5521	item into the total, and reset the code pointer to the start of the
5522	workspace. Otherwise leave the previous item available to be quantified. /*
5523
5524	if (meta < META_ASTERISK \|\| meta > META_MINMAX_QUERY)
5525	{
5526	if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
5527	{
5528	errorcodeptr = ERR20; /* Integer overflow /
5529	return `0`;
5530	}
5531	*lengthptr += (PCRE2_SIZE)(code - orig_code);
5532	if (*lengthptr > MAX_PATTERN_SIZE)
5533	{
5534	errorcodeptr = ERR20; /* Pattern is too large /
5535	return `0`;
5536	}
5537	code = orig_code;
5538	}
5539
5540	/ Remember where this code item starts so we can catch the "backwards"*
5541	case above next time round. /*
5542
5543	last_code = code;
5544	}
5545
5546	/ Process the next parsed pattern item. If it is not a quantifier, remember*
5547	where it starts so that it can be quantified when a quantifier follows.
5548	Checking for the legality of quantifiers happens in parse_regex(), except for
5549	a quantifier after an assertion that is a condition. /*
5550
5551	if (meta < META_ASTERISK \|\| meta > META_MINMAX_QUERY)
5552	{
5553	previous = code;
5554	if (matched_char && !had_accept) okreturn = `1`;
5555	}
5556
5557	previous_matched_char = matched_char;
5558	matched_char = FALSE;
5559	note_group_empty = FALSE;
5560	skipunits = `0`; / Default value for most subgroups /
5561
5562	switch(meta)
5563	{
5564	/ ===================================================================/
5565	/ The branch terminates at pattern end or \| or ) /
5566
5567	case META_END:
5568	case META_ALT:
5569	case META_KET:
5570	*firstcuptr = firstcu;
5571	*firstcuflagsptr = firstcuflags;
5572	*reqcuptr = reqcu;
5573	*reqcuflagsptr = reqcuflags;
5574	*codeptr = code;
5575	*pptrptr = pptr;
5576	return okreturn;
5577
5578
5579	/ ===================================================================/
5580	/ Handle single-character metacharacters. In multiline mode, ^ disables*
5581	the setting of any following char as a first character. /*
5582
5583	case META_CIRCUMFLEX:
5584	if ((options & PCRE2_MULTILINE) != `0`)
5585	{
5586	if (firstcuflags == REQ_UNSET)
5587	zerofirstcuflags = firstcuflags = REQ_NONE;
5588	*code++ = OP_CIRCM;
5589	}
5590	else *code++ = OP_CIRC;
5591	break;
5592
5593	case META_DOLLAR:
5594	*code++ = ((options & PCRE2_MULTILINE) != `0`)? OP_DOLLM : OP_DOLL;
5595	break;
5596
5597	/ There can never be a first char if '.' is first, whatever happens about*
5598	repeats. The value of reqcu doesn't change either. /*
5599
5600	case META_DOT:
5601	matched_char = TRUE;
5602	if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5603	zerofirstcu = firstcu;
5604	zerofirstcuflags = firstcuflags;
5605	zeroreqcu = reqcu;
5606	zeroreqcuflags = reqcuflags;
5607	*code++ = ((options & PCRE2_DOTALL) != `0`)? OP_ALLANY: OP_ANY;
5608	break;
5609
5610
5611	/ ===================================================================/
5612	/ Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.*
5613	Otherwise, an initial ']' is taken as a data character. When empty classes
5614	are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must
5615	match any character, so generate OP_ALLANY. /*
5616
5617	case META_CLASS_EMPTY:
5618	case META_CLASS_EMPTY_NOT:
5619	matched_char = TRUE;
5620	*code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
5621	if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5622	zerofirstcu = firstcu;
5623	zerofirstcuflags = firstcuflags;
5624	break;
5625
5626
5627	/ ===================================================================/
5628	/ Non-empty character class. If the included characters are all < 256, we*
5629	build a 32-byte bitmap of the permitted characters, except in the special
5630	case where there is only one such character. For negated classes, we build
5631	the map as usual, then invert it at the end. However, we use a different
5632	opcode so that data characters > 255 can be handled correctly.
5633
5634	If the class contains characters outside the 0-255 range, a different
5635	opcode is compiled. It may optionally have a bit map for characters < 256,
5636	but those above are are explicitly listed afterwards. A flag code unit
5637	tells whether the bitmap is present, and whether this is a negated class or
5638	not. /*
5639
5640	case META_CLASS_NOT:
5641	case META_CLASS:
5642	matched_char = TRUE;
5643	negate_class = meta == META_CLASS_NOT;
5644
5645	/ We can optimize the case of a single character in a class by generating*
5646	OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
5647	negative. In the negative case there can be no first char if this item is
5648	first, whatever repeat count may follow. In the case of reqcu, save the
5649	previous value for reinstating. /*
5650
5651	/ NOTE: at present this optimization is not effective if the only*
5652	character in a class in 32-bit, non-UCP mode has its top bit set. /*
5653
5654	if (pptr[`1`] < META_END && pptr[`2`] == META_CLASS_END)
5655	{
5656	#ifdef SUPPORT_UNICODE
5657	uint32_t d;
5658	#endif
5659	uint32_t c = pptr[`1`];
5660
5661	pptr += `2`; / Move on to class end /
5662	if (meta == META_CLASS) / A positive one-char class can be /
5663	{ / handled as a normal literal character. /
5664	meta = c; / Set up the character /
5665	goto NORMAL_CHAR_SET;
5666	}
5667
5668	/ Handle a negative one-character class /
5669
5670	zeroreqcu = reqcu;
5671	zeroreqcuflags = reqcuflags;
5672	if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5673	zerofirstcu = firstcu;
5674	zerofirstcuflags = firstcuflags;
5675
5676	/ For caseless UTF or UCP mode, check whether this character has more*
5677	than one other case. If so, generate a special OP_NOTPROP item instead of
5678	OP_NOTI. /*
5679
5680	#ifdef SUPPORT_UNICODE
5681	if ((utf\|\|ucp) && (options & PCRE2_CASELESS) != `0` &&
5682	(d = UCD_CASESET(c)) != `0`)
5683	{
5684	*code++ = OP_NOTPROP;
5685	*code++ = PT_CLIST;
5686	*code++ = d;
5687	break; / We are finished with this class /
5688	}
5689	#endif
5690	/ Char has only one other case, or UCP not available /
5691
5692	*code++ = ((options & PCRE2_CASELESS) != `0`)? OP_NOTI: OP_NOT;
5693	code += PUTCHAR(c, code);
5694	break; / We are finished with this class /
5695	} / End of 1-char optimization /
5696
5697	/ Handle character classes that contain more than just one literal*
5698	character. If there are exactly two characters in a positive class, see if
5699	they are case partners. This can be optimized to generate a caseless single
5700	character match (which also sets first/required code units if relevant). /*
5701
5702	if (meta == META_CLASS && pptr[`1`] < META_END && pptr[`2`] < META_END &&
5703	pptr[`3`] == META_CLASS_END)
5704	{
5705	uint32_t c = pptr[`1`];
5706
5707	#ifdef SUPPORT_UNICODE
5708	if (UCD_CASESET(c) == `0`)
5709	#endif
5710	{
5711	uint32_t d;
5712
5713	#ifdef SUPPORT_UNICODE
5714	if ((utf \|\| ucp) && c > `127`) d = UCD_OTHERCASE(c); else
5715	#endif
5716	{
5717	#if PCRE2_CODE_UNIT_WIDTH != 8
5718	if (c > `255`) d = c; else
5719	#endif
5720	d = TABLE_GET(c, cb->fcc, c);
5721	}
5722
5723	if (c != d && pptr[`2`] == d)
5724	{
5725	pptr += `3`; / Move on to class end /
5726	meta = c;
5727	if ((options & PCRE2_CASELESS) == `0`)
5728	{
5729	reset_caseful = TRUE;
5730	options \|= PCRE2_CASELESS;
5731	req_caseopt = REQ_CASELESS;
5732	}
5733	goto CLASS_CASELESS_CHAR;
5734	}
5735	}
5736	}
5737
5738	/ If a non-extended class contains a negative special such as \S, we need*
5739	to flip the negation flag at the end, so that support for characters > 255
5740	works correctly (they are all included in the class). An extended class may
5741	need to insert specific matching or non-matching code for wide characters.
5742	*/
5743
5744	should_flip_negation = match_all_or_no_wide_chars = FALSE;
5745
5746	/ Extended class (xclass) will be used when characters > 255*
5747	might match. /*
5748
5749	#ifdef SUPPORT_WIDE_CHARS
5750	xclass = FALSE;
5751	class_uchardata = code + LINK_SIZE + `2`; / For XCLASS items /
5752	class_uchardata_base = class_uchardata; / Save the start /
5753	#endif
5754
5755	/ For optimization purposes, we track some properties of the class:*
5756	class_has_8bitchar will be non-zero if the class contains at least one
5757	character with a code point less than 256; xclass_has_prop will be TRUE if
5758	Unicode property checks are present in the class. /*
5759
5760	class_has_8bitchar = `0`;
5761	#ifdef SUPPORT_WIDE_CHARS
5762	xclass_has_prop = FALSE;
5763	#endif
5764
5765	/ Initialize the 256-bit (32-byte) bit map to all zeros. We build the map*
5766	in a temporary bit of memory, in case the class contains fewer than two
5767	8-bit characters because in that case the compiled code doesn't use the bit
5768	map. /*
5769
5770	memset(classbits, `0`, `32` * sizeof(uint8_t));
5771
5772	/ Process items until META_CLASS_END is reached. /
5773
5774	while ((meta = *(++pptr)) != META_CLASS_END)
5775	{
5776	/ Handle POSIX classes such as [:alpha:] etc. /
5777
5778	if (meta == META_POSIX \|\| meta == META_POSIX_NEG)
5779	{
5780	BOOL local_negate = (meta == META_POSIX_NEG);
5781	int posix_class = *(++pptr);
5782	int taboffset, tabopt;
5783	uint8_t pbits[`32`];
5784
5785	should_flip_negation = local_negate; / Note negative special /
5786
5787	/ If matching is caseless, upper and lower are converted to alpha.*
5788	This relies on the fact that the class table starts with alpha,
5789	lower, upper as the first 3 entries. /*
5790
5791	if ((options & PCRE2_CASELESS) != `0` && posix_class <= `2`)
5792	posix_class = `0`;
5793
5794	/ When PCRE2_UCP is set, some of the POSIX classes are converted to*
5795	different escape sequences that use Unicode properties \p or \P.
5796	Others that are not available via \p or \P have to generate
5797	XCL_PROP/XCL_NOTPROP directly, which is done here. /*
5798
5799	#ifdef SUPPORT_UNICODE
5800	if ((options & PCRE2_UCP) != `0`) switch(posix_class)
5801	{
5802	case PC_GRAPH:
5803	case PC_PRINT:
5804	case PC_PUNCT:
5805	*class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5806	*class_uchardata++ = (PCRE2_UCHAR)
5807	((posix_class == PC_GRAPH)? PT_PXGRAPH :
5808	(posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
5809	*class_uchardata++ = `0`;
5810	xclass_has_prop = TRUE;
5811	goto CONTINUE_CLASS;
5812
5813	/ For the other POSIX classes (ascii, xdigit) we are going to*
5814	fall through to the non-UCP case and build a bit map for
5815	characters with code points less than 256. However, if we are in
5816	a negated POSIX class, characters with code points greater than
5817	255 must either all match or all not match, depending on whether
5818	the whole class is not or is negated. For example, for
5819	[[:^ascii:]... they must all match, whereas for [^[:^xdigit:]...
5820	they must not.
5821
5822	In the special case where there are no xclass items, this is
5823	automatically handled by the use of OP_CLASS or OP_NCLASS, but an
5824	explicit range is needed for OP_XCLASS. Setting a flag here
5825	causes the range to be generated later when it is known that
5826	OP_XCLASS is required. In the 8-bit library this is relevant only in
5827	utf mode, since no wide characters can exist otherwise. /*
5828
5829	default:
5830	#if PCRE2_CODE_UNIT_WIDTH == 8
5831	if (utf)
5832	#endif
5833	match_all_or_no_wide_chars \|= local_negate;
5834	break;
5835	}
5836	#endif /* SUPPORT_UNICODE */
5837
5838	/ In the non-UCP case, or when UCP makes no difference, we build the*
5839	bit map for the POSIX class in a chunk of local store because we may
5840	be adding and subtracting from it, and we don't want to subtract bits
5841	that may be in the main map already. At the end we or the result into
5842	the bit map that is being built. /*
5843
5844	posix_class *= `3`;
5845
5846	/ Copy in the first table (always present) /
5847
5848	memcpy(pbits, cbits + posix_class_maps[posix_class],
5849	`32` * sizeof(uint8_t));
5850
5851	/ If there is a second table, add or remove it as required. /
5852
5853	taboffset = posix_class_maps[posix_class + `1`];
5854	tabopt = posix_class_maps[posix_class + `2`];
5855
5856	if (taboffset >= `0`)
5857	{
5858	if (tabopt >= `0`)
5859	for (int i = `0`; i < `32`; i++) pbits[i] \|= cbits[(int)i + taboffset];
5860	else
5861	for (int i = `0`; i < `32`; i++) pbits[i] &= ~cbits[(int)i + taboffset];
5862	}
5863
5864	/ Now see if we need to remove any special characters. An option*
5865	value of 1 removes vertical space and 2 removes underscore. /*
5866
5867	if (tabopt < `0`) tabopt = -tabopt;
5868	if (tabopt == `1`) pbits[`1`] &= ~`0x3c`;
5869	else if (tabopt == `2`) pbits[`11`] &= `0x7f`;
5870
5871	/ Add the POSIX table or its complement into the main table that is*
5872	being built and we are done. /*
5873
5874	if (local_negate)
5875	for (int i = `0`; i < `32`; i++) classbits[i] \|= (uint8_t)(~pbits[i]);
5876	else
5877	for (int i = `0`; i < `32`; i++) classbits[i] \|= pbits[i];
5878
5879	/ Every class contains at least one < 256 character. /
5880
5881	class_has_8bitchar = `1`;
5882	goto CONTINUE_CLASS; / End of POSIX handling /
5883	}
5884
5885	/ Other than POSIX classes, the only items we should encounter are*
5886	\d-type escapes and literal characters (possibly as ranges). /*
5887
5888	if (meta == META_BIGVALUE)
5889	{
5890	meta = *(++pptr);
5891	goto CLASS_LITERAL;
5892	}
5893
5894	/ Any other non-literal must be an escape /
5895
5896	if (meta >= META_END)
5897	{
5898	if (META_CODE(meta) != META_ESCAPE)
5899	{
5900	#ifdef DEBUG_SHOW_PARSED
5901	fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x "
5902	"in character class\n", meta);
5903	#endif
5904	errorcodeptr = ERR89; /* Internal error - unrecognized. /
5905	return `0`;
5906	}
5907	escape = META_DATA(meta);
5908
5909	/ Every class contains at least one < 256 character. /
5910
5911	class_has_8bitchar++;
5912
5913	switch(escape)
5914	{
5915	case ESC_d:
5916	for (int i = `0`; i < `32`; i++) classbits[i] \|= cbits[i+cbit_digit];
5917	break;
5918
5919	case ESC_D:
5920	should_flip_negation = TRUE;
5921	for (int i = `0`; i < `32`; i++)
5922	classbits[i] \|= (uint8_t)(~cbits[i+cbit_digit]);
5923	break;
5924
5925	case ESC_w:
5926	for (int i = `0`; i < `32`; i++) classbits[i] \|= cbits[i+cbit_word];
5927	break;
5928
5929	case ESC_W:
5930	should_flip_negation = TRUE;
5931	for (int i = `0`; i < `32`; i++)
5932	classbits[i] \|= (uint8_t)(~cbits[i+cbit_word]);
5933	break;
5934
5935	/ Perl 5.004 onwards omitted VT from \s, but restored it at Perl*
5936	5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5937	previously set by something earlier in the character class.
5938	Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5939	we could just adjust the appropriate bit. From PCRE 8.34 we no
5940	longer treat \s and \S specially. /*
5941
5942	case ESC_s:
5943	for (int i = `0`; i < `32`; i++) classbits[i] \|= cbits[i+cbit_space];
5944	break;
5945
5946	case ESC_S:
5947	should_flip_negation = TRUE;
5948	for (int i = `0`; i < `32`; i++)
5949	classbits[i] \|= (uint8_t)(~cbits[i+cbit_space]);
5950	break;
5951
5952	/* When adding the horizontal or vertical space lists to a class, or
5953	their complements, disable PCRE2_CASELESS, because it just wastes
5954	time, and in the "not-x" UTF cases can create unwanted duplicates in
5955	the XCLASS list (provoked by characters that have more than one other
5956	case and by both cases being in the same "not-x" sublist). */
5957
5958	case ESC_h:
5959	(void)add_list_to_class(classbits, &class_uchardata,
5960	options & ~PCRE2_CASELESS, cb, PRIV(hspace_list), NOTACHAR);
5961	break;
5962
5963	case ESC_H:
5964	(void)add_not_list_to_class(classbits, &class_uchardata,
5965	options & ~PCRE2_CASELESS, cb, PRIV(hspace_list));
5966	break;
5967
5968	case ESC_v:
5969	(void)add_list_to_class(classbits, &class_uchardata,
5970	options & ~PCRE2_CASELESS, cb, PRIV(vspace_list), NOTACHAR);
5971	break;
5972
5973	case ESC_V:
5974	(void)add_not_list_to_class(classbits, &class_uchardata,
5975	options & ~PCRE2_CASELESS, cb, PRIV(vspace_list));
5976	break;
5977
5978	/ If Unicode is not supported, \P and \p are not allowed and are*
5979	faulted at parse time, so will never appear here. /*
5980
5981	#ifdef SUPPORT_UNICODE
5982	case ESC_p:
5983	case ESC_P:
5984	{
5985	uint32_t ptype = *(++pptr) >> `16`;
5986	uint32_t pdata = *pptr & `0xffff`;
5987	*class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
5988	*class_uchardata++ = ptype;
5989	*class_uchardata++ = pdata;
5990	xclass_has_prop = TRUE;
5991	class_has_8bitchar--; / Undo! /
5992	}
5993	break;
5994	#endif
5995	}
5996
5997	goto CONTINUE_CLASS;
5998	} / End handling \d-type escapes /
5999
6000	/ A literal character may be followed by a range meta. At parse time*
6001	there are checks for out-of-order characters, for ranges where the two
6002	characters are equal, and for hyphens that cannot indicate a range. At
6003	this point, therefore, no checking is needed. /*
6004
6005	else
6006	{
6007	uint32_t c, d;
6008
6009	CLASS_LITERAL:
6010	c = d = meta;
6011
6012	/ Remember if \r or \n were explicitly used /
6013
6014	if (c == CHAR_CR \|\| c == CHAR_NL) cb->external_flags \|= PCRE2_HASCRORLF;
6015
6016	/ Process a character range /
6017
6018	if (pptr[`1`] == META_RANGE_LITERAL \|\| pptr[`1`] == META_RANGE_ESCAPED)
6019	{
6020	#ifdef EBCDIC
6021	BOOL range_is_literal = (pptr[`1`] == META_RANGE_LITERAL);
6022	#endif
6023	pptr += `2`;
6024	d = *pptr;
6025	if (d == META_BIGVALUE) d = *(++pptr);
6026
6027	/ Remember an explicit \r or \n, and add the range to the class. /
6028
6029	if (d == CHAR_CR \|\| d == CHAR_NL) cb->external_flags \|= PCRE2_HASCRORLF;
6030
6031	/ In an EBCDIC environment, Perl treats alphabetic ranges specially*
6032	because there are holes in the encoding, and simply using the range
6033	A-Z (for example) would include the characters in the holes. This
6034	applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. /*
6035
6036	#ifdef EBCDIC
6037	if (range_is_literal &&
6038	(cb->ctypes[c] & ctype_letter) != `0` &&
6039	(cb->ctypes[d] & ctype_letter) != `0` &&
6040	(c <= CHAR_z) == (d <= CHAR_z))
6041	{
6042	uint32_t uc = (d <= CHAR_z)? `0` : `64`;
6043	uint32_t C = c - uc;
6044	uint32_t D = d - uc;
6045
6046	if (C <= CHAR_i)
6047	{
6048	class_has_8bitchar +=
6049	add_to_class(classbits, &class_uchardata, options, cb, C + uc,
6050	((D < CHAR_i)? D : CHAR_i) + uc);
6051	C = CHAR_j;
6052	}
6053
6054	if (C <= D && C <= CHAR_r)
6055	{
6056	class_has_8bitchar +=
6057	add_to_class(classbits, &class_uchardata, options, cb, C + uc,
6058	((D < CHAR_r)? D : CHAR_r) + uc);
6059	C = CHAR_s;
6060	}
6061
6062	if (C <= D)
6063	{
6064	class_has_8bitchar +=
6065	add_to_class(classbits, &class_uchardata, options, cb, C + uc,
6066	D + uc);
6067	}
6068	}
6069	else
6070	#endif
6071	/ Not an EBCDIC special range /
6072
6073	class_has_8bitchar +=
6074	add_to_class(classbits, &class_uchardata, options, cb, c, d);
6075	goto CONTINUE_CLASS; / Go get the next char in the class /
6076	} / End of range handling /
6077
6078
6079	/ Handle a single character. /
6080
6081	class_has_8bitchar +=
6082	add_to_class(classbits, &class_uchardata, options, cb, meta, meta);
6083	}
6084
6085	/ Continue to the next item in the class. /
6086
6087	CONTINUE_CLASS:
6088
6089	#ifdef SUPPORT_WIDE_CHARS
6090	/ If any wide characters or Unicode properties have been encountered,*
6091	set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
6092	of the extra data and reset the pointer. This is so that very large
6093	classes that contain a zillion wide characters or Unicode property tests
6094	do not overwrite the workspace (which is on the stack). /*
6095
6096	if (class_uchardata > class_uchardata_base)
6097	{
6098	xclass = TRUE;
6099	if (lengthptr != NULL)
6100	{
6101	*lengthptr += class_uchardata - class_uchardata_base;
6102	class_uchardata = class_uchardata_base;
6103	}
6104	}
6105	#endif
6106
6107	continue; / Needed to avoid error when not supporting wide chars /
6108	} / End of main class-processing loop /
6109
6110	/ If this class is the first thing in the branch, there can be no first*
6111	char setting, whatever the repeat count. Any reqcu setting must remain
6112	unchanged after any kind of repeat. /*
6113
6114	if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6115	zerofirstcu = firstcu;
6116	zerofirstcuflags = firstcuflags;
6117	zeroreqcu = reqcu;
6118	zeroreqcuflags = reqcuflags;
6119
6120	/ If there are characters with values > 255, or Unicode property settings*
6121	(\p or \P), we have to compile an extended class, with its own opcode,
6122	unless there were no property settings and there was a negated special such
6123	as \S in the class, and PCRE2_UCP is not set, because in that case all
6124	characters > 255 are in or not in the class, so any that were explicitly
6125	given as well can be ignored.
6126
6127	In the UCP case, if certain negated POSIX classes ([:^ascii:] or
6128	[:^xdigit:]) were present in a class, we either have to match or not match
6129	all wide characters (depending on whether the whole class is or is not
6130	negated). This requirement is indicated by match_all_or_no_wide_chars being
6131	true. We do this by including an explicit range, which works in both cases.
6132	This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
6133	cannot be any wide characters in 8-bit non-UTF mode.
6134
6135	When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
6136	class where \S etc is present without PCRE2_UCP, causing an extended class
6137	to be compiled, we make sure that all characters > 255 are included by
6138	forcing match_all_or_no_wide_chars to be true.
6139
6140	If, when generating an xclass, there are no characters < 256, we can omit
6141	the bitmap in the actual compiled code. /*
6142
6143	#ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */
6144	if (xclass && (
6145	#ifdef SUPPORT_UNICODE
6146	(options & PCRE2_UCP) != `0` \|\|
6147	#endif
6148	xclass_has_prop \|\| !should_flip_negation))
6149	{
6150	if (match_all_or_no_wide_chars \|\| (
6151	#if PCRE2_CODE_UNIT_WIDTH == 8
6152	utf &&
6153	#endif
6154	should_flip_negation && !negate_class && (options & PCRE2_UCP) == `0`))
6155	{
6156	*class_uchardata++ = XCL_RANGE;
6157	if (utf) / Will always be utf in the 8-bit library /
6158	{
6159	class_uchardata += PRIV(ord2utf)(`0x100`, class_uchardata);
6160	class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
6161	}
6162	else / Can only happen for the 16-bit & 32-bit libraries /
6163	{
6164	#if PCRE2_CODE_UNIT_WIDTH == 16
6165	*class_uchardata++ = `0x100`;
6166	*class_uchardata++ = `0xffffu`;
6167	#elif PCRE2_CODE_UNIT_WIDTH == 32
6168	*class_uchardata++ = `0x100`;
6169	*class_uchardata++ = `0xffffffffu`;
6170	#endif
6171	}
6172	}
6173	class_uchardata++ = XCL_END; /* Marks the end of extra data /
6174	*code++ = OP_XCLASS;
6175	code += LINK_SIZE;
6176	*code = negate_class? XCL_NOT:`0`;
6177	if (xclass_has_prop) *code \|= XCL_HASPROP;
6178
6179	/ If the map is required, move up the extra data to make room for it;*
6180	otherwise just move the code pointer to the end of the extra data. /*
6181
6182	if (class_has_8bitchar > `0`)
6183	{
6184	*code++ \|= XCL_MAP;
6185	(void)memmove(code + (`32` / sizeof(PCRE2_UCHAR)), code,
6186	CU2BYTES(class_uchardata - code));
6187	if (negate_class && !xclass_has_prop)
6188	{
6189	/ Using 255 ^ instead of ~ avoids clang sanitize warning. /
6190	for (int i = `0`; i < `32`; i++) classbits[i] = `255` ^ classbits[i];
6191	}
6192	memcpy(code, classbits, `32`);
6193	code = class_uchardata + (`32` / sizeof(PCRE2_UCHAR));
6194	}
6195	else code = class_uchardata;
6196
6197	/ Now fill in the complete length of the item /
6198
6199	PUT(previous, `1`, (int)(code - previous));
6200	break; / End of class handling /
6201	}
6202	#endif /* SUPPORT_WIDE_CHARS */
6203
6204	/ If there are no characters > 255, or they are all to be included or*
6205	excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
6206	whole class was negated and whether there were negative specials such as \S
6207	(non-UCP) in the class. Then copy the 32-byte map into the code vector,
6208	negating it if necessary. /*
6209
6210	*code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
6211	if (lengthptr == NULL) / Save time in the pre-compile phase /
6212	{
6213	if (negate_class)
6214	{
6215	/ Using 255 ^ instead of ~ avoids clang sanitize warning. /
6216	for (int i = `0`; i < `32`; i++) classbits[i] = `255` ^ classbits[i];
6217	}
6218	memcpy(code, classbits, `32`);
6219	}
6220	code += `32` / sizeof(PCRE2_UCHAR);
6221	break; / End of class processing /
6222
6223
6224	/ ===================================================================/
6225	/* Deal with (*VERB)s. */
6226
6227	/* Check for open captures before ACCEPT and close those that are within
6228	the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6229	assertion. In the first pass, just accumulate the length required;
6230	otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6231	workspace overflow. Do not set firstcu after *ACCEPT. */
6232
6233	case META_ACCEPT:
6234	cb->had_accept = had_accept = TRUE;
6235	for (oc = cb->open_caps;
6236	oc != NULL && oc->assert_depth >= cb->assert_depth;
6237	oc = oc->next)
6238	{
6239	if (lengthptr != NULL)
6240	{
6241	*lengthptr += CU2BYTES(`1`) + IMM2_SIZE;
6242	}
6243	else
6244	{
6245	*code++ = OP_CLOSE;
6246	PUT2INC(code, `0`, oc->number);
6247	}
6248	}
6249	*code++ = (cb->assert_depth > `0`)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6250	if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6251	break;
6252
6253	case META_PRUNE:
6254	case META_SKIP:
6255	cb->had_pruneorskip = TRUE;
6256	/ Fall through /
6257	case META_COMMIT:
6258	case META_FAIL:
6259	*code++ = verbops[(meta - META_MARK) >> `16`];
6260	break;
6261
6262	case META_THEN:
6263	cb->external_flags \|= PCRE2_HASTHEN;
6264	*code++ = OP_THEN;
6265	break;
6266
6267	/ Handle verbs with arguments. Arguments can be very long, especially in*
6268	16- and 32-bit modes, and can overflow the workspace in the first pass.
6269	However, the argument length is constrained to be small enough to fit in
6270	one code unit. This check happens in parse_regex(). In the first pass,
6271	instead of putting the argument into memory, we just update the length
6272	counter and set up an empty argument. /*
6273
6274	case META_THEN_ARG:
6275	cb->external_flags \|= PCRE2_HASTHEN;
6276	goto VERB_ARG;
6277
6278	case META_PRUNE_ARG:
6279	case META_SKIP_ARG:
6280	cb->had_pruneorskip = TRUE;
6281	/ Fall through /
6282	case META_MARK:
6283	case META_COMMIT_ARG:
6284	VERB_ARG:
6285	*code++ = verbops[(meta - META_MARK) >> `16`];
6286	/ The length is in characters. /
6287	verbarglen = *(++pptr);
6288	verbculen = `0`;
6289	tempcode = code++;
6290	for (int i = `0`; i < (int)verbarglen; i++)
6291	{
6292	meta = *(++pptr);
6293	#ifdef SUPPORT_UNICODE
6294	if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6295	#endif
6296	{
6297	mclength = `1`;
6298	mcbuffer[`0`] = meta;
6299	}
6300	if (lengthptr != NULL) lengthptr += mclength; else*
6301	{
6302	memcpy(code, mcbuffer, CU2BYTES(mclength));
6303	code += mclength;
6304	verbculen += mclength;
6305	}
6306	}
6307
6308	tempcode = verbculen; /* Fill in the code unit length /
6309	code++ = `0`; /* Terminating zero /
6310	break;
6311
6312
6313	/ ===================================================================/
6314	/ Handle options change. The new setting must be passed back for use in*
6315	subsequent branches. Reset the greedy defaults and the case value for
6316	firstcu and reqcu. /*
6317
6318	case META_OPTIONS:
6319	optionsptr = options = (++pptr);
6320	greedy_default = ((options & PCRE2_UNGREEDY) != `0`);
6321	greedy_non_default = greedy_default ^ `1`;
6322	req_caseopt = ((options & PCRE2_CASELESS) != `0`)? REQ_CASELESS : `0`;
6323	break;
6324
6325
6326	/ ===================================================================/
6327	/ Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous*
6328	because it could be a numerical check on recursion, or a name check on a
6329	group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6330	we can handle it either way. We first try for a name; if not found, process
6331	the number. /*
6332
6333	case META_COND_RNUMBER: / (?(Rdigits) /
6334	case META_COND_NAME: / (?(name) or (?'name') or ?(<name>) /
6335	case META_COND_RNAME: / (?(R&name) - test for recursion /
6336	bravalue = OP_COND;
6337	{
6338	int count, index;
6339	unsigned int i;
6340	PCRE2_SPTR name;
6341	named_group *ng = cb->named_groups;
6342	uint32_t length = *(++pptr);
6343
6344	GETPLUSOFFSET(offset, pptr);
6345	name = cb->start_pattern + offset;
6346
6347	/ In the first pass, the names generated in the pre-pass are available,*
6348	but the main name table has not yet been created. Scan the list of names
6349	generated in the pre-pass in order to get a number and whether or not
6350	this name is duplicated. If it is not duplicated, we can handle it as a
6351	numerical group. /*
6352
6353	for (i = `0`; i < cb->names_found; i++, ng++)
6354	{
6355	if (length == ng->length &&
6356	PRIV(strncmp)(name, ng->name, length) == `0`)
6357	{
6358	if (!ng->isdup)
6359	{
6360	code[`1`+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6361	PUT2(code, `2`+LINK_SIZE, ng->number);
6362	if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6363	skipunits = `1`+IMM2_SIZE;
6364	goto GROUP_PROCESS_NOTE_EMPTY;
6365	}
6366	break; / Found a duplicated name /
6367	}
6368	}
6369
6370	/ If the name was not found we have a bad reference, unless we are*
6371	dealing with R<digits>, which is treated as a recursion test by number.
6372	*/
6373
6374	if (i >= cb->names_found)
6375	{
6376	groupnumber = `0`;
6377	if (meta == META_COND_RNUMBER)
6378	{
6379	for (i = `1`; i < length; i++)
6380	{
6381	groupnumber = groupnumber * `10` + name[i] - CHAR_0;
6382	if (groupnumber > MAX_GROUP_NUMBER)
6383	{
6384	*errorcodeptr = ERR61;
6385	cb->erroroffset = offset + i;
6386	return `0`;
6387	}
6388	}
6389	}
6390
6391	if (meta != META_COND_RNUMBER \|\| groupnumber > cb->bracount)
6392	{
6393	*errorcodeptr = ERR15;
6394	cb->erroroffset = offset;
6395	return `0`;
6396	}
6397
6398	/ (?Rdigits) treated as a recursion reference by number. A value of*
6399	zero (which is the result of both (?R) and (?R0)) means "any", and is
6400	translated into RREF_ANY (which is 0xffff). /*
6401
6402	if (groupnumber == `0`) groupnumber = RREF_ANY;
6403	code[`1`+LINK_SIZE] = OP_RREF;
6404	PUT2(code, `2`+LINK_SIZE, groupnumber);
6405	skipunits = `1`+IMM2_SIZE;
6406	goto GROUP_PROCESS_NOTE_EMPTY;
6407	}
6408
6409	/ A duplicated name was found. Note that if an R<digits> name is found*
6410	(META_COND_RNUMBER), it is a reference test, not a recursion test. /*
6411
6412	code[`1`+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6413
6414	/ We have a duplicated name. In the compile pass we have to search the*
6415	main table in order to get the index and count values. /*
6416
6417	count = `0`; / Values for first pass (avoids compiler warning) /
6418	index = `0`;
6419	if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6420	&count, errorcodeptr, cb)) return `0`;
6421
6422	/ Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and*
6423	insert appropriate data values. /*
6424
6425	code[`1`+LINK_SIZE]++;
6426	skipunits = `1`+`2`*IMM2_SIZE;
6427	PUT2(code, `2`+LINK_SIZE, index);
6428	PUT2(code, `2`+LINK_SIZE+IMM2_SIZE, count);
6429	}
6430	goto GROUP_PROCESS_NOTE_EMPTY;
6431
6432	/ The DEFINE condition is always false. Its internal groups may never*
6433	be called, so matched_char must remain false, hence the jump to
6434	GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. /*
6435
6436	case META_COND_DEFINE:
6437	bravalue = OP_COND;
6438	GETPLUSOFFSET(offset, pptr);
6439	code[`1`+LINK_SIZE] = OP_DEFINE;
6440	skipunits = `1`;
6441	goto GROUP_PROCESS;
6442
6443	/ Conditional test of a group's being set. /
6444
6445	case META_COND_NUMBER:
6446	bravalue = OP_COND;
6447	GETPLUSOFFSET(offset, pptr);
6448	groupnumber = *(++pptr);
6449	if (groupnumber > cb->bracount)
6450	{
6451	*errorcodeptr = ERR15;
6452	cb->erroroffset = offset;
6453	return `0`;
6454	}
6455	if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6456	offset -= `2`; / Point at initial ( for too many branches error /
6457	code[`1`+LINK_SIZE] = OP_CREF;
6458	skipunits = `1`+IMM2_SIZE;
6459	PUT2(code, `2`+LINK_SIZE, groupnumber);
6460	goto GROUP_PROCESS_NOTE_EMPTY;
6461
6462	/ Test for the PCRE2 version. /
6463
6464	case META_COND_VERSION:
6465	bravalue = OP_COND;
6466	if (pptr[`1`] > `0`)
6467	code[`1`+LINK_SIZE] = ((PCRE2_MAJOR > pptr[`2`]) \|\|
6468	(PCRE2_MAJOR == pptr[`2`] && PCRE2_MINOR >= pptr[`3`]))?
6469	OP_TRUE : OP_FALSE;
6470	else
6471	code[`1`+LINK_SIZE] = (PCRE2_MAJOR == pptr[`2`] && PCRE2_MINOR == pptr[`3`])?
6472	OP_TRUE : OP_FALSE;
6473	skipunits = `1`;
6474	pptr += `3`;
6475	goto GROUP_PROCESS_NOTE_EMPTY;
6476
6477	/ The condition is an assertion, possibly preceded by a callout. /
6478
6479	case META_COND_ASSERT:
6480	bravalue = OP_COND;
6481	goto GROUP_PROCESS_NOTE_EMPTY;
6482
6483
6484	/ ===================================================================/
6485	/ Handle all kinds of nested bracketed groups. The non-capturing,*
6486	non-conditional cases are here; others come to GROUP_PROCESS via goto. /*
6487
6488	case META_LOOKAHEAD:
6489	bravalue = OP_ASSERT;
6490	cb->assert_depth += `1`;
6491	goto GROUP_PROCESS;
6492
6493	case META_LOOKAHEAD_NA:
6494	bravalue = OP_ASSERT_NA;
6495	cb->assert_depth += `1`;
6496	goto GROUP_PROCESS;
6497
6498	/* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6499	thing to do, but Perl allows all assertions to be quantified, and when
6500	they contain capturing parentheses there may be a potential use for
6501	this feature. Not that that applies to a quantified (?!) but we allow
6502	it for uniformity. */
6503
6504	case META_LOOKAHEADNOT:
6505	if (pptr[`1`] == META_KET &&
6506	(pptr[`2`] < META_ASTERISK \|\| pptr[`2`] > META_MINMAX_QUERY))
6507	{
6508	*code++ = OP_FAIL;
6509	pptr++;
6510	}
6511	else
6512	{
6513	bravalue = OP_ASSERT_NOT;
6514	cb->assert_depth += `1`;
6515	goto GROUP_PROCESS;
6516	}
6517	break;
6518
6519	case META_LOOKBEHIND:
6520	bravalue = OP_ASSERTBACK;
6521	cb->assert_depth += `1`;
6522	goto GROUP_PROCESS;
6523
6524	case META_LOOKBEHINDNOT:
6525	bravalue = OP_ASSERTBACK_NOT;
6526	cb->assert_depth += `1`;
6527	goto GROUP_PROCESS;
6528
6529	case META_LOOKBEHIND_NA:
6530	bravalue = OP_ASSERTBACK_NA;
6531	cb->assert_depth += `1`;
6532	goto GROUP_PROCESS;
6533
6534	case META_ATOMIC:
6535	bravalue = OP_ONCE;
6536	goto GROUP_PROCESS_NOTE_EMPTY;
6537
6538	case META_SCRIPT_RUN:
6539	bravalue = OP_SCRIPT_RUN;
6540	goto GROUP_PROCESS_NOTE_EMPTY;
6541
6542	case META_NOCAPTURE:
6543	bravalue = OP_BRA;
6544	/ Fall through /
6545
6546	/ Process nested bracketed regex. The nesting depth is maintained for the*
6547	benefit of the stackguard function. The test for too deep nesting is now
6548	done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6549	others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6550	note of whether or not they may match an empty string. /*
6551
6552	GROUP_PROCESS_NOTE_EMPTY:
6553	note_group_empty = TRUE;
6554
6555	GROUP_PROCESS:
6556	cb->parens_depth += `1`;
6557	*code = bravalue;
6558	pptr++;
6559	tempcode = code;
6560	tempreqvary = cb->req_varyopt; / Save value before group /
6561	length_prevgroup = `0`; / Initialize for pre-compile phase /
6562
6563	if ((group_return =
6564	compile_regex(
6565	options, / The option state /
6566	&tempcode, / Where to put code (updated) /
6567	&pptr, / Input pointer (updated) /
6568	errorcodeptr, / Where to put an error message /
6569	skipunits, / Skip over bracket number /
6570	&subfirstcu, / For possible first char /
6571	&subfirstcuflags,
6572	&subreqcu, / For possible last char /
6573	&subreqcuflags,
6574	bcptr, / Current branch chain /
6575	cb, / Compile data block /
6576	(lengthptr == NULL)? NULL : / Actual compile phase /
6577	&length_prevgroup / Pre-compile phase /
6578	)) == `0`)
6579	return `0`; / Error /
6580
6581	cb->parens_depth -= `1`;
6582
6583	/ If that was a non-conditional significant group (not an assertion, not a*
6584	DEFINE) that matches at least one character, then the current item matches
6585	a character. Conditionals are handled below. /*
6586
6587	if (note_group_empty && bravalue != OP_COND && group_return > `0`)
6588	matched_char = TRUE;
6589
6590	/ If we've just compiled an assertion, pop the assert depth. /
6591
6592	if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NA)
6593	cb->assert_depth -= `1`;
6594
6595	/ At the end of compiling, code is still pointing to the start of the*
6596	group, while tempcode has been updated to point past the end of the group.
6597	The parsed pattern pointer (pptr) is on the closing META_KET.
6598
6599	If this is a conditional bracket, check that there are no more than
6600	two branches in the group, or just one if it's a DEFINE group. We do this
6601	in the real compile phase, not in the pre-pass, where the whole group may
6602	not be available. /*
6603
6604	if (bravalue == OP_COND && lengthptr == NULL)
6605	{
6606	PCRE2_UCHAR *tc = code;
6607	int condcount = `0`;
6608
6609	do {
6610	condcount++;
6611	tc += GET(tc,`1`);
6612	}
6613	while (*tc != OP_KET);
6614
6615	/ A DEFINE group is never obeyed inline (the "condition" is always*
6616	false). It must have only one branch. Having checked this, change the
6617	opcode to OP_FALSE. /*
6618
6619	if (code[LINK_SIZE+`1`] == OP_DEFINE)
6620	{
6621	if (condcount > `1`)
6622	{
6623	cb->erroroffset = offset;
6624	*errorcodeptr = ERR54;
6625	return `0`;
6626	}
6627	code[LINK_SIZE+`1`] = OP_FALSE;
6628	bravalue = OP_DEFINE; / A flag to suppress char handling below /
6629	}
6630
6631	/ A "normal" conditional group. If there is just one branch, we must not*
6632	make use of its firstcu or reqcu, because this is equivalent to an
6633	empty second branch. Also, it may match an empty string. If there are two
6634	branches, this item must match a character if the group must. /*
6635
6636	else
6637	{
6638	if (condcount > `2`)
6639	{
6640	cb->erroroffset = offset;
6641	*errorcodeptr = ERR27;
6642	return `0`;
6643	}
6644	if (condcount == `1`) subfirstcuflags = subreqcuflags = REQ_NONE;
6645	else if (group_return > `0`) matched_char = TRUE;
6646	}
6647	}
6648
6649	/ In the pre-compile phase, update the length by the length of the group,*
6650	less the brackets at either end. Then reduce the compiled code to just a
6651	set of non-capturing brackets so that it doesn't use much memory if it is
6652	duplicated by a quantifier./*
6653
6654	if (lengthptr != NULL)
6655	{
6656	if (OFLOW_MAX - lengthptr < length_prevgroup - `2` - `2`LINK_SIZE)
6657	{
6658	*errorcodeptr = ERR20;
6659	return `0`;
6660	}
6661	lengthptr += length_prevgroup - `2` - `2`LINK_SIZE;
6662	code++; / This already contains bravalue /
6663	PUTINC(code, `0`, `1` + LINK_SIZE);
6664	*code++ = OP_KET;
6665	PUTINC(code, `0`, `1` + LINK_SIZE);
6666	break; / No need to waste time with special character handling /
6667	}
6668
6669	/ Otherwise update the main code pointer to the end of the group. /
6670
6671	code = tempcode;
6672
6673	/ For a DEFINE group, required and first character settings are not*
6674	relevant. /*
6675
6676	if (bravalue == OP_DEFINE) break;
6677
6678	/ Handle updating of the required and first code units for other types of*
6679	group. Update for normal brackets of all kinds, and conditions with two
6680	branches (see code above). If the bracket is followed by a quantifier with
6681	zero repeat, we have to back off. Hence the definition of zeroreqcu and
6682	zerofirstcu outside the main loop so that they can be accessed for the back
6683	off. /*
6684
6685	zeroreqcu = reqcu;
6686	zeroreqcuflags = reqcuflags;
6687	zerofirstcu = firstcu;
6688	zerofirstcuflags = firstcuflags;
6689	groupsetfirstcu = FALSE;
6690
6691	if (bravalue >= OP_ONCE) / Not an assertion /
6692	{
6693	/ If we have not yet set a firstcu in this branch, take it from the*
6694	subpattern, remembering that it was set here so that a repeat of more
6695	than one can replicate it as reqcu if necessary. If the subpattern has
6696	no firstcu, set "none" for the whole branch. In both cases, a zero
6697	repeat forces firstcu to "none". /*
6698
6699	if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6700	{
6701	if (subfirstcuflags < REQ_NONE)
6702	{
6703	firstcu = subfirstcu;
6704	firstcuflags = subfirstcuflags;
6705	groupsetfirstcu = TRUE;
6706	}
6707	else firstcuflags = REQ_NONE;
6708	zerofirstcuflags = REQ_NONE;
6709	}
6710
6711	/ If firstcu was previously set, convert the subpattern's firstcu*
6712	into reqcu if there wasn't one, using the vary flag that was in
6713	existence beforehand. /*
6714
6715	else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
6716	{
6717	subreqcu = subfirstcu;
6718	subreqcuflags = subfirstcuflags \| tempreqvary;
6719	}
6720
6721	/ If the subpattern set a required code unit (or set a first code unit*
6722	that isn't really the first code unit - see above), set it. /*
6723
6724	if (subreqcuflags < REQ_NONE)
6725	{
6726	reqcu = subreqcu;
6727	reqcuflags = subreqcuflags;
6728	}
6729	}
6730
6731	/ For a forward assertion, we take the reqcu, if set, provided that the*
6732	group has also set a firstcu. This can be helpful if the pattern that
6733	follows the assertion doesn't set a different char. For example, it's
6734	useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
6735	because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
6736	the "real" "a" would then become a reqcu instead of a firstcu. This is
6737	overcome by a scan at the end if there's no firstcu, looking for an
6738	asserted first char. A similar effect for patterns like /(?=.X)X$/ means*
6739	we must only take the reqcu when the group also set a firstcu. Otherwise,
6740	in that example, 'X' ends up set for both. /*
6741
6742	else if ((bravalue == OP_ASSERT \|\| bravalue == OP_ASSERT_NA) &&
6743	subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
6744	{
6745	reqcu = subreqcu;
6746	reqcuflags = subreqcuflags;
6747	}
6748
6749	break; / End of nested group handling /
6750
6751
6752	/ ===================================================================/
6753	/ Handle named backreferences and recursions. /
6754
6755	case META_BACKREF_BYNAME:
6756	case META_RECURSE_BYNAME:
6757	{
6758	int count, index;
6759	PCRE2_SPTR name;
6760	BOOL is_dupname = FALSE;
6761	named_group *ng = cb->named_groups;
6762	uint32_t length = *(++pptr);
6763
6764	GETPLUSOFFSET(offset, pptr);
6765	name = cb->start_pattern + offset;
6766
6767	/ In the first pass, the names generated in the pre-pass are available,*
6768	but the main name table has not yet been created. Scan the list of names
6769	generated in the pre-pass in order to get a number and whether or not
6770	this name is duplicated. /*
6771
6772	groupnumber = `0`;
6773	for (unsigned int i = `0`; i < cb->names_found; i++, ng++)
6774	{
6775	if (length == ng->length &&
6776	PRIV(strncmp)(name, ng->name, length) == `0`)
6777	{
6778	is_dupname = ng->isdup;
6779	groupnumber = ng->number;
6780
6781	/ For a recursion, that's all that is needed. We can now go to*
6782	the code that handles numerical recursion, applying it to the first
6783	group with the given name. /*
6784
6785	if (meta == META_RECURSE_BYNAME)
6786	{
6787	meta_arg = groupnumber;
6788	goto HANDLE_NUMERICAL_RECURSION;
6789	}
6790
6791	/ For a back reference, update the back reference map and the*
6792	maximum back reference. /*
6793
6794	cb->backref_map \|= (groupnumber < `32`)? (`1u` << groupnumber) : `1`;
6795	if (groupnumber > cb->top_backref)
6796	cb->top_backref = groupnumber;
6797	}
6798	}
6799
6800	/ If the name was not found we have a bad reference. /
6801
6802	if (groupnumber == `0`)
6803	{
6804	*errorcodeptr = ERR15;
6805	cb->erroroffset = offset;
6806	return `0`;
6807	}
6808
6809	/ If a back reference name is not duplicated, we can handle it as*
6810	a numerical reference. /*
6811
6812	if (!is_dupname)
6813	{
6814	meta_arg = groupnumber;
6815	goto HANDLE_SINGLE_REFERENCE;
6816	}
6817
6818	/ If a back reference name is duplicated, we generate a different*
6819	opcode to a numerical back reference. In the second pass we must
6820	search for the index and count in the final name table. /*
6821
6822	count = `0`; / Values for first pass (avoids compiler warning) /
6823	index = `0`;
6824	if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6825	&count, errorcodeptr, cb)) return `0`;
6826
6827	if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6828	*code++ = ((options & PCRE2_CASELESS) != `0`)? OP_DNREFI : OP_DNREF;
6829	PUT2INC(code, `0`, index);
6830	PUT2INC(code, `0`, count);
6831	}
6832	break;
6833
6834
6835	/ ===================================================================/
6836	/ Handle a numerical callout. /
6837
6838	case META_CALLOUT_NUMBER:
6839	code[`0`] = OP_CALLOUT;
6840	PUT(code, `1`, pptr[`1`]); / Offset to next pattern item /
6841	PUT(code, `1` + LINK_SIZE, pptr[`2`]); / Length of next pattern item /
6842	code[`1` + `2`*LINK_SIZE] = pptr[`3`];
6843	pptr += `3`;
6844	code += PRIV(OP_lengths)[OP_CALLOUT];
6845	break;
6846
6847
6848	/ ===================================================================/
6849	/ Handle a callout with a string argument. In the pre-pass we just compute*
6850	the length without generating anything. The length in pptr[3] includes both
6851	delimiters; in the actual compile only the first one is copied, but a
6852	terminating zero is added. Any doubled delimiters within the string make
6853	this an overestimate, but it is not worth bothering about. /*
6854
6855	case META_CALLOUT_STRING:
6856	if (lengthptr != NULL)
6857	{
6858	lengthptr += pptr[`3`] + (`1` + `4`LINK_SIZE);
6859	pptr += `3`;
6860	SKIPOFFSET(pptr);
6861	}
6862
6863	/ In the real compile we can copy the string. The starting delimiter is*
6864	included so that the client can discover it if they want. We also pass the
6865	start offset to help a script language give better error messages. /*
6866
6867	else
6868	{
6869	PCRE2_SPTR pp;
6870	uint32_t delimiter;
6871	uint32_t length = pptr[`3`];
6872	PCRE2_UCHAR callout_string = code + (`1` + `4`LINK_SIZE);
6873
6874	code[`0`] = OP_CALLOUT_STR;
6875	PUT(code, `1`, pptr[`1`]); / Offset to next pattern item /
6876	PUT(code, `1` + LINK_SIZE, pptr[`2`]); / Length of next pattern item /
6877
6878	pptr += `3`;
6879	GETPLUSOFFSET(offset, pptr); / Offset to string in pattern /
6880	pp = cb->start_pattern + offset;
6881	delimiter = callout_string++ = pp++;
6882	if (delimiter == CHAR_LEFT_CURLY_BRACKET)
6883	delimiter = CHAR_RIGHT_CURLY_BRACKET;
6884	PUT(code, `1` + `3`LINK_SIZE, (int)(offset + `1`)); /* One after delimiter /
6885
6886	/ The syntax of the pattern was checked in the parsing scan. The length*
6887	includes both delimiters, but we have passed the opening one just above,
6888	so we reduce length before testing it. The test is for > 1 because we do
6889	not want to copy the final delimiter. This also ensures that pp[1] is
6890	accessible. /*
6891
6892	while (--length > `1`)
6893	{
6894	if (*pp == delimiter && pp[`1`] == delimiter)
6895	{
6896	*callout_string++ = delimiter;
6897	pp += `2`;
6898	length--;
6899	}
6900	else callout_string++ = pp++;
6901	}
6902	*callout_string++ = CHAR_NUL;
6903
6904	/ Set the length of the entire item, then advance to its end. /
6905
6906	PUT(code, `1` + `2`LINK_SIZE, (int*)(callout_string - code));
6907	code = callout_string;
6908	}
6909	break;
6910
6911
6912	/ ===================================================================/
6913	/ Handle repetition. The different types are all sorted out in the parsing*
6914	pass. /*
6915
6916	case META_MINMAX_PLUS:
6917	case META_MINMAX_QUERY:
6918	case META_MINMAX:
6919	repeat_min = *(++pptr);
6920	repeat_max = *(++pptr);
6921	goto REPEAT;
6922
6923	case META_ASTERISK:
6924	case META_ASTERISK_PLUS:
6925	case META_ASTERISK_QUERY:
6926	repeat_min = `0`;
6927	repeat_max = REPEAT_UNLIMITED;
6928	goto REPEAT;
6929
6930	case META_PLUS:
6931	case META_PLUS_PLUS:
6932	case META_PLUS_QUERY:
6933	repeat_min = `1`;
6934	repeat_max = REPEAT_UNLIMITED;
6935	goto REPEAT;
6936
6937	case META_QUERY:
6938	case META_QUERY_PLUS:
6939	case META_QUERY_QUERY:
6940	repeat_min = `0`;
6941	repeat_max = `1`;
6942
6943	REPEAT:
6944	if (previous_matched_char && repeat_min > `0`) matched_char = TRUE;
6945
6946	/ Remember whether this is a variable length repeat, and default to*
6947	single-char opcodes. /*
6948
6949	reqvary = (repeat_min == repeat_max)? `0` : REQ_VARY;
6950	op_type = `0`;
6951
6952	/ Adjust first and required code units for a zero repeat. /
6953
6954	if (repeat_min == `0`)
6955	{
6956	firstcu = zerofirstcu;
6957	firstcuflags = zerofirstcuflags;
6958	reqcu = zeroreqcu;
6959	reqcuflags = zeroreqcuflags;
6960	}
6961
6962	/ Note the greediness and possessiveness. /
6963
6964	switch (meta)
6965	{
6966	case META_MINMAX_PLUS:
6967	case META_ASTERISK_PLUS:
6968	case META_PLUS_PLUS:
6969	case META_QUERY_PLUS:
6970	repeat_type = `0`; / Force greedy /
6971	possessive_quantifier = TRUE;
6972	break;
6973
6974	case META_MINMAX_QUERY:
6975	case META_ASTERISK_QUERY:
6976	case META_PLUS_QUERY:
6977	case META_QUERY_QUERY:
6978	repeat_type = greedy_non_default;
6979	possessive_quantifier = FALSE;
6980	break;
6981
6982	default:
6983	repeat_type = greedy_default;
6984	possessive_quantifier = FALSE;
6985	break;
6986	}
6987
6988	/ Save start of previous item, in case we have to move it up in order to*
6989	insert something before it, and remember what it was. /*
6990
6991	tempcode = previous;
6992	op_previous = *previous;
6993
6994	/ Now handle repetition for the different types of item. If the repeat*
6995	minimum and the repeat maximum are both 1, we can ignore the quantifier for
6996	non-parenthesized items, as they have only one alternative. For anything in
6997	parentheses, we must not ignore if {1} is possessive. /*
6998
6999	switch (op_previous)
7000	{
7001	/ If previous was a character or negated character match, abolish the*
7002	item and generate a repeat item instead. If a char item has a minimum of
7003	more than one, ensure that it is set in reqcu - it might not be if a
7004	sequence such as x{3} is the first thing in a branch because the x will
7005	have gone into firstcu instead. /*
7006
7007	case OP_CHAR:
7008	case OP_CHARI:
7009	case OP_NOT:
7010	case OP_NOTI:
7011	if (repeat_max == `1` && repeat_min == `1`) goto END_REPEAT;
7012	op_type = chartypeoffset[op_previous - OP_CHAR];
7013
7014	/ Deal with UTF characters that take up more than one code unit. /
7015
7016	#ifdef MAYBE_UTF_MULTI
7017	if (utf && NOT_FIRSTCU(code[-`1`]))
7018	{
7019	PCRE2_UCHAR *lastchar = code - `1`;
7020	BACKCHAR(lastchar);
7021	mclength = (uint32_t)(code - lastchar); / Length of UTF character /
7022	memcpy(mcbuffer, lastchar, CU2BYTES(mclength)); / Save the char /
7023	}
7024	else
7025	#endif /* MAYBE_UTF_MULTI */
7026
7027	/ Handle the case of a single code unit - either with no UTF support, or*
7028	with UTF disabled, or for a single-code-unit UTF character. In the latter
7029	case, for a repeated positive match, get the caseless flag for the
7030	required code unit from the previous character, because a class like [Aa]
7031	sets a caseless A but by now the req_caseopt flag has been reset. /*
7032
7033	{
7034	mcbuffer[`0`] = code[-`1`];
7035	mclength = `1`;
7036	if (op_previous <= OP_CHARI && repeat_min > `1`)
7037	{
7038	reqcu = mcbuffer[`0`];
7039	reqcuflags = cb->req_varyopt;
7040	if (op_previous == OP_CHARI) reqcuflags \|= REQ_CASELESS;
7041	}
7042	}
7043	goto OUTPUT_SINGLE_REPEAT; / Code shared with single character types /
7044
7045	/ If previous was a character class or a back reference, we put the*
7046	repeat stuff after it, but just skip the item if the repeat was {0,0}. /*
7047
7048	#ifdef SUPPORT_WIDE_CHARS
7049	case OP_XCLASS:
7050	#endif
7051	case OP_CLASS:
7052	case OP_NCLASS:
7053	case OP_REF:
7054	case OP_REFI:
7055	case OP_DNREF:
7056	case OP_DNREFI:
7057
7058	if (repeat_max == `0`)
7059	{
7060	code = previous;
7061	goto END_REPEAT;
7062	}
7063	if (repeat_max == `1` && repeat_min == `1`) goto END_REPEAT;
7064
7065	if (repeat_min == `0` && repeat_max == REPEAT_UNLIMITED)
7066	*code++ = OP_CRSTAR + repeat_type;
7067	else if (repeat_min == `1` && repeat_max == REPEAT_UNLIMITED)
7068	*code++ = OP_CRPLUS + repeat_type;
7069	else if (repeat_min == `0` && repeat_max == `1`)
7070	*code++ = OP_CRQUERY + repeat_type;
7071	else
7072	{
7073	*code++ = OP_CRRANGE + repeat_type;
7074	PUT2INC(code, `0`, repeat_min);
7075	if (repeat_max == REPEAT_UNLIMITED) repeat_max = `0`; / 2-byte encoding for max /
7076	PUT2INC(code, `0`, repeat_max);
7077	}
7078	break;
7079
7080	/ If previous is OP_FAIL, it was generated by an empty class []*
7081	(PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
7082	generated, that is by (FAIL) or (?!), disallow a quantifier at parse*
7083	time. We can just ignore this repeat. /*
7084
7085	case OP_FAIL:
7086	goto END_REPEAT;
7087
7088	/ Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets*
7089	because pcre2_match() could not handle backtracking into recursively
7090	called groups. Now that this backtracking is available, we no longer need
7091	to do this. However, we still need to replicate recursions as we do for
7092	groups so as to have independent backtracking points. We can replicate
7093	for the minimum number of repeats directly. For optional repeats we now
7094	wrap the recursion in OP_BRA brackets and make use of the bracket
7095	repetition. /*
7096
7097	case OP_RECURSE:
7098	if (repeat_max == `1` && repeat_min == `1` && !possessive_quantifier)
7099	goto END_REPEAT;
7100
7101	/ Generate unwrapped repeats for a non-zero minimum, except when the*
7102	minimum is 1 and the maximum unlimited, because that can be handled with
7103	OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
7104	minimum, we just need to generate the appropriate additional copies.
7105	Otherwise we need to generate one more, to simulate the situation when
7106	the minimum is zero. /*
7107
7108	if (repeat_min > `0` && (repeat_min != `1` \|\| repeat_max != REPEAT_UNLIMITED))
7109	{
7110	int replicate = repeat_min;
7111	if (repeat_min == repeat_max) replicate--;
7112
7113	/ In the pre-compile phase, we don't actually do the replication. We*
7114	just adjust the length as if we had. Do some paranoid checks for
7115	potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
7116	integer type when available, otherwise double. /*
7117
7118	if (lengthptr != NULL)
7119	{
7120	PCRE2_SIZE delta = replicate*(`1` + LINK_SIZE);
7121	if ((INT64_OR_DOUBLE)replicate*
7122	(INT64_OR_DOUBLE)(`1` + LINK_SIZE) >
7123	(INT64_OR_DOUBLE)INT_MAX \|\|
7124	OFLOW_MAX - *lengthptr < delta)
7125	{
7126	*errorcodeptr = ERR20;
7127	return `0`;
7128	}
7129	*lengthptr += delta;
7130	}
7131
7132	else for (int i = `0`; i < replicate; i++)
7133	{
7134	memcpy(code, previous, CU2BYTES(`1` + LINK_SIZE));
7135	previous = code;
7136	code += `1` + LINK_SIZE;
7137	}
7138
7139	/ If the number of repeats is fixed, we are done. Otherwise, adjust*
7140	the counts and fall through. /*
7141
7142	if (repeat_min == repeat_max) break;
7143	if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7144	repeat_min = `0`;
7145	}
7146
7147	/ Wrap the recursion call in OP_BRA brackets. /
7148
7149	(void)memmove(previous + `1` + LINK_SIZE, previous, CU2BYTES(`1` + LINK_SIZE));
7150	op_previous = *previous = OP_BRA;
7151	PUT(previous, `1`, `2` + `2`*LINK_SIZE);
7152	previous[`2` + `2`*LINK_SIZE] = OP_KET;
7153	PUT(previous, `3` + `2`LINK_SIZE, `2` + `2`LINK_SIZE);
7154	code += `2` + `2` * LINK_SIZE;
7155	length_prevgroup = `3` + `3`*LINK_SIZE;
7156	group_return = -`1`; / Set "may match empty string" /
7157
7158	/ Now treat as a repeated OP_BRA. /
7159	/ Fall through /
7160
7161	/ If previous was a bracket group, we may have to replicate it in*
7162	certain cases. Note that at this point we can encounter only the "basic"
7163	bracket opcodes such as BRA and CBRA, as this is the place where they get
7164	converted into the more special varieties such as BRAPOS and SBRA.
7165	Originally, PCRE did not allow repetition of assertions, but now it does,
7166	for Perl compatibility. /*
7167
7168	case OP_ASSERT:
7169	case OP_ASSERT_NOT:
7170	case OP_ASSERT_NA:
7171	case OP_ASSERTBACK:
7172	case OP_ASSERTBACK_NOT:
7173	case OP_ASSERTBACK_NA:
7174	case OP_ONCE:
7175	case OP_SCRIPT_RUN:
7176	case OP_BRA:
7177	case OP_CBRA:
7178	case OP_COND:
7179	{
7180	int len = (int)(code - previous);
7181	PCRE2_UCHAR *bralink = NULL;
7182	PCRE2_UCHAR *brazeroptr = NULL;
7183
7184	if (repeat_max == `1` && repeat_min == `1` && !possessive_quantifier)
7185	goto END_REPEAT;
7186
7187	/ Repeating a DEFINE group (or any group where the condition is always*
7188	FALSE and there is only one branch) is pointless, but Perl allows the
7189	syntax, so we just ignore the repeat. /*
7190
7191	if (op_previous == OP_COND && previous[LINK_SIZE+`1`] == OP_FALSE &&
7192	previous[GET(previous, `1`)] != OP_ALT)
7193	goto END_REPEAT;
7194
7195	/ Perl allows all assertions to be quantified, and when they contain*
7196	capturing parentheses and/or are optional there are potential uses for
7197	this feature. PCRE2 used to force the maximum quantifier to 1 on the
7198	invalid grounds that further repetition was never useful. This was
7199	always a bit pointless, since an assertion could be wrapped with a
7200	repeated group to achieve the effect. General repetition is now
7201	permitted, but if the maximum is unlimited it is set to one more than
7202	the minimum. /*
7203
7204	if (op_previous < OP_ONCE) / Assertion /
7205	{
7206	if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + `1`;
7207	}
7208
7209	/ The case of a zero minimum is special because of the need to stick*
7210	OP_BRAZERO in front of it, and because the group appears once in the
7211	data, whereas in other cases it appears the minimum number of times. For
7212	this reason, it is simplest to treat this case separately, as otherwise
7213	the code gets far too messy. There are several special subcases when the
7214	minimum is zero. /*
7215
7216	if (repeat_min == `0`)
7217	{
7218	/ If the maximum is also zero, we used to just omit the group from*
7219	the output altogether, like this:
7220
7221	** if (repeat_max == 0)
7222	** {
7223	** code = previous;
7224	** goto END_REPEAT;
7225	** }
7226
7227	However, that fails when a group or a subgroup within it is
7228	referenced as a subroutine from elsewhere in the pattern, so now we
7229	stick in OP_SKIPZERO in front of it so that it is skipped on
7230	execution. As we don't have a list of which groups are referenced, we
7231	cannot do this selectively.
7232
7233	If the maximum is 1 or unlimited, we just have to stick in the
7234	BRAZERO and do no more at this point. /*
7235
7236	if (repeat_max <= `1` \|\| repeat_max == REPEAT_UNLIMITED)
7237	{
7238	(void)memmove(previous + `1`, previous, CU2BYTES(len));
7239	code++;
7240	if (repeat_max == `0`)
7241	{
7242	*previous++ = OP_SKIPZERO;
7243	goto END_REPEAT;
7244	}
7245	brazeroptr = previous; / Save for possessive optimizing /
7246	*previous++ = OP_BRAZERO + repeat_type;
7247	}
7248
7249	/ If the maximum is greater than 1 and limited, we have to replicate*
7250	in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7251	The first one has to be handled carefully because it's the original
7252	copy, which has to be moved up. The remainder can be handled by code
7253	that is common with the non-zero minimum case below. We have to
7254	adjust the value of repeat_max, since one less copy is required. /*
7255
7256	else
7257	{
7258	int linkoffset;
7259	(void)memmove(previous + `2` + LINK_SIZE, previous, CU2BYTES(len));
7260	code += `2` + LINK_SIZE;
7261	*previous++ = OP_BRAZERO + repeat_type;
7262	*previous++ = OP_BRA;
7263
7264	/ We chain together the bracket link offset fields that have to be*
7265	filled in later when the ends of the brackets are reached. /*
7266
7267	linkoffset = (bralink == NULL)? `0` : (int)(previous - bralink);
7268	bralink = previous;
7269	PUTINC(previous, `0`, linkoffset);
7270	}
7271
7272	if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7273	}
7274
7275	/ If the minimum is greater than zero, replicate the group as many*
7276	times as necessary, and adjust the maximum to the number of subsequent
7277	copies that we need. /*
7278
7279	else
7280	{
7281	if (repeat_min > `1`)
7282	{
7283	/ In the pre-compile phase, we don't actually do the replication.*
7284	We just adjust the length as if we had. Do some paranoid checks for
7285	potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
7286	integer type when available, otherwise double. /*
7287
7288	if (lengthptr != NULL)
7289	{
7290	PCRE2_SIZE delta = (repeat_min - `1`)*length_prevgroup;
7291	if ((INT64_OR_DOUBLE)(repeat_min - `1`)*
7292	(INT64_OR_DOUBLE)length_prevgroup >
7293	(INT64_OR_DOUBLE)INT_MAX \|\|
7294	OFLOW_MAX - *lengthptr < delta)
7295	{
7296	*errorcodeptr = ERR20;
7297	return `0`;
7298	}
7299	*lengthptr += delta;
7300	}
7301
7302	/ This is compiling for real. If there is a set first code unit*
7303	for the group, and we have not yet set a "required code unit", set
7304	it. /*
7305
7306	else
7307	{
7308	if (groupsetfirstcu && reqcuflags >= REQ_NONE)
7309	{
7310	reqcu = firstcu;
7311	reqcuflags = firstcuflags;
7312	}
7313	for (uint32_t i = `1`; i < repeat_min; i++)
7314	{
7315	memcpy(code, previous, CU2BYTES(len));
7316	code += len;
7317	}
7318	}
7319	}
7320
7321	if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7322	}
7323
7324	/ This code is common to both the zero and non-zero minimum cases. If*
7325	the maximum is limited, it replicates the group in a nested fashion,
7326	remembering the bracket starts on a stack. In the case of a zero
7327	minimum, the first one was set up above. In all cases the repeat_max
7328	now specifies the number of additional copies needed. Again, we must
7329	remember to replicate entries on the forward reference list. /*
7330
7331	if (repeat_max != REPEAT_UNLIMITED)
7332	{
7333	/ In the pre-compile phase, we don't actually do the replication. We*
7334	just adjust the length as if we had. For each repetition we must add
7335	1 to the length for BRAZERO and for all but the last repetition we
7336	must add 2 + 2LINKSIZE to allow for the nesting that occurs. Do some*
7337	paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type
7338	is a 64-bit integer type when available, otherwise double. /*
7339
7340	if (lengthptr != NULL && repeat_max > `0`)
7341	{
7342	PCRE2_SIZE delta = repeat_max(length_prevgroup + `1` + `2` + `2`LINK_SIZE) -
7343	`2` - `2`LINK_SIZE; /* Last one doesn't nest /
7344	if ((INT64_OR_DOUBLE)repeat_max *
7345	(INT64_OR_DOUBLE)(length_prevgroup + `1` + `2` + `2`*LINK_SIZE)
7346	> (INT64_OR_DOUBLE)INT_MAX \|\|
7347	OFLOW_MAX - *lengthptr < delta)
7348	{
7349	*errorcodeptr = ERR20;
7350	return `0`;
7351	}
7352	*lengthptr += delta;
7353	}
7354
7355	/ This is compiling for real /
7356
7357	else for (uint32_t i = repeat_max; i >= `1`; i--)
7358	{
7359	*code++ = OP_BRAZERO + repeat_type;
7360
7361	/ All but the final copy start a new nesting, maintaining the*
7362	chain of brackets outstanding. /*
7363
7364	if (i != `1`)
7365	{
7366	int linkoffset;
7367	*code++ = OP_BRA;
7368	linkoffset = (bralink == NULL)? `0` : (int)(code - bralink);
7369	bralink = code;
7370	PUTINC(code, `0`, linkoffset);
7371	}
7372
7373	memcpy(code, previous, CU2BYTES(len));
7374	code += len;
7375	}
7376
7377	/ Now chain through the pending brackets, and fill in their length*
7378	fields (which are holding the chain links pro tem). /*
7379
7380	while (bralink != NULL)
7381	{
7382	int oldlinkoffset;
7383	int linkoffset = (int)(code - bralink + `1`);
7384	PCRE2_UCHAR *bra = code - linkoffset;
7385	oldlinkoffset = GET(bra, `1`);
7386	bralink = (oldlinkoffset == `0`)? NULL : bralink - oldlinkoffset;
7387	*code++ = OP_KET;
7388	PUTINC(code, `0`, linkoffset);
7389	PUT(bra, `1`, linkoffset);
7390	}
7391	}
7392
7393	/ If the maximum is unlimited, set a repeater in the final copy. For*
7394	SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7395	possessively repeated ONCE brackets can be converted into non-capturing
7396	brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7397	saves having to deal with possessive ONCEs specially.
7398
7399	Otherwise, when we are doing the actual compile phase, check to see
7400	whether this group is one that could match an empty string. If so,
7401	convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7402	that runtime checking can be done. [This check is also applied to ONCE
7403	and SCRIPT_RUN groups at runtime, but in a different way.]
7404
7405	Then, if the quantifier was possessive and the bracket is not a
7406	conditional, we convert the BRA code to the POS form, and the KET code
7407	to KETRPOS. (It turns out to be convenient at runtime to detect this
7408	kind of subpattern at both the start and at the end.) The use of
7409	special opcodes makes it possible to reduce greatly the stack usage in
7410	pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7411	OP_BRAPOSZERO.
7412
7413	Then, if the minimum number of matches is 1 or 0, cancel the possessive
7414	flag so that the default action below, of wrapping everything inside
7415	atomic brackets, does not happen. When the minimum is greater than 1,
7416	there will be earlier copies of the group, and so we still have to wrap
7417	the whole thing. /*
7418
7419	else
7420	{
7421	PCRE2_UCHAR *ketcode = code - `1` - LINK_SIZE;
7422	PCRE2_UCHAR *bracode = ketcode - GET(ketcode, `1`);
7423
7424	/ Convert possessive ONCE brackets to non-capturing /
7425
7426	if (bracode == OP_ONCE && possessive_quantifier) bracode = OP_BRA;
7427
7428	/ For non-possessive ONCE and for SCRIPT_RUN brackets, all we need*
7429	to do is to set the KET. /*
7430
7431	if (bracode == OP_ONCE \|\| bracode == OP_SCRIPT_RUN)
7432	*ketcode = OP_KETRMAX + repeat_type;
7433
7434	/ Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs*
7435	(which have been converted to non-capturing above). /*
7436
7437	else
7438	{
7439	/ In the compile phase, adjust the opcode if the group can match*
7440	an empty string. For a conditional group with only one branch, the
7441	value of group_return will not show "could be empty", so we must
7442	check that separately. /*
7443
7444	if (lengthptr == NULL)
7445	{
7446	if (group_return < `0`) *bracode += OP_SBRA - OP_BRA;
7447	if (*bracode == OP_COND && bracode[GET(bracode,`1`)] != OP_ALT)
7448	*bracode = OP_SCOND;
7449	}
7450
7451	/ Handle possessive quantifiers. /
7452
7453	if (possessive_quantifier)
7454	{
7455	/ For COND brackets, we wrap the whole thing in a possessively*
7456	repeated non-capturing bracket, because we have not invented POS
7457	versions of the COND opcodes. /*
7458
7459	if (bracode == OP_COND \|\| bracode == OP_SCOND)
7460	{
7461	int nlen = (int)(code - bracode);
7462	(void)memmove(bracode + `1` + LINK_SIZE, bracode, CU2BYTES(nlen));
7463	code += `1` + LINK_SIZE;
7464	nlen += `1` + LINK_SIZE;
7465	bracode = (bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7466	*code++ = OP_KETRPOS;
7467	PUTINC(code, `0`, nlen);
7468	PUT(bracode, `1`, nlen);
7469	}
7470
7471	/ For non-COND brackets, we modify the BRA code and use KETRPOS. /
7472
7473	else
7474	{
7475	bracode += `1`; /* Switch to xxxPOS opcodes /
7476	*ketcode = OP_KETRPOS;
7477	}
7478
7479	/ If the minimum is zero, mark it as possessive, then unset the*
7480	possessive flag when the minimum is 0 or 1. /*
7481
7482	if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7483	if (repeat_min < `2`) possessive_quantifier = FALSE;
7484	}
7485
7486	/ Non-possessive quantifier /
7487
7488	else *ketcode = OP_KETRMAX + repeat_type;
7489	}
7490	}
7491	}
7492	break;
7493
7494	/ If previous was a character type match (\d or similar), abolish it and*
7495	create a suitable repeat item. The code is shared with single-character
7496	repeats by setting op_type to add a suitable offset into repeat_type.
7497	Note that the Unicode property types will be present only when
7498	SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7499	here because it just makes it horribly messy. /*
7500
7501	default:
7502	if (op_previous >= OP_EODN) / Not a character type - internal error /
7503	{
7504	*errorcodeptr = ERR10;
7505	return `0`;
7506	}
7507	else
7508	{
7509	int prop_type, prop_value;
7510	PCRE2_UCHAR *oldcode;
7511
7512	if (repeat_max == `1` && repeat_min == `1`) goto END_REPEAT;
7513
7514	op_type = OP_TYPESTAR - OP_STAR; / Use type opcodes /
7515	mclength = `0`; / Not a character /
7516
7517	if (op_previous == OP_PROP \|\| op_previous == OP_NOTPROP)
7518	{
7519	prop_type = previous[`1`];
7520	prop_value = previous[`2`];
7521	}
7522	else
7523	{
7524	/ Come here from just above with a character in mcbuffer/mclength. /
7525	OUTPUT_SINGLE_REPEAT:
7526	prop_type = prop_value = -`1`;
7527	}
7528
7529	/ At this point, if prop_type == prop_value == -1 we either have a*
7530	character in mcbuffer when mclength is greater than zero, or we have
7531	mclength zero, in which case there is a non-property character type in
7532	op_previous. If prop_type/value are not negative, we have a property
7533	character type in op_previous. /*
7534
7535	oldcode = code; / Save where we were /
7536	code = previous; / Usually overwrite previous item /
7537
7538	/ If the maximum is zero then the minimum must also be zero; Perl allows*
7539	this case, so we do too - by simply omitting the item altogether. /*
7540
7541	if (repeat_max == `0`) goto END_REPEAT;
7542
7543	/ Combine the op_type with the repeat_type /
7544
7545	repeat_type += op_type;
7546
7547	/ A minimum of zero is handled either as the special case * or ?, or as*
7548	an UPTO, with the maximum given. /*
7549
7550	if (repeat_min == `0`)
7551	{
7552	if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7553	else if (repeat_max == `1`) *code++ = OP_QUERY + repeat_type;
7554	else
7555	{
7556	*code++ = OP_UPTO + repeat_type;
7557	PUT2INC(code, `0`, repeat_max);
7558	}
7559	}
7560
7561	/ A repeat minimum of 1 is optimized into some special cases. If the*
7562	maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7563	left in place and, if the maximum is greater than 1, we use OP_UPTO with
7564	one less than the maximum. /*
7565
7566	else if (repeat_min == `1`)
7567	{
7568	if (repeat_max == REPEAT_UNLIMITED)
7569	*code++ = OP_PLUS + repeat_type;
7570	else
7571	{
7572	code = oldcode; / Leave previous item in place /
7573	if (repeat_max == `1`) goto END_REPEAT;
7574	*code++ = OP_UPTO + repeat_type;
7575	PUT2INC(code, `0`, repeat_max - `1`);
7576	}
7577	}
7578
7579	/ The case {n,n} is just an EXACT, while the general case {n,m} is*
7580	handled as an EXACT followed by an UPTO or STAR or QUERY. /*
7581
7582	else
7583	{
7584	code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type /
7585	PUT2INC(code, `0`, repeat_min);
7586
7587	/ Unless repeat_max equals repeat_min, fill in the data for EXACT,*
7588	and then generate the second opcode. For a repeated Unicode property
7589	match, there are two extra values that define the required property,
7590	and mclength is set zero to indicate this. /*
7591
7592	if (repeat_max != repeat_min)
7593	{
7594	if (mclength > `0`)
7595	{
7596	memcpy(code, mcbuffer, CU2BYTES(mclength));
7597	code += mclength;
7598	}
7599	else
7600	{
7601	*code++ = op_previous;
7602	if (prop_type >= `0`)
7603	{
7604	*code++ = prop_type;
7605	*code++ = prop_value;
7606	}
7607	}
7608
7609	/ Now set up the following opcode /
7610
7611	if (repeat_max == REPEAT_UNLIMITED)
7612	*code++ = OP_STAR + repeat_type;
7613	else
7614	{
7615	repeat_max -= repeat_min;
7616	if (repeat_max == `1`)
7617	{
7618	*code++ = OP_QUERY + repeat_type;
7619	}
7620	else
7621	{
7622	*code++ = OP_UPTO + repeat_type;
7623	PUT2INC(code, `0`, repeat_max);
7624	}
7625	}
7626	}
7627	}
7628
7629	/ Fill in the character or character type for the final opcode. /
7630
7631	if (mclength > `0`)
7632	{
7633	memcpy(code, mcbuffer, CU2BYTES(mclength));
7634	code += mclength;
7635	}
7636	else
7637	{
7638	*code++ = op_previous;
7639	if (prop_type >= `0`)
7640	{
7641	*code++ = prop_type;
7642	*code++ = prop_value;
7643	}
7644	}
7645	}
7646	break;
7647	} / End of switch on different op_previous values /
7648
7649
7650	/ If the character following a repeat is '+', possessive_quantifier is*
7651	TRUE. For some opcodes, there are special alternative opcodes for this
7652	case. For anything else, we wrap the entire repeated item inside OP_ONCE
7653	brackets. Logically, the '+' notation is just syntactic sugar, taken from
7654	Sun's Java package, but the special opcodes can optimize it.
7655
7656	Some (but not all) possessively repeated subpatterns have already been
7657	completely handled in the code just above. For them, possessive_quantifier
7658	is always FALSE at this stage. Note that the repeated item starts at
7659	tempcode, not at previous, which might be the first part of a string whose
7660	(former) last char we repeated. /*
7661
7662	if (possessive_quantifier)
7663	{
7664	int len;
7665
7666	/ Possessifying an EXACT quantifier has no effect, so we can ignore it.*
7667	However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7668	{5,}, or {5,10}). We skip over an EXACT item; if the length of what
7669	remains is greater than zero, there's a further opcode that can be
7670	handled. If not, do nothing, leaving the EXACT alone. /*
7671
7672	switch(*tempcode)
7673	{
7674	case OP_TYPEEXACT:
7675	tempcode += PRIV(OP_lengths)[*tempcode] +
7676	((tempcode[`1` + IMM2_SIZE] == OP_PROP
7677	\|\| tempcode[`1` + IMM2_SIZE] == OP_NOTPROP)? `2` : `0`);
7678	break;
7679
7680	/ CHAR opcodes are used for exacts whose count is 1. /
7681
7682	case OP_CHAR:
7683	case OP_CHARI:
7684	case OP_NOT:
7685	case OP_NOTI:
7686	case OP_EXACT:
7687	case OP_EXACTI:
7688	case OP_NOTEXACT:
7689	case OP_NOTEXACTI:
7690	tempcode += PRIV(OP_lengths)[*tempcode];
7691	#ifdef SUPPORT_UNICODE
7692	if (utf && HAS_EXTRALEN(tempcode[-`1`]))
7693	tempcode += GET_EXTRALEN(tempcode[-`1`]);
7694	#endif
7695	break;
7696
7697	/ For the class opcodes, the repeat operator appears at the end;*
7698	adjust tempcode to point to it. /*
7699
7700	case OP_CLASS:
7701	case OP_NCLASS:
7702	tempcode += `1` + `32`/sizeof(PCRE2_UCHAR);
7703	break;
7704
7705	#ifdef SUPPORT_WIDE_CHARS
7706	case OP_XCLASS:
7707	tempcode += GET(tempcode, `1`);
7708	break;
7709	#endif
7710	}
7711
7712	/ If tempcode is equal to code (which points to the end of the repeated*
7713	item), it means we have skipped an EXACT item but there is no following
7714	QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7715	all other cases, tempcode will be pointing to the repeat opcode, and will
7716	be less than code, so the value of len will be greater than 0. /*
7717
7718	len = (int)(code - tempcode);
7719	if (len > `0`)
7720	{
7721	unsigned int repcode = *tempcode;
7722
7723	/ There is a table for possessifying opcodes, all of which are less*
7724	than OP_CALLOUT. A zero entry means there is no possessified version.
7725	*/
7726
7727	if (repcode < OP_CALLOUT && opcode_possessify[repcode] > `0`)
7728	*tempcode = opcode_possessify[repcode];
7729
7730	/ For opcode without a special possessified version, wrap the item in*
7731	ONCE brackets. /*
7732
7733	else
7734	{
7735	(void)memmove(tempcode + `1` + LINK_SIZE, tempcode, CU2BYTES(len));
7736	code += `1` + LINK_SIZE;
7737	len += `1` + LINK_SIZE;
7738	tempcode[`0`] = OP_ONCE;
7739	*code++ = OP_KET;
7740	PUTINC(code, `0`, len);
7741	PUT(tempcode, `1`, len);
7742	}
7743	}
7744	}
7745
7746	/ We set the "follows varying string" flag for subsequently encountered*
7747	reqcus if it isn't already set and we have just passed a varying length
7748	item. /*
7749
7750	END_REPEAT:
7751	cb->req_varyopt \|= reqvary;
7752	break;
7753
7754
7755	/ ===================================================================/
7756	/ Handle a 32-bit data character with a value greater than META_END. /
7757
7758	case META_BIGVALUE:
7759	pptr++;
7760	goto NORMAL_CHAR;
7761
7762
7763	/ ===============================================================/
7764	/ Handle a back reference by number, which is the meta argument. The*
7765	pattern offsets for back references to group numbers less than 10 are held
7766	in a special vector, to avoid using more than two parsed pattern elements
7767	in 64-bit environments. We only need the offset to the first occurrence,
7768	because if that doesn't fail, subsequent ones will also be OK. /*
7769
7770	case META_BACKREF:
7771	if (meta_arg < `10`) offset = cb->small_ref_offset[meta_arg];
7772	else GETPLUSOFFSET(offset, pptr);
7773
7774	if (meta_arg > cb->bracount)
7775	{
7776	cb->erroroffset = offset;
7777	errorcodeptr = ERR15; /* Non-existent subpattern /
7778	return `0`;
7779	}
7780
7781	/ Come here from named backref handling when the reference is to a*
7782	single group (that is, not to a duplicated name). The back reference
7783	data will have already been updated. We must disable firstcu if not
7784	set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
7785	later. /*
7786
7787	HANDLE_SINGLE_REFERENCE:
7788	if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
7789	*code++ = ((options & PCRE2_CASELESS) != `0`)? OP_REFI : OP_REF;
7790	PUT2INC(code, `0`, meta_arg);
7791
7792	/ Update the map of back references, and keep the highest one. We*
7793	could do this in parse_regex() for numerical back references, but not
7794	for named back references, because we don't know the numbers to which
7795	named back references refer. So we do it all in this function. /*
7796
7797	cb->backref_map \|= (meta_arg < `32`)? (`1u` << meta_arg) : `1`;
7798	if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
7799	break;
7800
7801
7802	/ ===============================================================/
7803	/ Handle recursion by inserting the number of the called group (which is*
7804	the meta argument) after OP_RECURSE. At the end of compiling the pattern is
7805	scanned and these numbers are replaced by offsets within the pattern. It is
7806	done like this to avoid problems with forward references and adjusting
7807	offsets when groups are duplicated and moved (as discovered in previous
7808	implementations). Note that a recursion does not have a set first
7809	character. /*
7810
7811	case META_RECURSE:
7812	GETPLUSOFFSET(offset, pptr);
7813	if (meta_arg > cb->bracount)
7814	{
7815	cb->erroroffset = offset;
7816	errorcodeptr = ERR15; /* Non-existent subpattern /
7817	return `0`;
7818	}
7819	HANDLE_NUMERICAL_RECURSION:
7820	*code = OP_RECURSE;
7821	PUT(code, `1`, meta_arg);
7822	code += `1` + LINK_SIZE;
7823	groupsetfirstcu = FALSE;
7824	cb->had_recurse = TRUE;
7825	if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7826	zerofirstcu = firstcu;
7827	zerofirstcuflags = firstcuflags;
7828	break;
7829
7830
7831	/ ===============================================================/
7832	/ Handle capturing parentheses; the number is the meta argument. /
7833
7834	case META_CAPTURE:
7835	bravalue = OP_CBRA;
7836	skipunits = IMM2_SIZE;
7837	PUT2(code, `1`+LINK_SIZE, meta_arg);
7838	cb->lastcapture = meta_arg;
7839	goto GROUP_PROCESS_NOTE_EMPTY;
7840
7841
7842	/ ===============================================================/
7843	/ Handle escape sequence items. For ones like \d, the ESC_values are*
7844	arranged to be the same as the corresponding OP_values in the default case
7845	when PCRE2_UCP is not set (which is the only case in which they will appear
7846	here).
7847
7848	Note: \Q and \E are never seen here, as they were dealt with in
7849	parse_pattern(). Neither are numerical back references or recursions, which
7850	were turned into META_BACKREF or META_RECURSE items, respectively. \k and
7851	\g, when followed by names, are turned into META_BACKREF_BYNAME or
7852	META_RECURSE_BYNAME. /*
7853
7854	case META_ESCAPE:
7855
7856	/ We can test for escape sequences that consume a character because their*
7857	values lie between ESC_b and ESC_Z; this may have to change if any new ones
7858	are ever created. For these sequences, we disable the setting of a first
7859	character if it hasn't already been set. /*
7860
7861	if (meta_arg > ESC_b && meta_arg < ESC_Z)
7862	{
7863	matched_char = TRUE;
7864	if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7865	}
7866
7867	/ Set values to reset to if this is followed by a zero repeat. /
7868
7869	zerofirstcu = firstcu;
7870	zerofirstcuflags = firstcuflags;
7871	zeroreqcu = reqcu;
7872	zeroreqcuflags = reqcuflags;
7873
7874	/ If Unicode is not supported, \P and \p are not allowed and are*
7875	faulted at parse time, so will never appear here. /*
7876
7877	#ifdef SUPPORT_UNICODE
7878	if (meta_arg == ESC_P \|\| meta_arg == ESC_p)
7879	{
7880	uint32_t ptype = *(++pptr) >> `16`;
7881	uint32_t pdata = *pptr & `0xffff`;
7882
7883	/ The special case of \p{Any} is compiled to OP_ALLANY so as to benefit*
7884	from the auto-anchoring code. /*
7885
7886	if (meta_arg == ESC_p && ptype == PT_ANY)
7887	{
7888	*code++ = OP_ALLANY;
7889	}
7890	else
7891	{
7892	*code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
7893	*code++ = ptype;
7894	*code++ = pdata;
7895	}
7896	break; / End META_ESCAPE /
7897	}
7898	#endif
7899
7900	/ \K is forbidden in lookarounds since 10.38 because that's what Perl has*
7901	done. However, there's an option, in case anyone was relying on it. /*
7902
7903	if (cb->assert_depth > `0` && meta_arg == ESC_K &&
7904	(cb->cx->extra_options & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == `0`)
7905	{
7906	*errorcodeptr = ERR99;
7907	return `0`;
7908	}
7909
7910	/ For the rest (including \X when Unicode is supported - if not it's*
7911	faulted at parse time), the OP value is the escape value when PCRE2_UCP is
7912	not set; if it is set, these escapes do not show up here because they are
7913	converted into Unicode property tests in parse_regex(). Note that \b and \B
7914	do a one-character lookbehind, and \A also behaves as if it does. /*
7915
7916	if (meta_arg == ESC_C) cb->external_flags \|= PCRE2_HASBKC; / Record /
7917	if ((meta_arg == ESC_b \|\| meta_arg == ESC_B \|\| meta_arg == ESC_A) &&
7918	cb->max_lookbehind == `0`)
7919	cb->max_lookbehind = `1`;
7920
7921	/ In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY*
7922	instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds. /*
7923
7924	#if PCRE2_CODE_UNIT_WIDTH == 32
7925	*code++ = (meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7926	#else
7927	*code++ = (!utf && meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7928	#endif
7929	break; / End META_ESCAPE /
7930
7931
7932	/ ===================================================================/
7933	/ Handle an unrecognized meta value. A parsed pattern value less than*
7934	META_END is a literal. Otherwise we have a problem. /*
7935
7936	default:
7937	if (meta >= META_END)
7938	{
7939	#ifdef DEBUG_SHOW_PARSED
7940	fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr);
7941	#endif
7942	errorcodeptr = ERR89; /* Internal error - unrecognized. /
7943	return `0`;
7944	}
7945
7946	/ Handle a literal character. We come here by goto in the case of a*
7947	32-bit, non-UTF character whose value is greater than META_END. /*
7948
7949	NORMAL_CHAR:
7950	meta = pptr; /* Get the full 32 bits /
7951	NORMAL_CHAR_SET: / Character is already in meta /
7952	matched_char = TRUE;
7953
7954	/ For caseless UTF or UCP mode, check whether this character has more than*
7955	one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
7956	*/
7957
7958	#ifdef SUPPORT_UNICODE
7959	if ((utf\|\|ucp) && (options & PCRE2_CASELESS) != `0`)
7960	{
7961	uint32_t caseset = UCD_CASESET(meta);
7962	if (caseset != `0`)
7963	{
7964	*code++ = OP_PROP;
7965	*code++ = PT_CLIST;
7966	*code++ = caseset;
7967	if (firstcuflags == REQ_UNSET)
7968	firstcuflags = zerofirstcuflags = REQ_NONE;
7969	break; / End handling this meta item /
7970	}
7971	}
7972	#endif
7973
7974	/ Caseful matches, or caseless and not one of the multicase characters. We*
7975	come here by goto in the case of a positive class that contains only
7976	case-partners of a character with just two cases; matched_char has already
7977	been set TRUE and options fudged if necessary. /*
7978
7979	CLASS_CASELESS_CHAR:
7980
7981	/ Get the character's code units into mcbuffer, with the length in*
7982	mclength. When not in UTF mode, the length is always 1. /*
7983
7984	#ifdef SUPPORT_UNICODE
7985	if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
7986	#endif
7987	{
7988	mclength = `1`;
7989	mcbuffer[`0`] = meta;
7990	}
7991
7992	/ Generate the appropriate code /
7993
7994	*code++ = ((options & PCRE2_CASELESS) != `0`)? OP_CHARI : OP_CHAR;
7995	memcpy(code, mcbuffer, CU2BYTES(mclength));
7996	code += mclength;
7997
7998	/ Remember if \r or \n were seen /
7999
8000	if (mcbuffer[`0`] == CHAR_CR \|\| mcbuffer[`0`] == CHAR_NL)
8001	cb->external_flags \|= PCRE2_HASCRORLF;
8002
8003	/ Set the first and required code units appropriately. If no previous*
8004	first code unit, set it from this character, but revert to none on a zero
8005	repeat. Otherwise, leave the firstcu value alone, and don't change it on
8006	a zero repeat. /*
8007
8008	if (firstcuflags == REQ_UNSET)
8009	{
8010	zerofirstcuflags = REQ_NONE;
8011	zeroreqcu = reqcu;
8012	zeroreqcuflags = reqcuflags;
8013
8014	/ If the character is more than one code unit long, we can set a single*
8015	firstcu only if it is not to be matched caselessly. Multiple possible
8016	starting code units may be picked up later in the studying code. /*
8017
8018	if (mclength == `1` \|\| req_caseopt == `0`)
8019	{
8020	firstcu = mcbuffer[`0`];
8021	firstcuflags = req_caseopt;
8022	if (mclength != `1`)
8023	{
8024	reqcu = code[-`1`];
8025	reqcuflags = cb->req_varyopt;
8026	}
8027	}
8028	else firstcuflags = reqcuflags = REQ_NONE;
8029	}
8030
8031	/ firstcu was previously set; we can set reqcu only if the length is*
8032	1 or the matching is caseful. /*
8033
8034	else
8035	{
8036	zerofirstcu = firstcu;
8037	zerofirstcuflags = firstcuflags;
8038	zeroreqcu = reqcu;
8039	zeroreqcuflags = reqcuflags;
8040	if (mclength == `1` \|\| req_caseopt == `0`)
8041	{
8042	reqcu = code[-`1`];
8043	reqcuflags = req_caseopt \| cb->req_varyopt;
8044	}
8045	}
8046
8047	/ If caselessness was temporarily instated, reset it. /
8048
8049	if (reset_caseful)
8050	{
8051	options &= ~PCRE2_CASELESS;
8052	req_caseopt = `0`;
8053	reset_caseful = FALSE;
8054	}
8055
8056	break; / End literal character handling /
8057	} / End of big switch /
8058	} / End of big loop /
8059
8060	/ Control never reaches here. /
8061	}
8062
8063
8064
8065	/*************************************************
8066	* Compile regex: a sequence of alternatives *
8067	*************************************************/
8068
8069	/ On entry, pptr is pointing past the bracket meta, but on return it points to*
8070	the closing bracket or META_END. The code variable is pointing at the code unit
8071	into which the BRA operator has been stored. This function is used during the
8072	pre-compile phase when we are trying to find out the amount of memory needed,
8073	as well as during the real compile phase. The value of lengthptr distinguishes
8074	the two phases.
8075
8076	Arguments:
8077	options option bits, including any changes for this subpattern
8078	codeptr -> the address of the current code pointer
8079	pptrptr -> the address of the current parsed pattern pointer
8080	errorcodeptr -> pointer to error code variable
8081	skipunits skip this many code units at start (for brackets and OP_COND)
8082	firstcuptr place to put the first required code unit
8083	firstcuflagsptr place to put the first code unit flags
8084	reqcuptr place to put the last required code unit
8085	reqcuflagsptr place to put the last required code unit flags
8086	bcptr pointer to the chain of currently open branches
8087	cb points to the data block with tables pointers etc.
8088	lengthptr NULL during the real compile phase
8089	points to length accumulator during pre-compile phase
8090
8091	Returns: 0 There has been an error
8092	+1 Success, this group must match at least one character
8093	-1 Success, this group may match an empty string
8094	*/
8095
8096	static int
8097	compile_regex(uint32_t options, PCRE2_UCHAR codeptr, uint32_t pptrptr,
8098	int errorcodeptr, uint32_t skipunits, uint32_t firstcuptr,
8099	uint32_t firstcuflagsptr, uint32_t reqcuptr, uint32_t *reqcuflagsptr,
8100	branch_chain bcptr, compile_block cb, PCRE2_SIZE *lengthptr)
8101	{
8102	PCRE2_UCHAR code = codeptr;
8103	PCRE2_UCHAR *last_branch = code;
8104	PCRE2_UCHAR *start_bracket = code;
8105	BOOL lookbehind;
8106	open_capitem capitem;
8107	int capnumber = `0`;
8108	int okreturn = `1`;
8109	uint32_t pptr = pptrptr;
8110	uint32_t firstcu, reqcu;
8111	uint32_t lookbehindlength;
8112	uint32_t firstcuflags, reqcuflags;
8113	uint32_t branchfirstcu, branchreqcu;
8114	uint32_t branchfirstcuflags, branchreqcuflags;
8115	PCRE2_SIZE length;
8116	branch_chain bc;
8117
8118	/ If set, call the external function that checks for stack availability. /
8119
8120	if (cb->cx->stack_guard != NULL &&
8121	cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
8122	{
8123	*errorcodeptr= ERR33;
8124	return `0`;
8125	}
8126
8127	/ Miscellaneous initialization /
8128
8129	bc.outer = bcptr;
8130	bc.current_branch = code;
8131
8132	firstcu = reqcu = `0`;
8133	firstcuflags = reqcuflags = REQ_UNSET;
8134
8135	/ Accumulate the length for use in the pre-compile phase. Start with the*
8136	length of the BRA and KET and any extra code units that are required at the
8137	beginning. We accumulate in a local variable to save frequent testing of
8138	lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8139	start and end of each alternative, because compiled items are discarded during
8140	the pre-compile phase so that the workspace is not exceeded. /*
8141
8142	length = `2` + `2`*LINK_SIZE + skipunits;
8143
8144	/ Remember if this is a lookbehind assertion, and if it is, save its length*
8145	and skip over the pattern offset. /*
8146
8147	lookbehind = *code == OP_ASSERTBACK \|\|
8148	*code == OP_ASSERTBACK_NOT \|\|
8149	*code == OP_ASSERTBACK_NA;
8150
8151	if (lookbehind)
8152	{
8153	lookbehindlength = META_DATA(pptr[-`1`]);
8154	pptr += SIZEOFFSET;
8155	}
8156	else lookbehindlength = `0`;
8157
8158	/ If this is a capturing subpattern, add to the chain of open capturing items*
8159	so that we can detect them if (ACCEPT) is encountered. Note that only OP_CBRA*
8160	need be tested here; changing this opcode to one of its variants, e.g.
8161	OP_SCBRAPOS, happens later, after the group has been compiled. /*
8162
8163	if (*code == OP_CBRA)
8164	{
8165	capnumber = GET2(code, `1` + LINK_SIZE);
8166	capitem.number = capnumber;
8167	capitem.next = cb->open_caps;
8168	capitem.assert_depth = cb->assert_depth;
8169	cb->open_caps = &capitem;
8170	}
8171
8172	/ Offset is set zero to mark that this bracket is still open /
8173
8174	PUT(code, `1`, `0`);
8175	code += `1` + LINK_SIZE + skipunits;
8176
8177	/ Loop for each alternative branch /
8178
8179	for (;;)
8180	{
8181	int branch_return;
8182
8183	/ Insert OP_REVERSE if this is as lookbehind assertion. /
8184
8185	if (lookbehind && lookbehindlength > `0`)
8186	{
8187	*code++ = OP_REVERSE;
8188	PUTINC(code, `0`, lookbehindlength);
8189	length += `1` + LINK_SIZE;
8190	}
8191
8192	/ Now compile the branch; in the pre-compile phase its length gets added*
8193	into the length. /*
8194
8195	if ((branch_return =
8196	compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu,
8197	&branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
8198	cb, (lengthptr == NULL)? NULL : &length)) == `0`)
8199	return `0`;
8200
8201	/ If a branch can match an empty string, so can the whole group. /
8202
8203	if (branch_return < `0`) okreturn = -`1`;
8204
8205	/ In the real compile phase, there is some post-processing to be done. /
8206
8207	if (lengthptr == NULL)
8208	{
8209	/ If this is the first branch, the firstcu and reqcu values for the*
8210	branch become the values for the regex. /*
8211
8212	if (*last_branch != OP_ALT)
8213	{
8214	firstcu = branchfirstcu;
8215	firstcuflags = branchfirstcuflags;
8216	reqcu = branchreqcu;
8217	reqcuflags = branchreqcuflags;
8218	}
8219
8220	/ If this is not the first branch, the first char and reqcu have to*
8221	match the values from all the previous branches, except that if the
8222	previous value for reqcu didn't have REQ_VARY set, it can still match,
8223	and we set REQ_VARY for the group from this branch's value. /*
8224
8225	else
8226	{
8227	/ If we previously had a firstcu, but it doesn't match the new branch,*
8228	we have to abandon the firstcu for the regex, but if there was
8229	previously no reqcu, it takes on the value of the old firstcu. /*
8230
8231	if (firstcuflags != branchfirstcuflags \|\| firstcu != branchfirstcu)
8232	{
8233	if (firstcuflags < REQ_NONE)
8234	{
8235	if (reqcuflags >= REQ_NONE)
8236	{
8237	reqcu = firstcu;
8238	reqcuflags = firstcuflags;
8239	}
8240	}
8241	firstcuflags = REQ_NONE;
8242	}
8243
8244	/ If we (now or from before) have no firstcu, a firstcu from the*
8245	branch becomes a reqcu if there isn't a branch reqcu. /*
8246
8247	if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
8248	branchreqcuflags >= REQ_NONE)
8249	{
8250	branchreqcu = branchfirstcu;
8251	branchreqcuflags = branchfirstcuflags;
8252	}
8253
8254	/ Now ensure that the reqcus match /
8255
8256	if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) \|\|
8257	reqcu != branchreqcu)
8258	reqcuflags = REQ_NONE;
8259	else
8260	{
8261	reqcu = branchreqcu;
8262	reqcuflags \|= branchreqcuflags; / To "or" REQ_VARY if present /
8263	}
8264	}
8265	}
8266
8267	/ Handle reaching the end of the expression, either ')' or end of pattern.*
8268	In the real compile phase, go back through the alternative branches and
8269	reverse the chain of offsets, with the field in the BRA item now becoming an
8270	offset to the first alternative. If there are no alternatives, it points to
8271	the end of the group. The length in the terminating ket is always the length
8272	of the whole bracketed item. Return leaving the pointer at the terminating
8273	char. /*
8274
8275	if (META_CODE(*pptr) != META_ALT)
8276	{
8277	if (lengthptr == NULL)
8278	{
8279	PCRE2_SIZE branch_length = code - last_branch;
8280	do
8281	{
8282	PCRE2_SIZE prev_length = GET(last_branch, `1`);
8283	PUT(last_branch, `1`, branch_length);
8284	branch_length = prev_length;
8285	last_branch -= branch_length;
8286	}
8287	while (branch_length > `0`);
8288	}
8289
8290	/ Fill in the ket /
8291
8292	*code = OP_KET;
8293	PUT(code, `1`, (int)(code - start_bracket));
8294	code += `1` + LINK_SIZE;
8295
8296	/ If it was a capturing subpattern, remove the block from the chain. /
8297
8298	if (capnumber > `0`) cb->open_caps = cb->open_caps->next;
8299
8300	/ Set values to pass back /
8301
8302	*codeptr = code;
8303	*pptrptr = pptr;
8304	*firstcuptr = firstcu;
8305	*firstcuflagsptr = firstcuflags;
8306	*reqcuptr = reqcu;
8307	*reqcuflagsptr = reqcuflags;
8308	if (lengthptr != NULL)
8309	{
8310	if (OFLOW_MAX - *lengthptr < length)
8311	{
8312	*errorcodeptr = ERR20;
8313	return `0`;
8314	}
8315	*lengthptr += length;
8316	}
8317	return okreturn;
8318	}
8319
8320	/ Another branch follows. In the pre-compile phase, we can move the code*
8321	pointer back to where it was for the start of the first branch. (That is,
8322	pretend that each branch is the only one.)
8323
8324	In the real compile phase, insert an ALT node. Its length field points back
8325	to the previous branch while the bracket remains open. At the end the chain
8326	is reversed. It's done like this so that the start of the bracket has a
8327	zero offset until it is closed, making it possible to detect recursion. /*
8328
8329	if (lengthptr != NULL)
8330	{
8331	code = *codeptr + `1` + LINK_SIZE + skipunits;
8332	length += `1` + LINK_SIZE;
8333	}
8334	else
8335	{
8336	*code = OP_ALT;
8337	PUT(code, `1`, (int)(code - last_branch));
8338	bc.current_branch = last_branch = code;
8339	code += `1` + LINK_SIZE;
8340	}
8341
8342	/ Set the lookbehind length (if not in a lookbehind the value will be zero)*
8343	and then advance past the vertical bar. /*
8344
8345	lookbehindlength = META_DATA(*pptr);
8346	pptr++;
8347	}
8348	/ Control never reaches here /
8349	}
8350
8351
8352
8353	/*************************************************
8354	* Check for anchored pattern *
8355	*************************************************/
8356
8357	/ Try to find out if this is an anchored regular expression. Consider each*
8358	alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8359	all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8360	it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8361	be found, because ^ generates OP_CIRCM in that mode.
8362
8363	We can also consider a regex to be anchored if OP_SOM starts all its branches.
8364	This is the code for \G, which means "match at start of match position, taking
8365	into account the match offset".
8366
8367	A branch is also implicitly anchored if it starts with . and DOTALL is set,*
8368	because that will try the rest of the pattern at all possible matching points,
8369	so there is no point trying again.... er ....
8370
8371	.... except when the . appears inside capturing parentheses, and there is a*
8372	subsequent back reference to those parentheses. We haven't enough information
8373	to catch that case precisely.
8374
8375	At first, the best we could do was to detect when . was in capturing brackets*
8376	and the highest back reference was greater than or equal to that level.
8377	However, by keeping a bitmap of the first 31 back references, we can catch some
8378	of the more common cases more precisely.
8379
8380	... A second exception is when the . appears inside an atomic group, because*
8381	this prevents the number of characters it matches from being adjusted.
8382
8383	Arguments:
8384	code points to start of the compiled pattern
8385	bracket_map a bitmap of which brackets we are inside while testing; this
8386	handles up to substring 31; after that we just have to take
8387	the less precise approach
8388	cb points to the compile data block
8389	atomcount atomic group level
8390	inassert TRUE if in an assertion
8391
8392	Returns: TRUE or FALSE
8393	*/
8394
8395	static BOOL
8396	is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
8397	int atomcount, BOOL inassert)
8398	{
8399	do {
8400	PCRE2_SPTR scode = first_significant_code(
8401	code + PRIV(OP_lengths)[*code], FALSE);
8402	int op = *scode;
8403
8404	/ Non-capturing brackets /
8405
8406	if (op == OP_BRA \|\| op == OP_BRAPOS \|\|
8407	op == OP_SBRA \|\| op == OP_SBRAPOS)
8408	{
8409	if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8410	return FALSE;
8411	}
8412
8413	/ Capturing brackets /
8414
8415	else if (op == OP_CBRA \|\| op == OP_CBRAPOS \|\|
8416	op == OP_SCBRA \|\| op == OP_SCBRAPOS)
8417	{
8418	int n = GET2(scode, `1`+LINK_SIZE);
8419	uint32_t new_map = bracket_map \| ((n < `32`)? (`1u` << n) : `1`);
8420	if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
8421	}
8422
8423	/ Positive forward assertion /
8424
8425	else if (op == OP_ASSERT \|\| op == OP_ASSERT_NA)
8426	{
8427	if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8428	}
8429
8430	/ Condition. If there is no second branch, it can't be anchored. /
8431
8432	else if (op == OP_COND \|\| op == OP_SCOND)
8433	{
8434	if (scode[GET(scode,`1`)] != OP_ALT) return FALSE;
8435	if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8436	return FALSE;
8437	}
8438
8439	/ Atomic groups /
8440
8441	else if (op == OP_ONCE)
8442	{
8443	if (!is_anchored(scode, bracket_map, cb, atomcount + `1`, inassert))
8444	return FALSE;
8445	}
8446
8447	/ .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and*
8448	it isn't in brackets that are or may be referenced or inside an atomic
8449	group or an assertion. Also the pattern must not contain PRUNE or SKIP,
8450	because these break the feature. Consider, for example, /(?s).?(PRUNE)b/
8451	with the subject "aab", which matches "b", i.e. not at the start of a line.
8452	There is also an option that disables auto-anchoring. /*
8453
8454	else if ((op == OP_TYPESTAR \|\| op == OP_TYPEMINSTAR \|\|
8455	op == OP_TYPEPOSSTAR))
8456	{
8457	if (scode[`1`] != OP_ALLANY \|\| (bracket_map & cb->backref_map) != `0` \|\|
8458	atomcount > `0` \|\| cb->had_pruneorskip \|\| inassert \|\|
8459	(cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != `0`)
8460	return FALSE;
8461	}
8462
8463	/ Check for explicit anchoring /
8464
8465	else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8466
8467	code += GET(code, `1`);
8468	}
8469	while (code == OP_ALT); /* Loop for each alternative /
8470	return TRUE;
8471	}
8472
8473
8474
/*************************************************
*         Check for starting with ^ or .*        *
*************************************************/

/* This is called to find out if every branch starts with ^ or .* so that
"first char" processing can be done to speed things up in multiline
matching and for non-DOTALL patterns that start with .* (which must start at
the beginning or after \n). As in the case of is_anchored() (see above), we
have to take account of back references to capturing brackets that contain .*
because in that case we can't make the assumption. Also, the appearance of .*
inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
or *SKIP does not count, because once again the assumption no longer holds.

Arguments:
  code           points to start of the compiled pattern or a group
  bracket_map    a bitmap of which brackets we are inside while testing; this
                   handles up to substring 31; after that we just have to take
                   the less precise approach
  cb             points to the compile data
  atomcount      atomic group level
  inassert       TRUE if in an assertion

Returns:         TRUE or FALSE
*/
8499
8500	static BOOL
8501	is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8502	int atomcount, BOOL inassert)
8503	{
8504	do {
8505	PCRE2_SPTR scode = first_significant_code(
8506	code + PRIV(OP_lengths)[*code], FALSE);
8507	int op = *scode;
8508
8509	/ If we are at the start of a conditional assertion group, both the*
8510	conditional assertion and* what follows the condition must satisfy the test*
8511	for start of line. Other kinds of condition fail. Note that there may be an
8512	auto-callout at the start of a condition. /*
8513
8514	if (op == OP_COND)
8515	{
8516	scode += `1` + LINK_SIZE;
8517
8518	if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8519	else if (scode == OP_CALLOUT_STR) scode += GET(scode, `1` + `2`LINK_SIZE);
8520
8521	switch (*scode)
8522	{
8523	case OP_CREF:
8524	case OP_DNCREF:
8525	case OP_RREF:
8526	case OP_DNRREF:
8527	case OP_FAIL:
8528	case OP_FALSE:
8529	case OP_TRUE:
8530	return FALSE;
8531
8532	default: / Assertion /
8533	if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8534	do scode += GET(scode, `1`); while (*scode == OP_ALT);
8535	scode += `1` + LINK_SIZE;
8536	break;
8537	}
8538	scode = first_significant_code(scode, FALSE);
8539	op = *scode;
8540	}
8541
8542	/ Non-capturing brackets /
8543
8544	if (op == OP_BRA \|\| op == OP_BRAPOS \|\|
8545	op == OP_SBRA \|\| op == OP_SBRAPOS)
8546	{
8547	if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
8548	return FALSE;
8549	}
8550
8551	/ Capturing brackets /
8552
8553	else if (op == OP_CBRA \|\| op == OP_CBRAPOS \|\|
8554	op == OP_SCBRA \|\| op == OP_SCBRAPOS)
8555	{
8556	int n = GET2(scode, `1`+LINK_SIZE);
8557	unsigned int new_map = bracket_map \| ((n < `32`)? (`1u` << n) : `1`);
8558	if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
8559	}
8560
8561	/ Positive forward assertions /
8562
8563	else if (op == OP_ASSERT \|\| op == OP_ASSERT_NA)
8564	{
8565	if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
8566	return FALSE;
8567	}
8568
8569	/ Atomic brackets /
8570
8571	else if (op == OP_ONCE)
8572	{
8573	if (!is_startline(scode, bracket_map, cb, atomcount + `1`, inassert))
8574	return FALSE;
8575	}
8576
8577	/ .* means "start at start or after \n" if it isn't in atomic brackets or*
8578	brackets that may be referenced or an assertion, and as long as the pattern
8579	does not contain PRUNE or SKIP, because these break the feature. Consider,
8580	for example, /.?a(PRUNE)b/ with the subject "aab", which matches "ab",
8581	i.e. not at the start of a line. There is also an option that disables this
8582	optimization. /*
8583
8584	else if (op == OP_TYPESTAR \|\| op == OP_TYPEMINSTAR \|\| op == OP_TYPEPOSSTAR)
8585	{
8586	if (scode[`1`] != OP_ANY \|\| (bracket_map & cb->backref_map) != `0` \|\|
8587	atomcount > `0` \|\| cb->had_pruneorskip \|\| inassert \|\|
8588	(cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != `0`)
8589	return FALSE;
8590	}
8591
8592	/ Check for explicit circumflex; anything else gives a FALSE result. Note*
8593	in particular that this includes atomic brackets OP_ONCE because the number
8594	of characters matched by . cannot be adjusted inside them. /
8595
8596	else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8597
8598	/ Move on to the next alternative /
8599
8600	code += GET(code, `1`);
8601	}
8602	while (code == OP_ALT); /* Loop for each alternative /
8603	return TRUE;
8604	}
8605
8606
8607
8608	/*************************************************
8609	* Scan compiled regex for recursion reference *
8610	*************************************************/
8611
8612	/ This function scans through a compiled pattern until it finds an instance of*
8613	OP_RECURSE.
8614
8615	Arguments:
8616	code points to start of expression
8617	utf TRUE in UTF mode
8618
8619	Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
8620	*/
8621
8622	static PCRE2_SPTR
8623	find_recurse(PCRE2_SPTR code, BOOL utf)
8624	{
8625	for (;;)
8626	{
8627	PCRE2_UCHAR c = *code;
8628	if (c == OP_END) return NULL;
8629	if (c == OP_RECURSE) return code;
8630
8631	/ XCLASS is used for classes that cannot be represented just by a bit map.*
8632	This includes negated single high-valued characters. CALLOUT_STR is used for
8633	callouts with string arguments. In both cases the length in the table is
8634	zero; the actual length is stored in the compiled code. /*
8635
8636	if (c == OP_XCLASS) code += GET(code, `1`);
8637	else if (c == OP_CALLOUT_STR) code += GET(code, `1` + `2`*LINK_SIZE);
8638
8639	/ Otherwise, we can get the item's length from the table, except that for*
8640	repeated character types, we have to test for \p and \P, which have an extra
8641	two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
8642	we must add in its length. /*
8643
8644	else
8645	{
8646	switch(c)
8647	{
8648	case OP_TYPESTAR:
8649	case OP_TYPEMINSTAR:
8650	case OP_TYPEPLUS:
8651	case OP_TYPEMINPLUS:
8652	case OP_TYPEQUERY:
8653	case OP_TYPEMINQUERY:
8654	case OP_TYPEPOSSTAR:
8655	case OP_TYPEPOSPLUS:
8656	case OP_TYPEPOSQUERY:
8657	if (code[`1`] == OP_PROP \|\| code[`1`] == OP_NOTPROP) code += `2`;
8658	break;
8659
8660	case OP_TYPEPOSUPTO:
8661	case OP_TYPEUPTO:
8662	case OP_TYPEMINUPTO:
8663	case OP_TYPEEXACT:
8664	if (code[`1` + IMM2_SIZE] == OP_PROP \|\| code[`1` + IMM2_SIZE] == OP_NOTPROP)
8665	code += `2`;
8666	break;
8667
8668	case OP_MARK:
8669	case OP_COMMIT_ARG:
8670	case OP_PRUNE_ARG:
8671	case OP_SKIP_ARG:
8672	case OP_THEN_ARG:
8673	code += code[`1`];
8674	break;
8675	}
8676
8677	/ Add in the fixed length from the table /
8678
8679	code += PRIV(OP_lengths)[c];
8680
8681	/ In UTF-8 and UTF-16 modes, opcodes that are followed by a character may*
8682	be followed by a multi-unit character. The length in the table is a
8683	minimum, so we have to arrange to skip the extra units. /*
8684
8685	#ifdef MAYBE_UTF_MULTI
8686	if (utf) switch(c)
8687	{
8688	case OP_CHAR:
8689	case OP_CHARI:
8690	case OP_NOT:
8691	case OP_NOTI:
8692	case OP_EXACT:
8693	case OP_EXACTI:
8694	case OP_NOTEXACT:
8695	case OP_NOTEXACTI:
8696	case OP_UPTO:
8697	case OP_UPTOI:
8698	case OP_NOTUPTO:
8699	case OP_NOTUPTOI:
8700	case OP_MINUPTO:
8701	case OP_MINUPTOI:
8702	case OP_NOTMINUPTO:
8703	case OP_NOTMINUPTOI:
8704	case OP_POSUPTO:
8705	case OP_POSUPTOI:
8706	case OP_NOTPOSUPTO:
8707	case OP_NOTPOSUPTOI:
8708	case OP_STAR:
8709	case OP_STARI:
8710	case OP_NOTSTAR:
8711	case OP_NOTSTARI:
8712	case OP_MINSTAR:
8713	case OP_MINSTARI:
8714	case OP_NOTMINSTAR:
8715	case OP_NOTMINSTARI:
8716	case OP_POSSTAR:
8717	case OP_POSSTARI:
8718	case OP_NOTPOSSTAR:
8719	case OP_NOTPOSSTARI:
8720	case OP_PLUS:
8721	case OP_PLUSI:
8722	case OP_NOTPLUS:
8723	case OP_NOTPLUSI:
8724	case OP_MINPLUS:
8725	case OP_MINPLUSI:
8726	case OP_NOTMINPLUS:
8727	case OP_NOTMINPLUSI:
8728	case OP_POSPLUS:
8729	case OP_POSPLUSI:
8730	case OP_NOTPOSPLUS:
8731	case OP_NOTPOSPLUSI:
8732	case OP_QUERY:
8733	case OP_QUERYI:
8734	case OP_NOTQUERY:
8735	case OP_NOTQUERYI:
8736	case OP_MINQUERY:
8737	case OP_MINQUERYI:
8738	case OP_NOTMINQUERY:
8739	case OP_NOTMINQUERYI:
8740	case OP_POSQUERY:
8741	case OP_POSQUERYI:
8742	case OP_NOTPOSQUERY:
8743	case OP_NOTPOSQUERYI:
8744	if (HAS_EXTRALEN(code[-`1`])) code += GET_EXTRALEN(code[-`1`]);
8745	break;
8746	}
8747	#else
8748	(void)(utf); / Keep compiler happy by referencing function argument /
8749	#endif /* MAYBE_UTF_MULTI */
8750	}
8751	}
8752	}
8753
8754
8755
8756	/*************************************************
8757	* Check for asserted fixed first code unit *
8758	*************************************************/
8759
8760	/ During compilation, the "first code unit" settings from forward assertions*
8761	are discarded, because they can cause conflicts with actual literals that
8762	follow. However, if we end up without a first code unit setting for an
8763	unanchored pattern, it is worth scanning the regex to see if there is an
8764	initial asserted first code unit. If all branches start with the same asserted
8765	code unit, or with a non-conditional bracket all of whose alternatives start
8766	with the same asserted code unit (recurse ad lib), then we return that code
8767	unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
8768	REQ_NONE in the flags.
8769
8770	Arguments:
8771	code points to start of compiled pattern
8772	flags points to the first code unit flags
8773	inassert non-zero if in an assertion
8774
8775	Returns: the fixed first code unit, or 0 with REQ_NONE in flags
8776	*/
8777
8778	static uint32_t
8779	find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
8780	{
8781	uint32_t c = `0`;
8782	uint32_t cflags = REQ_NONE;
8783
8784	*flags = REQ_NONE;
8785	do {
8786	uint32_t d;
8787	uint32_t dflags;
8788	int xl = (code == OP_CBRA \|\| code == OP_SCBRA \|\|
8789	code == OP_CBRAPOS \|\| code == OP_SCBRAPOS)? IMM2_SIZE:`0`;
8790	PCRE2_SPTR scode = first_significant_code(code + `1`+LINK_SIZE + xl, TRUE);
8791	PCRE2_UCHAR op = *scode;
8792
8793	switch(op)
8794	{
8795	default:
8796	return `0`;
8797
8798	case OP_BRA:
8799	case OP_BRAPOS:
8800	case OP_CBRA:
8801	case OP_SCBRA:
8802	case OP_CBRAPOS:
8803	case OP_SCBRAPOS:
8804	case OP_ASSERT:
8805	case OP_ASSERT_NA:
8806	case OP_ONCE:
8807	case OP_SCRIPT_RUN:
8808	d = find_firstassertedcu(scode, &dflags, inassert +
8809	((op == OP_ASSERT \|\| op == OP_ASSERT_NA)?`1`:`0`));
8810	if (dflags >= REQ_NONE) return `0`;
8811	if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
8812	else if (c != d \|\| cflags != dflags) return `0`;
8813	break;
8814
8815	case OP_EXACT:
8816	scode += IMM2_SIZE;
8817	/ Fall through /
8818
8819	case OP_CHAR:
8820	case OP_PLUS:
8821	case OP_MINPLUS:
8822	case OP_POSPLUS:
8823	if (inassert == `0`) return `0`;
8824	if (cflags >= REQ_NONE) { c = scode[`1`]; cflags = `0`; }
8825	else if (c != scode[`1`]) return `0`;
8826	break;
8827
8828	case OP_EXACTI:
8829	scode += IMM2_SIZE;
8830	/ Fall through /
8831
8832	case OP_CHARI:
8833	case OP_PLUSI:
8834	case OP_MINPLUSI:
8835	case OP_POSPLUSI:
8836	if (inassert == `0`) return `0`;
8837
8838	/ If the character is more than one code unit long, we cannot set its*
8839	first code unit when matching caselessly. Later scanning may pick up
8840	multiple code units. /*
8841
8842	#ifdef SUPPORT_UNICODE
8843	#if PCRE2_CODE_UNIT_WIDTH == 8
8844	if (scode[`1`] >= `0x80`) return `0`;
8845	#elif PCRE2_CODE_UNIT_WIDTH == 16
8846	if (scode[`1`] >= `0xd800` && scode[`1`] <= `0xdfff`) return `0`;
8847	#endif
8848	#endif
8849
8850	if (cflags >= REQ_NONE) { c = scode[`1`]; cflags = REQ_CASELESS; }
8851	else if (c != scode[`1`]) return `0`;
8852	break;
8853	}
8854
8855	code += GET(code, `1`);
8856	}
8857	while (*code == OP_ALT);
8858
8859	*flags = cflags;
8860	return c;
8861	}
8862
8863
8864
8865	/*************************************************
8866	* Add an entry to the name/number table *
8867	*************************************************/
8868
8869	/ This function is called between compiling passes to add an entry to the*
8870	name/number table, maintaining alphabetical order. Checking for permitted
8871	and forbidden duplicates has already been done.
8872
8873	Arguments:
8874	cb the compile data block
8875	name the name to add
8876	length the length of the name
8877	groupno the group number
8878	tablecount the count of names in the table so far
8879
8880	Returns: nothing
8881	*/
8882
8883	static void
8884	add_name_to_table(compile_block cb, PCRE2_SPTR name, int* length,
8885	unsigned int groupno, uint32_t tablecount)
8886	{
8887	uint32_t i;
8888	PCRE2_UCHAR *slot = cb->name_table;
8889
8890	for (i = `0`; i < tablecount; i++)
8891	{
8892	int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
8893	if (crc == `0` && slot[IMM2_SIZE+length] != `0`)
8894	crc = -`1`; / Current name is a substring /
8895
8896	/ Make space in the table and break the loop for an earlier name. For a*
8897	duplicate or later name, carry on. We do this for duplicates so that in the
8898	simple case (when ?(\| is not used) they are in order of their numbers. In all
8899	cases they are in the order in which they appear in the pattern. /*
8900
8901	if (crc < `0`)
8902	{
8903	(void)memmove(slot + cb->name_entry_size, slot,
8904	CU2BYTES((tablecount - i) * cb->name_entry_size));
8905	break;
8906	}
8907
8908	/ Continue the loop for a later or duplicate name /
8909
8910	slot += cb->name_entry_size;
8911	}
8912
8913	PUT2(slot, `0`, groupno);
8914	memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
8915
8916	/ Add a terminating zero and fill the rest of the slot with zeroes so that*
8917	the memory is all initialized. Otherwise valgrind moans about uninitialized
8918	memory when saving serialized compiled patterns. /*
8919
8920	memset(slot + IMM2_SIZE + length, `0`,
8921	CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
8922	}
8923
8924
8925
8926	/*************************************************
8927	* Skip in parsed pattern *
8928	*************************************************/
8929
8930	/ This function is called to skip parts of the parsed pattern when finding the*
8931	length of a lookbehind branch. It is called after (ACCEPT) and (FAIL) to find
8932	the end of the branch, it is called to skip over an internal lookaround or
8933	(DEFINE) group, and it is also called to skip to the end of a class, during
8934	which it will never encounter nested groups (but there's no need to have
8935	special code for that).
8936
8937	When called to find the end of a branch or group, pptr must point to the first
8938	meta code inside the branch, not the branch-starting code. In other cases it
8939	can point to the item that causes the function to be called.
8940
8941	Arguments:
8942	pptr current pointer to skip from
8943	skiptype PSKIP_CLASS when skipping to end of class
8944	PSKIP_ALT when META_ALT ends the skip
8945	PSKIP_KET when only META_KET ends the skip
8946
8947	Returns: new value of pptr
8948	NULL if META_END is reached - should never occur
8949	or for an unknown meta value - likewise
8950	*/
8951
8952	static uint32_t *
8953	parsed_skip(uint32_t *pptr, uint32_t skiptype)
8954	{
8955	uint32_t nestlevel = `0`;
8956
8957	for (;; pptr++)
8958	{
8959	uint32_t meta = META_CODE(*pptr);
8960
8961	switch(meta)
8962	{
8963	default: / Just skip over most items /
8964	if (meta < META_END) continue; / Literal /
8965	break;
8966
8967	/ This should never occur. /
8968
8969	case META_END:
8970	return NULL;
8971
8972	/ The data for these items is variable in length. /
8973
8974	case META_BACKREF: / Offset is present only if group >= 10 /
8975	if (META_DATA(*pptr) >= `10`) pptr += SIZEOFFSET;
8976	break;
8977
8978	case META_ESCAPE: / A few escapes are followed by data items. /
8979	switch (META_DATA(*pptr))
8980	{
8981	case ESC_P:
8982	case ESC_p:
8983	pptr += `1`;
8984	break;
8985
8986	case ESC_g:
8987	case ESC_k:
8988	pptr += `1` + SIZEOFFSET;
8989	break;
8990	}
8991	break;
8992
8993	case META_MARK: / Add the length of the name. /
8994	case META_COMMIT_ARG:
8995	case META_PRUNE_ARG:
8996	case META_SKIP_ARG:
8997	case META_THEN_ARG:
8998	pptr += pptr[`1`];
8999	break;
9000
9001	/ These are the "active" items in this loop. /
9002
9003	case META_CLASS_END:
9004	if (skiptype == PSKIP_CLASS) return pptr;
9005	break;
9006
9007	case META_ATOMIC:
9008	case META_CAPTURE:
9009	case META_COND_ASSERT:
9010	case META_COND_DEFINE:
9011	case META_COND_NAME:
9012	case META_COND_NUMBER:
9013	case META_COND_RNAME:
9014	case META_COND_RNUMBER:
9015	case META_COND_VERSION:
9016	case META_LOOKAHEAD:
9017	case META_LOOKAHEADNOT:
9018	case META_LOOKAHEAD_NA:
9019	case META_LOOKBEHIND:
9020	case META_LOOKBEHINDNOT:
9021	case META_LOOKBEHIND_NA:
9022	case META_NOCAPTURE:
9023	case META_SCRIPT_RUN:
9024	nestlevel++;
9025	break;
9026
9027	case META_ALT:
9028	if (nestlevel == `0` && skiptype == PSKIP_ALT) return pptr;
9029	break;
9030
9031	case META_KET:
9032	if (nestlevel == `0`) return pptr;
9033	nestlevel--;
9034	break;
9035	}
9036
9037	/ The extra data item length for each meta is in a table. /
9038
9039	meta = (meta >> `16`) & `0x7fff`;
9040	if (meta >= sizeof(meta_extra_lengths)) return NULL;
9041	pptr += meta_extra_lengths[meta];
9042	}
9043	/ Control never reaches here /
9044	return pptr;
9045	}
9046
9047
9048
9049	/*************************************************
9050	* Find length of a parsed group *
9051	*************************************************/
9052
9053	/ This is called for nested groups within a branch of a lookbehind whose*
9054	length is being computed. If all the branches in the nested group have the same
9055	length, that is OK. On entry, the pointer must be at the first element after
9056	the group initializing code. On exit it points to OP_KET. Caching is used to
9057	improve processing speed when the same capturing group occurs many times.
9058
9059	Arguments:
9060	pptrptr pointer to pointer in the parsed pattern
9061	isinline FALSE if a reference or recursion; TRUE for inline group
9062	errcodeptr pointer to the errorcode
9063	lcptr pointer to the loop counter
9064	group number of captured group or -1 for a non-capturing group
9065	recurses chain of recurse_check to catch mutual recursion
9066	cb pointer to the compile data
9067
9068	Returns: the group length or a negative number
9069	*/
9070
9071	static int
9072	get_grouplength(uint32_t *pptrptr, BOOL isinline, int* errcodeptr, int* *lcptr,
9073	int group, parsed_recurse_check recurses, compile_block cb)
9074	{
9075	int branchlength;
9076	int grouplength = -`1`;
9077
9078	/ The cache can be used only if there is no possibility of there being two*
9079	groups with the same number. We do not need to set the end pointer for a group
9080	that is being processed as a back reference or recursion, but we must do so for
9081	an inline group. /*
9082
9083	if (group > `0` && (cb->external_flags & PCRE2_DUPCAPUSED) == `0`)
9084	{
9085	uint32_t groupinfo = cb->groupinfo[group];
9086	if ((groupinfo & GI_NOT_FIXED_LENGTH) != `0`) return -`1`;
9087	if ((groupinfo & GI_SET_FIXED_LENGTH) != `0`)
9088	{
9089	if (isinline) pptrptr = parsed_skip(pptrptr, PSKIP_KET);
9090	return groupinfo & GI_FIXED_LENGTH_MASK;
9091	}
9092	}
9093
9094	/ Scan the group. In this case we find the end pointer of necessity. /
9095
9096	for(;;)
9097	{
9098	branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
9099	if (branchlength < `0`) goto ISNOTFIXED;
9100	if (grouplength == -`1`) grouplength = branchlength;
9101	else if (grouplength != branchlength) goto ISNOTFIXED;
9102	if (pptrptr == META_KET) break**;
9103	pptrptr += `1`; /* Skip META_ALT /
9104	}
9105
9106	if (group > `0`)
9107	cb->groupinfo[group] \|= (uint32_t)(GI_SET_FIXED_LENGTH \| grouplength);
9108	return grouplength;
9109
9110	ISNOTFIXED:
9111	if (group > `0`) cb->groupinfo[group] \|= GI_NOT_FIXED_LENGTH;
9112	return -`1`;
9113	}
9114
9115
9116
9117	/*************************************************
9118	* Find length of a parsed branch *
9119	*************************************************/
9120
9121	/ Return a fixed length for a branch in a lookbehind, giving an error if the*
9122	length is not fixed. On entry, pptrptr points to the first element inside the*
9123	branch. On exit it is set to point to the ALT or KET.
9124
9125	Arguments:
9126	pptrptr pointer to pointer in the parsed pattern
9127	errcodeptr pointer to error code
9128	lcptr pointer to loop counter
9129	recurses chain of recurse_check to catch mutual recursion
9130	cb pointer to compile block
9131
9132	Returns: the length, or a negative value on error
9133	*/
9134
9135	static int
9136	get_branchlength(uint32_t *pptrptr, int* errcodeptr, int* *lcptr,
9137	parsed_recurse_check recurses, compile_block cb)
9138	{
9139	int branchlength = `0`;
9140	int grouplength;
9141	uint32_t lastitemlength = `0`;
9142	uint32_t pptr = pptrptr;
9143	PCRE2_SIZE offset;
9144	parsed_recurse_check this_recurse;
9145
9146	/ A large and/or complex regex can take too long to process. This can happen*
9147	more often when (?\| groups are present in the pattern because their length
9148	cannot be cached. /*
9149
9150	if ((*lcptr)++ > `2000`)
9151	{
9152	errcodeptr = ERR35; /* Lookbehind is too complicated /
9153	return -`1`;
9154	}
9155
9156	/ Scan the branch, accumulating the length. /
9157
9158	for (;; pptr++)
9159	{
9160	parsed_recurse_check *r;
9161	uint32_t gptr, gptrend;
9162	uint32_t escape;
9163	uint32_t group = `0`;
9164	uint32_t itemlength = `0`;
9165
9166	if (*pptr < META_END)
9167	{
9168	itemlength = `1`;
9169	}
9170
9171	else switch (META_CODE(*pptr))
9172	{
9173	case META_KET:
9174	case META_ALT:
9175	goto EXIT;
9176
9177	/ (ACCEPT) and (FAIL) terminate the branch, but we must skip to the*
9178	actual termination. /*
9179
9180	case META_ACCEPT:
9181	case META_FAIL:
9182	pptr = parsed_skip(pptr, PSKIP_ALT);
9183	if (pptr == NULL) goto PARSED_SKIP_FAILED;
9184	goto EXIT;
9185
9186	case META_MARK:
9187	case META_COMMIT_ARG:
9188	case META_PRUNE_ARG:
9189	case META_SKIP_ARG:
9190	case META_THEN_ARG:
9191	pptr += pptr[`1`] + `1`;
9192	break;
9193
9194	case META_CIRCUMFLEX:
9195	case META_COMMIT:
9196	case META_DOLLAR:
9197	case META_PRUNE:
9198	case META_SKIP:
9199	case META_THEN:
9200	break;
9201
9202	case META_OPTIONS:
9203	pptr += `1`;
9204	break;
9205
9206	case META_BIGVALUE:
9207	itemlength = `1`;
9208	pptr += `1`;
9209	break;
9210
9211	case META_CLASS:
9212	case META_CLASS_NOT:
9213	itemlength = `1`;
9214	pptr = parsed_skip(pptr, PSKIP_CLASS);
9215	if (pptr == NULL) goto PARSED_SKIP_FAILED;
9216	break;
9217
9218	case META_CLASS_EMPTY_NOT:
9219	case META_DOT:
9220	itemlength = `1`;
9221	break;
9222
9223	case META_CALLOUT_NUMBER:
9224	pptr += `3`;
9225	break;
9226
9227	case META_CALLOUT_STRING:
9228	pptr += `3` + SIZEOFFSET;
9229	break;
9230
9231	/ Only some escapes consume a character. Of those, \R and \X are never*
9232	allowed because they might match more than character. \C is allowed only in
9233	32-bit and non-UTF 8/16-bit modes. /*
9234
9235	case META_ESCAPE:
9236	escape = META_DATA(*pptr);
9237	if (escape == ESC_R \|\| escape == ESC_X) return -`1`;
9238	if (escape > ESC_b && escape < ESC_Z)
9239	{
9240	#if PCRE2_CODE_UNIT_WIDTH != 32
9241	if ((cb->external_options & PCRE2_UTF) != `0` && escape == ESC_C)
9242	{
9243	*errcodeptr = ERR36;
9244	return -`1`;
9245	}
9246	#endif
9247	itemlength = `1`;
9248	if (escape == ESC_p \|\| escape == ESC_P) pptr++; / Skip prop data /
9249	}
9250	break;
9251
9252	/ Lookaheads do not contribute to the length of this branch, but they may*
9253	contain lookbehinds within them whose lengths need to be set. /*
9254
9255	case META_LOOKAHEAD:
9256	case META_LOOKAHEADNOT:
9257	case META_LOOKAHEAD_NA:
9258	*errcodeptr = check_lookbehinds(pptr + `1`, &pptr, recurses, cb, lcptr);
9259	if (errcodeptr != `0`) return* -`1`;
9260
9261	/ Ignore any qualifiers that follow a lookahead assertion. /
9262
9263	switch (pptr[`1`])
9264	{
9265	case META_ASTERISK:
9266	case META_ASTERISK_PLUS:
9267	case META_ASTERISK_QUERY:
9268	case META_PLUS:
9269	case META_PLUS_PLUS:
9270	case META_PLUS_QUERY:
9271	case META_QUERY:
9272	case META_QUERY_PLUS:
9273	case META_QUERY_QUERY:
9274	pptr++;
9275	break;
9276
9277	case META_MINMAX:
9278	case META_MINMAX_PLUS:
9279	case META_MINMAX_QUERY:
9280	pptr += `3`;
9281	break;
9282
9283	default:
9284	break;
9285	}
9286	break;
9287
9288	/ A nested lookbehind does not contribute any length to this lookbehind,*
9289	but must itself be checked and have its lengths set. /*
9290
9291	case META_LOOKBEHIND:
9292	case META_LOOKBEHINDNOT:
9293	case META_LOOKBEHIND_NA:
9294	if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9295	return -`1`;
9296	break;
9297
9298	/ Back references and recursions are handled by very similar code. At this*
9299	stage, the names generated in the parsing pass are available, but the main
9300	name table has not yet been created. So for the named varieties, scan the
9301	list of names in order to get the number of the first one in the pattern,
9302	and whether or not this name is duplicated. /*
9303
9304	case META_BACKREF_BYNAME:
9305	if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != `0`)
9306	goto ISNOTFIXED;
9307	/ Fall through /
9308
9309	case META_RECURSE_BYNAME:
9310	{
9311	int i;
9312	PCRE2_SPTR name;
9313	BOOL is_dupname = FALSE;
9314	named_group *ng = cb->named_groups;
9315	uint32_t meta_code = META_CODE(*pptr);
9316	uint32_t length = *(++pptr);
9317
9318	GETPLUSOFFSET(offset, pptr);
9319	name = cb->start_pattern + offset;
9320	for (i = `0`; i < cb->names_found; i++, ng++)
9321	{
9322	if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == `0`)
9323	{
9324	group = ng->number;
9325	is_dupname = ng->isdup;
9326	break;
9327	}
9328	}
9329
9330	if (group == `0`)
9331	{
9332	errcodeptr = ERR15; /* Non-existent subpattern /
9333	cb->erroroffset = offset;
9334	return -`1`;
9335	}
9336
9337	/ A numerical back reference can be fixed length if duplicate capturing*
9338	groups are not being used. A non-duplicate named back reference can also
9339	be handled. /*
9340
9341	if (meta_code == META_RECURSE_BYNAME \|\|
9342	(!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == `0`))
9343	goto RECURSE_OR_BACKREF_LENGTH; / Handle as a numbered version. /
9344	}
9345	goto ISNOTFIXED; / Duplicate name or number /
9346
9347	/ The offset values for back references < 10 are in a separate vector*
9348	because otherwise they would use more than two parsed pattern elements on
9349	64-bit systems. /*
9350
9351	case META_BACKREF:
9352	if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != `0` \|\|
9353	(cb->external_flags & PCRE2_DUPCAPUSED) != `0`)
9354	goto ISNOTFIXED;
9355	group = META_DATA(*pptr);
9356	if (group < `10`)
9357	{
9358	offset = cb->small_ref_offset[group];
9359	goto RECURSE_OR_BACKREF_LENGTH;
9360	}
9361
9362	/ Fall through /
9363	/ For groups >= 10 - picking up group twice does no harm. /
9364
9365	/ A true recursion implies not fixed length, but a subroutine call may*
9366	be OK. Back reference "recursions" are also failed. /*
9367
9368	case META_RECURSE:
9369	group = META_DATA(*pptr);
9370	GETPLUSOFFSET(offset, pptr);
9371
9372	RECURSE_OR_BACKREF_LENGTH:
9373	if (group > cb->bracount)
9374	{
9375	cb->erroroffset = offset;
9376	errcodeptr = ERR15; /* Non-existent subpattern /
9377	return -`1`;
9378	}
9379	if (group == `0`) goto ISNOTFIXED; / Local recursion /
9380	for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9381	{
9382	if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9383	else if (gptr == (META_CAPTURE \| group)) break*;
9384	}
9385
9386	/ We must start the search for the end of the group at the first meta code*
9387	inside the group. Otherwise it will be treated as an enclosed group. /*
9388
9389	gptrend = parsed_skip(gptr + `1`, PSKIP_KET);
9390	if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9391	if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED; / Local recursion /
9392	for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9393	if (r != NULL) goto ISNOTFIXED; / Mutual recursion /
9394	this_recurse.prev = recurses;
9395	this_recurse.groupptr = gptr;
9396
9397	/ We do not need to know the position of the end of the group, that is,*
9398	gptr is not used after the call to get_grouplength(). Setting the second
9399	argument FALSE stops it scanning for the end when the length can be found
9400	in the cache. /*
9401
9402	gptr++;
9403	grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group,
9404	&this_recurse, cb);
9405	if (grouplength < `0`)
9406	{
9407	if (errcodeptr == `0`) goto* ISNOTFIXED;
9408	return -`1`; / Error already set /
9409	}
9410	itemlength = grouplength;
9411	break;
9412
9413	/ A (DEFINE) group is never obeyed inline and so it does not contribute to*
9414	the length of this branch. Skip from the following item to the next
9415	unpaired ket. /*
9416
9417	case META_COND_DEFINE:
9418	pptr = parsed_skip(pptr + `1`, PSKIP_KET);
9419	break;
9420
9421	/ Check other nested groups - advance past the initial data for each type*
9422	and then seek a fixed length with get_grouplength(). /*
9423
9424	case META_COND_NAME:
9425	case META_COND_NUMBER:
9426	case META_COND_RNAME:
9427	case META_COND_RNUMBER:
9428	pptr += `2` + SIZEOFFSET;
9429	goto CHECK_GROUP;
9430
9431	case META_COND_ASSERT:
9432	pptr += `1`;
9433	goto CHECK_GROUP;
9434
9435	case META_COND_VERSION:
9436	pptr += `4`;
9437	goto CHECK_GROUP;
9438
9439	case META_CAPTURE:
9440	group = META_DATA(*pptr);
9441	/ Fall through /
9442
9443	case META_ATOMIC:
9444	case META_NOCAPTURE:
9445	case META_SCRIPT_RUN:
9446	pptr++;
9447	CHECK_GROUP:
9448	grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group,
9449	recurses, cb);
9450	if (grouplength < `0`) return -`1`;
9451	itemlength = grouplength;
9452	break;
9453
9454	/ Exact repetition is OK; variable repetition is not. A repetition of zero*
9455	must subtract the length that has already been added. /*
9456
9457	case META_MINMAX:
9458	case META_MINMAX_PLUS:
9459	case META_MINMAX_QUERY:
9460	if (pptr[`1`] == pptr[`2`])
9461	{
9462	switch(pptr[`1`])
9463	{
9464	case `0`:
9465	branchlength -= lastitemlength;
9466	break;
9467
9468	case `1`:
9469	itemlength = `0`;
9470	break;
9471
9472	default: / Check for integer overflow /
9473	if (lastitemlength != `0` && / Should not occur, but just in case /
9474	INT_MAX/lastitemlength < pptr[`1`] - `1`)
9475	{
9476	errcodeptr = ERR87; /* Integer overflow; lookbehind too big /
9477	return -`1`;
9478	}
9479	itemlength = (pptr[`1`] - `1`) * lastitemlength;
9480	break;
9481	}
9482	pptr += `2`;
9483	break;
9484	}
9485	/ Fall through /
9486
9487	/ Any other item means this branch does not have a fixed length. /
9488
9489	default:
9490	ISNOTFIXED:
9491	errcodeptr = ERR25; /* Not fixed length /
9492	return -`1`;
9493	}
9494
9495	/ Add the item length to the branchlength, checking for integer overflow and*
9496	for the branch length exceeding the limit. /*
9497
9498	if (INT_MAX - branchlength < (int)itemlength \|\|
9499	(branchlength += itemlength) > LOOKBEHIND_MAX)
9500	{
9501	*errcodeptr = ERR87;
9502	return -`1`;
9503	}
9504
9505	/ Save this item length for use if the next item is a quantifier. /
9506
9507	lastitemlength = itemlength;
9508	}
9509
9510	EXIT:
9511	*pptrptr = pptr;
9512	return branchlength;
9513
9514	PARSED_SKIP_FAILED:
9515	*errcodeptr = ERR90;
9516	return -`1`;
9517	}
9518
9519
9520
9521	/*************************************************
9522	* Set lengths in a lookbehind *
9523	*************************************************/
9524
9525	/ This function is called for each lookbehind, to set the lengths in its*
9526	branches. An error occurs if any branch does not have a fixed length that is
9527	less than the maximum (65535). On exit, the pointer must be left on the final
9528	ket.
9529
9530	The function also maintains the max_lookbehind value. Any lookbehind branch
9531	that contains a nested lookbehind may actually look further back than the
9532	length of the branch. The additional amount is passed back from
9533	get_branchlength() as an "extra" value.
9534
9535	Arguments:
9536	pptrptr pointer to pointer in the parsed pattern
9537	errcodeptr pointer to error code
9538	lcptr pointer to loop counter
9539	recurses chain of recurse_check to catch mutual recursion
9540	cb pointer to compile block
9541
9542	Returns: TRUE if all is well
9543	FALSE otherwise, with error code and offset set
9544	*/
9545
9546	static BOOL
9547	set_lookbehind_lengths(uint32_t *pptrptr, int* errcodeptr, int* *lcptr,
9548	parsed_recurse_check recurses, compile_block cb)
9549	{
9550	PCRE2_SIZE offset;
9551	int branchlength;
9552	uint32_t bptr = pptrptr;
9553
9554	READPLUSOFFSET(offset, bptr); / Offset for error messages /
9555	*pptrptr += SIZEOFFSET;
9556
9557	do
9558	{
9559	*pptrptr += `1`;
9560	branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
9561	if (branchlength < `0`)
9562	{
9563	/ The errorcode and offset may already be set from a nested lookbehind. /
9564	if (errcodeptr == `0`) errcodeptr = ERR25;
9565	if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
9566	return FALSE;
9567	}
9568	if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
9569	bptr \|= branchlength; /* branchlength never more than 65535 /
9570	bptr = *pptrptr;
9571	}
9572	while (*bptr == META_ALT);
9573
9574	return TRUE;
9575	}
9576
9577
9578
9579	/*************************************************
9580	* Check parsed pattern lookbehinds *
9581	*************************************************/
9582
9583	/ This function is called at the end of parsing a pattern if any lookbehinds*
9584	were encountered. It scans the parsed pattern for them, calling
9585	set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
9586	the error offset is marked unset. The enables the functions above not to
9587	override settings from deeper nestings.
9588
9589	This function is called recursively from get_branchlength() for lookaheads in
9590	order to process any lookbehinds that they may contain. It stops when it hits a
9591	non-nested closing parenthesis in this case, returning a pointer to it.
9592
9593	Arguments
9594	pptr points to where to start (start of pattern or start of lookahead)
9595	retptr if not NULL, return the ket pointer here
9596	recurses chain of recurse_check to catch mutual recursion
9597	cb points to the compile block
9598	lcptr points to loop counter
9599
9600	Returns: 0 on success, or an errorcode (cb->erroroffset will be set)
9601	*/
9602
9603	static int
9604	check_lookbehinds(uint32_t pptr, uint32_t *retptr,
9605	parsed_recurse_check recurses, compile_block cb, int *lcptr)
9606	{
9607	int errorcode = `0`;
9608	int nestlevel = `0`;
9609
9610	cb->erroroffset = PCRE2_UNSET;
9611
9612	for (; *pptr != META_END; pptr++)
9613	{
9614	if (pptr < META_END) continue; /* Literal /
9615
9616	switch (META_CODE(*pptr))
9617	{
9618	default:
9619	return ERR70; / Unrecognized meta code /
9620
9621	case META_ESCAPE:
9622	if (pptr - META_ESCAPE == ESC_P \|\| pptr - META_ESCAPE == ESC_p)
9623	pptr += `1`;
9624	break;
9625
9626	case META_KET:
9627	if (--nestlevel < `0`)
9628	{
9629	if (retptr != NULL) *retptr = pptr;
9630	return `0`;
9631	}
9632	break;
9633
9634	case META_ATOMIC:
9635	case META_CAPTURE:
9636	case META_COND_ASSERT:
9637	case META_LOOKAHEAD:
9638	case META_LOOKAHEADNOT:
9639	case META_LOOKAHEAD_NA:
9640	case META_NOCAPTURE:
9641	case META_SCRIPT_RUN:
9642	nestlevel++;
9643	break;
9644
9645	case META_ACCEPT:
9646	case META_ALT:
9647	case META_ASTERISK:
9648	case META_ASTERISK_PLUS:
9649	case META_ASTERISK_QUERY:
9650	case META_BACKREF:
9651	case META_CIRCUMFLEX:
9652	case META_CLASS:
9653	case META_CLASS_EMPTY:
9654	case META_CLASS_EMPTY_NOT:
9655	case META_CLASS_END:
9656	case META_CLASS_NOT:
9657	case META_COMMIT:
9658	case META_DOLLAR:
9659	case META_DOT:
9660	case META_FAIL:
9661	case META_PLUS:
9662	case META_PLUS_PLUS:
9663	case META_PLUS_QUERY:
9664	case META_PRUNE:
9665	case META_QUERY:
9666	case META_QUERY_PLUS:
9667	case META_QUERY_QUERY:
9668	case META_RANGE_ESCAPED:
9669	case META_RANGE_LITERAL:
9670	case META_SKIP:
9671	case META_THEN:
9672	break;
9673
9674	case META_RECURSE:
9675	pptr += SIZEOFFSET;
9676	break;
9677
9678	case META_BACKREF_BYNAME:
9679	case META_RECURSE_BYNAME:
9680	pptr += `1` + SIZEOFFSET;
9681	break;
9682
9683	case META_COND_DEFINE:
9684	pptr += SIZEOFFSET;
9685	nestlevel++;
9686	break;
9687
9688	case META_COND_NAME:
9689	case META_COND_NUMBER:
9690	case META_COND_RNAME:
9691	case META_COND_RNUMBER:
9692	pptr += `1` + SIZEOFFSET;
9693	nestlevel++;
9694	break;
9695
9696	case META_COND_VERSION:
9697	pptr += `3`;
9698	nestlevel++;
9699	break;
9700
9701	case META_CALLOUT_STRING:
9702	pptr += `3` + SIZEOFFSET;
9703	break;
9704
9705	case META_BIGVALUE:
9706	case META_OPTIONS:
9707	case META_POSIX:
9708	case META_POSIX_NEG:
9709	pptr += `1`;
9710	break;
9711
9712	case META_MINMAX:
9713	case META_MINMAX_QUERY:
9714	case META_MINMAX_PLUS:
9715	pptr += `2`;
9716	break;
9717
9718	case META_CALLOUT_NUMBER:
9719	pptr += `3`;
9720	break;
9721
9722	case META_MARK:
9723	case META_COMMIT_ARG:
9724	case META_PRUNE_ARG:
9725	case META_SKIP_ARG:
9726	case META_THEN_ARG:
9727	pptr += `1` + pptr[`1`];
9728	break;
9729
9730	case META_LOOKBEHIND:
9731	case META_LOOKBEHINDNOT:
9732	case META_LOOKBEHIND_NA:
9733	if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
9734	return errorcode;
9735	break;
9736	}
9737	}
9738
9739	return `0`;
9740	}
9741
9742
9743
9744	/*************************************************
9745	* External function to compile a pattern *
9746	*************************************************/
9747
/* This function reads a regular expression in the form of a string and returns
a pointer to a block of store holding a compiled version of the expression.

Arguments:
  pattern       the regular expression
  patlen        the length of the pattern, or PCRE2_ZERO_TERMINATED
  options       option bits
  errorptr      pointer to errorcode
  erroroffset   pointer to error offset
  ccontext      points to a compile context or is NULL

Returns:        pointer to compiled data block, or NULL on error,
                with errorcode and erroroffset set
*/
9762
9763	PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
9764	pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
9765	int errorptr, PCRE2_SIZE erroroffset, pcre2_compile_context *ccontext)
9766	{
9767	BOOL utf; / Set TRUE for UTF mode /
9768	BOOL ucp; / Set TRUE for UCP mode /
9769	BOOL has_lookbehind = FALSE; / Set TRUE if a lookbehind is found /
9770	BOOL zero_terminated; / Set TRUE for zero-terminated pattern /
9771	pcre2_real_code re = NULL; /* What we will return /
9772	compile_block cb; / "Static" compile-time data /
9773	const uint8_t tables; /* Char tables base pointer /
9774
9775	PCRE2_UCHAR code; /* Current pointer in compiled code /
9776	PCRE2_SPTR codestart; / Start of compiled code /
9777	PCRE2_SPTR ptr; / Current pointer in pattern /
9778	uint32_t pptr; /* Current pointer in parsed pattern /
9779
9780	PCRE2_SIZE length = `1`; / Allow for final END opcode /
9781	PCRE2_SIZE usedlength; / Actual length used /
9782	PCRE2_SIZE re_blocksize; / Size of memory block /
9783	PCRE2_SIZE big32count = `0`; / 32-bit literals >= 0x80000000 /
9784	PCRE2_SIZE parsed_size_needed; / Needed for parsed pattern /
9785
9786	uint32_t firstcuflags, reqcuflags; / Type of first/req code unit /
9787	uint32_t firstcu, reqcu; / Value of first/req code unit /
9788	uint32_t setflags = `0`; / NL and BSR set flags /
9789
9790	uint32_t skipatstart; / When checking (UTF) etc /*
9791	uint32_t limit_heap = UINT32_MAX;
9792	uint32_t limit_match = UINT32_MAX; / Unset match limits /
9793	uint32_t limit_depth = UINT32_MAX;
9794
9795	int newline = `0`; / Unset; can be set by the pattern /
9796	int bsr = `0`; / Unset; can be set by the pattern /
9797	int errorcode = `0`; / Initialize to avoid compiler warn /
9798	int regexrc; / Return from compile /
9799
9800	uint32_t i; / Local loop counter /
9801
9802	/ Comments at the head of this file explain about these variables. /
9803
9804	uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
9805	uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
9806	named_group named_groups[NAMED_GROUP_LIST_SIZE];
9807
9808	/ The workspace is used in different ways in the different compiling phases.*
9809	It needs to be 16-bit aligned for the preliminary parsing scan. /*
9810
9811	uint32_t c16workspace[C16_WORK_SIZE];
9812	PCRE2_UCHAR cworkspace = (PCRE2_UCHAR )c16workspace;
9813
9814
9815	/ -------------- Check arguments and set up the pattern ----------------- /
9816
9817	/ There must be error code and offset pointers. /
9818
9819	if (errorptr == NULL \|\| erroroffset == NULL) return NULL;
9820	*errorptr = ERR0;
9821	*erroroffset = `0`;
9822
9823	/ There must be a pattern! /
9824
9825	if (pattern == NULL)
9826	{
9827	*errorptr = ERR16;
9828	return NULL;
9829	}
9830
9831	/ A NULL compile context means "use a default context" /
9832
9833	if (ccontext == NULL)
9834	ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
9835
9836	/ PCRE2_MATCH_INVALID_UTF implies UTF /
9837
9838	if ((options & PCRE2_MATCH_INVALID_UTF) != `0`) options \|= PCRE2_UTF;
9839
9840	/ Check that all undefined public option bits are zero. /
9841
9842	if ((options & ~PUBLIC_COMPILE_OPTIONS) != `0` \|\|
9843	(ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != `0`)
9844	{
9845	*errorptr = ERR17;
9846	return NULL;
9847	}
9848
9849	if ((options & PCRE2_LITERAL) != `0` &&
9850	((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != `0` \|\|
9851	(ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != `0`))
9852	{
9853	*errorptr = ERR92;
9854	return NULL;
9855	}
9856
9857	/ A zero-terminated pattern is indicated by the special length value*
9858	PCRE2_ZERO_TERMINATED. Check for an overlong pattern. /*
9859
9860	if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
9861	patlen = PRIV(strlen)(pattern);
9862
9863	if (patlen > ccontext->max_pattern_length)
9864	{
9865	*errorptr = ERR88;
9866	return NULL;
9867	}
9868
9869	/ From here on, all returns from this function should end up going via the*
9870	EXIT label. /*
9871
9872
9873	/ ------------ Initialize the "static" compile data -------------- /
9874
9875	tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
9876
9877	cb.lcc = tables + lcc_offset; / Individual /
9878	cb.fcc = tables + fcc_offset; / character /
9879	cb.cbits = tables + cbits_offset; / tables /
9880	cb.ctypes = tables + ctypes_offset;
9881
9882	cb.assert_depth = `0`;
9883	cb.bracount = `0`;
9884	cb.cx = ccontext;
9885	cb.dupnames = FALSE;
9886	cb.end_pattern = pattern + patlen;
9887	cb.erroroffset = `0`;
9888	cb.external_flags = `0`;
9889	cb.external_options = options;
9890	cb.groupinfo = stack_groupinfo;
9891	cb.had_recurse = FALSE;
9892	cb.lastcapture = `0`;
9893	cb.max_lookbehind = `0`;
9894	cb.name_entry_size = `0`;
9895	cb.name_table = NULL;
9896	cb.named_groups = named_groups;
9897	cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
9898	cb.names_found = `0`;
9899	cb.open_caps = NULL;
9900	cb.parens_depth = `0`;
9901	cb.parsed_pattern = stack_parsed_pattern;
9902	cb.req_varyopt = `0`;
9903	cb.start_code = cworkspace;
9904	cb.start_pattern = pattern;
9905	cb.start_workspace = cworkspace;
9906	cb.workspace_size = COMPILE_WORK_SIZE;
9907
9908	/ Maximum back reference and backref bitmap. The bitmap records up to 31 back*
9909	references to help in deciding whether (.) can be treated as anchored or not.*
9910	*/
9911
9912	cb.top_backref = `0`;
9913	cb.backref_map = `0`;
9914
9915	/ Escape sequences \1 to \9 are always back references, but as they are only*
9916	two characters long, only two elements can be used in the parsed_pattern
9917	vector. The first contains the reference, and we'd like to use the second to
9918	record the offset in the pattern, so that forward references to non-existent
9919	groups can be diagnosed later with an offset. However, on 64-bit systems,
9920	PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
9921	occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
9922	references have enough space for the offset to be put into the parsed pattern.
9923	*/
9924
9925	for (i = `0`; i < `10`; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
9926
9927
9928	/ --------------- Start looking at the pattern --------------- /
9929
9930	/ Unless PCRE2_LITERAL is set, check for global one-time option settings at*
9931	the start of the pattern, and remember the offset to the actual regex. With
9932	valgrind support, make the terminator of a zero-terminated pattern
9933	inaccessible. This catches bugs that would otherwise only show up for
9934	non-zero-terminated patterns. /*
9935
9936	#ifdef SUPPORT_VALGRIND
9937	if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(`1`));
9938	#endif
9939
9940	ptr = pattern;
9941	skipatstart = `0`;
9942
9943	if ((options & PCRE2_LITERAL) == `0`)
9944	{
9945	while (patlen - skipatstart >= `2` &&
9946	ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9947	ptr[skipatstart+`1`] == CHAR_ASTERISK)
9948	{
9949	for (i = `0`; i < sizeof(pso_list)/sizeof(pso); i++)
9950	{
9951	uint32_t c, pp;
9952	pso *p = pso_list + i;
9953
9954	if (patlen - skipatstart - `2` >= p->length &&
9955	PRIV(strncmp_c8)(ptr + skipatstart + `2`, (char *)(p->name),
9956	p->length) == `0`)
9957	{
9958	skipatstart += p->length + `2`;
9959	switch(p->type)
9960	{
9961	case PSO_OPT:
9962	cb.external_options \|= p->value;
9963	break;
9964
9965	case PSO_FLG:
9966	setflags \|= p->value;
9967	break;
9968
9969	case PSO_NL:
9970	newline = p->value;
9971	setflags \|= PCRE2_NL_SET;
9972	break;
9973
9974	case PSO_BSR:
9975	bsr = p->value;
9976	setflags \|= PCRE2_BSR_SET;
9977	break;
9978
9979	case PSO_LIMM:
9980	case PSO_LIMD:
9981	case PSO_LIMH:
9982	c = `0`;
9983	pp = skipatstart;
9984	if (!IS_DIGIT(ptr[pp]))
9985	{
9986	errorcode = ERR60;
9987	ptr += pp;
9988	goto HAD_EARLY_ERROR;
9989	}
9990	while (IS_DIGIT(ptr[pp]))
9991	{
9992	if (c > UINT32_MAX / `10` - `1`) break; / Integer overflow /
9993	c = c*`10` + (ptr[pp++] - CHAR_0);
9994	}
9995	if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
9996	{
9997	errorcode = ERR60;
9998	ptr += pp;
9999	goto HAD_EARLY_ERROR;
10000	}
10001	if (p->type == PSO_LIMH) limit_heap = c;
10002	else if (p->type == PSO_LIMM) limit_match = c;
10003	else limit_depth = c;
10004	skipatstart += pp - skipatstart;
10005	break;
10006	}
10007	break; / Out of the table scan loop /
10008	}
10009	}
10010	if (i >= sizeof(pso_list)/sizeof(pso)) break; / Out of pso loop /
10011	}
10012	}
10013
10014	/ End of pattern-start options; advance to start of real regex. /
10015
10016	ptr += skipatstart;
10017
10018	/ Can't support UTF or UCP if PCRE2 was built without Unicode support. /
10019
10020	#ifndef SUPPORT_UNICODE
10021	if ((cb.external_options & (PCRE2_UTF\|PCRE2_UCP)) != `0`)
10022	{
10023	errorcode = ERR32;
10024	goto HAD_EARLY_ERROR;
10025	}
10026	#endif
10027
10028	/ Check UTF. We have the original options in 'options', with that value as*
10029	modified by (UTF) etc in cb->external_options. The extra option*
10030	PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
10031	surrogate code points cannot be represented in UTF-16. /*
10032
10033	utf = (cb.external_options & PCRE2_UTF) != `0`;
10034	if (utf)
10035	{
10036	if ((options & PCRE2_NEVER_UTF) != `0`)
10037	{
10038	errorcode = ERR74;
10039	goto HAD_EARLY_ERROR;
10040	}
10041	if ((options & PCRE2_NO_UTF_CHECK) == `0` &&
10042	(errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != `0`)
10043	goto HAD_ERROR; / Offset was set by valid_utf() /
10044
10045	#if PCRE2_CODE_UNIT_WIDTH == 16
10046	if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != `0`)
10047	{
10048	errorcode = ERR91;
10049	goto HAD_EARLY_ERROR;
10050	}
10051	#endif
10052	}
10053
10054	/ Check UCP lockout. /
10055
10056	ucp = (cb.external_options & PCRE2_UCP) != `0`;
10057	if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != `0`)
10058	{
10059	errorcode = ERR75;
10060	goto HAD_EARLY_ERROR;
10061	}
10062
10063	/ Process the BSR setting. /
10064
10065	if (bsr == `0`) bsr = ccontext->bsr_convention;
10066
10067	/ Process the newline setting. /
10068
10069	if (newline == `0`) newline = ccontext->newline_convention;
10070	cb.nltype = NLTYPE_FIXED;
10071	switch(newline)
10072	{
10073	case PCRE2_NEWLINE_CR:
10074	cb.nllen = `1`;
10075	cb.nl[`0`] = CHAR_CR;
10076	break;
10077
10078	case PCRE2_NEWLINE_LF:
10079	cb.nllen = `1`;
10080	cb.nl[`0`] = CHAR_NL;
10081	break;
10082
10083	case PCRE2_NEWLINE_NUL:
10084	cb.nllen = `1`;
10085	cb.nl[`0`] = CHAR_NUL;
10086	break;
10087
10088	case PCRE2_NEWLINE_CRLF:
10089	cb.nllen = `2`;
10090	cb.nl[`0`] = CHAR_CR;
10091	cb.nl[`1`] = CHAR_NL;
10092	break;
10093
10094	case PCRE2_NEWLINE_ANY:
10095	cb.nltype = NLTYPE_ANY;
10096	break;
10097
10098	case PCRE2_NEWLINE_ANYCRLF:
10099	cb.nltype = NLTYPE_ANYCRLF;
10100	break;
10101
10102	default:
10103	errorcode = ERR56;
10104	goto HAD_EARLY_ERROR;
10105	}
10106
10107	/ Pre-scan the pattern to do two things: (1) Discover the named groups and*
10108	their numerical equivalents, so that this information is always available for
10109	the remaining processing. (2) At the same time, parse the pattern and put a
10110	processed version into the parsed_pattern vector. This has escapes interpreted
10111	and comments removed (amongst other things).
10112
10113	In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned
10114	32-bit ints in the parsed pattern is bounded by the length of the pattern plus
10115	one (for the terminator) plus four if PCRE2_EXTRA_WORD or PCRE2_EXTRA_LINE is
10116	set. The exceptional case is when running in 32-bit, non-UTF mode, when literal
10117	characters greater than META_END (0x80000000) have to be coded as two units. In
10118	this case, therefore, we scan the pattern to check for such values. /*
10119
10120	#if PCRE2_CODE_UNIT_WIDTH == 32
10121	if (!utf)
10122	{
10123	PCRE2_SPTR p;
10124	for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++;
10125	}
10126	#endif
10127
10128	/ Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT*
10129	is set we have to assume a numerical callout (4 elements) for each character
10130	plus one at the end. This is overkill, but memory is plentiful these days. For
10131	many smaller patterns the vector on the stack (which was set up above) can be
10132	used. /*
10133
10134	parsed_size_needed = patlen - skipatstart + big32count;
10135
10136	if ((ccontext->extra_options &
10137	(PCRE2_EXTRA_MATCH_WORD\|PCRE2_EXTRA_MATCH_LINE)) != `0`)
10138	parsed_size_needed += `4`;
10139
10140	if ((options & PCRE2_AUTO_CALLOUT) != `0`)
10141	parsed_size_needed = (parsed_size_needed + `1`) * `5`;
10142
10143	if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE)
10144	{
10145	uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10146	(parsed_size_needed + `1`) * sizeof(uint32_t), ccontext->memctl.memory_data);
10147	if (heap_parsed_pattern == NULL)
10148	{
10149	*errorptr = ERR21;
10150	goto EXIT;
10151	}
10152	cb.parsed_pattern = heap_parsed_pattern;
10153	}
10154	cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + `1`;
10155
10156	/ Do the parsing scan. /
10157
10158	errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb);
10159	if (errorcode != `0`) goto HAD_CB_ERROR;
10160
10161	/ Workspace is needed to remember information about numbered groups: whether a*
10162	group can match an empty string and what its fixed length is. This is done to
10163	avoid the possibility of recursive references causing very long compile times
10164	when checking these features. Unnumbered groups do not have this exposure since
10165	they cannot be referenced. We use an indexed vector for this purpose. If there
10166	are sufficiently few groups, the default vector on the stack, as set up above,
10167	can be used. Otherwise we have to get/free a special vector. The vector must be
10168	initialized to zero. /*
10169
10170	if (cb.bracount >= GROUPINFO_DEFAULT_SIZE)
10171	{
10172	cb.groupinfo = ccontext->memctl.malloc(
10173	(cb.bracount + `1`)*sizeof(uint32_t), ccontext->memctl.memory_data);
10174	if (cb.groupinfo == NULL)
10175	{
10176	errorcode = ERR21;
10177	cb.erroroffset = `0`;
10178	goto HAD_CB_ERROR;
10179	}
10180	}
10181	memset(cb.groupinfo, `0`, (cb.bracount + `1`) * sizeof(uint32_t));
10182
10183	/ If there were any lookbehinds, scan the parsed pattern to figure out their*
10184	lengths. /*
10185
10186	if (has_lookbehind)
10187	{
10188	int loopcount = `0`;
10189	errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
10190	if (errorcode != `0`) goto HAD_CB_ERROR;
10191	}
10192
10193	/ For debugging, there is a function that shows the parsed data vector. /
10194
10195	#ifdef DEBUG_SHOW_PARSED
10196	fprintf(stderr, "+++ Pre-scan complete:\n");
10197	show_parsed(&cb);
10198	#endif
10199
10200	/ For debugging capturing information this code can be enabled. /
10201
10202	#ifdef DEBUG_SHOW_CAPTURES
10203	{
10204	named_group *ng = cb.named_groups;
10205	fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10206	for (i = `0`; i < cb.names_found; i++, ng++)
10207	{
10208	fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10209	}
10210	}
10211	#endif
10212
10213	/ Pretend to compile the pattern while actually just accumulating the amount*
10214	of memory required in the 'length' variable. This behaviour is triggered by
10215	passing a non-NULL final argument to compile_regex(). We pass a block of
10216	workspace (cworkspace) for it to compile parts of the pattern into; the
10217	compiled code is discarded when it is no longer needed, so hopefully this
10218	workspace will never overflow, though there is a test for its doing so.
10219
10220	On error, errorcode will be set non-zero, so we don't need to look at the
10221	result of the function. The initial options have been put into the cb block,
10222	but we still have to pass a separate options variable (the first argument)
10223	because the options may change as the pattern is processed. /*
10224
10225	cb.erroroffset = patlen; / For any subsequent errors that do not set it /
10226	pptr = cb.parsed_pattern;
10227	code = cworkspace;
10228	*code = OP_BRA;
10229
10230	(void)compile_regex(cb.external_options, &code, &pptr, &errorcode, `0`, &firstcu,
10231	&firstcuflags, &reqcu, &reqcuflags, NULL, &cb, &length);
10232
10233	if (errorcode != `0`) goto HAD_CB_ERROR; / Offset is in cb.erroroffset /
10234
10235	/ This should be caught in compile_regex(), but just in case... /
10236
10237	if (length > MAX_PATTERN_SIZE)
10238	{
10239	errorcode = ERR20;
10240	goto HAD_CB_ERROR;
10241	}
10242
10243	/ Compute the size of, and then get and initialize, the data block for storing*
10244	the compiled pattern and names table. Integer overflow should no longer be
10245	possible because nowadays we limit the maximum value of cb.names_found and
10246	cb.name_entry_size. /*
10247
10248	re_blocksize = sizeof(pcre2_real_code) +
10249	CU2BYTES(length +
10250	(PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10251	re = (pcre2_real_code *)
10252	ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10253	if (re == NULL)
10254	{
10255	errorcode = ERR21;
10256	goto HAD_CB_ERROR;
10257	}
10258
10259	/ The compiler may put padding at the end of the pcre2_real_code structure in*
10260	order to round it up to a multiple of 4 or 8 bytes. This means that when a
10261	compiled pattern is copied (for example, when serialized) undefined bytes are
10262	read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10263	write to the last 8 bytes of the structure before setting the fields. /*
10264
10265	memset((char )re + sizeof*(pcre2_real_code) - `8`, `0`, `8`);
10266	re->memctl = ccontext->memctl;
10267	re->tables = tables;
10268	re->executable_jit = NULL;
10269	memset(re->start_bitmap, `0`, `32` * sizeof(uint8_t));
10270	re->blocksize = re_blocksize;
10271	re->magic_number = MAGIC_NUMBER;
10272	re->compile_options = options;
10273	re->overall_options = cb.external_options;
10274	re->extra_options = ccontext->extra_options;
10275	re->flags = PCRE2_CODE_UNIT_WIDTH/`8` \| cb.external_flags \| setflags;
10276	re->limit_heap = limit_heap;
10277	re->limit_match = limit_match;
10278	re->limit_depth = limit_depth;
10279	re->first_codeunit = `0`;
10280	re->last_codeunit = `0`;
10281	re->bsr_convention = bsr;
10282	re->newline_convention = newline;
10283	re->max_lookbehind = `0`;
10284	re->minlength = `0`;
10285	re->top_bracket = `0`;
10286	re->top_backref = `0`;
10287	re->name_entry_size = cb.name_entry_size;
10288	re->name_count = cb.names_found;
10289
10290	/ The basic block is immediately followed by the name table, and the compiled*
10291	code follows after that. /*
10292
10293	codestart = (PCRE2_SPTR)((uint8_t )re + sizeof*(pcre2_real_code)) +
10294	re->name_entry_size * re->name_count;
10295
10296	/ Update the compile data block for the actual compile. The starting points of*
10297	the name/number translation table and of the code are passed around in the
10298	compile data block. The start/end pattern and initial options are already set
10299	from the pre-compile phase, as is the name_entry_size field. /*
10300
10301	cb.parens_depth = `0`;
10302	cb.assert_depth = `0`;
10303	cb.lastcapture = `0`;
10304	cb.name_table = (PCRE2_UCHAR )((uint8_t )re + sizeof(pcre2_real_code));
10305	cb.start_code = codestart;
10306	cb.req_varyopt = `0`;
10307	cb.had_accept = FALSE;
10308	cb.had_pruneorskip = FALSE;
10309	cb.open_caps = NULL;
10310
10311	/ If any named groups were found, create the name/number table from the list*
10312	created in the pre-pass. /*
10313
10314	if (cb.names_found > `0`)
10315	{
10316	named_group *ng = cb.named_groups;
10317	for (i = `0`; i < cb.names_found; i++, ng++)
10318	add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
10319	}
10320
10321	/ Set up a starting, non-extracting bracket, then compile the expression. On*
10322	error, errorcode will be set non-zero, so we don't need to look at the result
10323	of the function here. /*
10324
10325	pptr = cb.parsed_pattern;
10326	code = (PCRE2_UCHAR *)codestart;
10327	*code = OP_BRA;
10328	regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, `0`,
10329	&firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
10330	if (regexrc < `0`) re->flags \|= PCRE2_MATCH_EMPTY;
10331	re->top_bracket = cb.bracount;
10332	re->top_backref = cb.top_backref;
10333	re->max_lookbehind = cb.max_lookbehind;
10334
10335	if (cb.had_accept)
10336	{
10337	reqcu = `0`; / Must disable after (ACCEPT) /*
10338	reqcuflags = REQ_NONE;
10339	re->flags \|= PCRE2_HASACCEPT; / Disables minimum length /
10340	}
10341
10342	/ Fill in the final opcode and check for disastrous overflow. If no overflow,*
10343	but the estimated length exceeds the really used length, adjust the value of
10344	re->blocksize, and if valgrind support is configured, mark the extra allocated
10345	memory as unaddressable, so that any out-of-bound reads can be detected. /*
10346
10347	*code++ = OP_END;
10348	usedlength = code - codestart;
10349	if (usedlength > length) errorcode = ERR23; else
10350	{
10351	re->blocksize -= CU2BYTES(length - usedlength);
10352	#ifdef SUPPORT_VALGRIND
10353	VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10354	#endif
10355	}
10356
10357	/ Scan the pattern for recursion/subroutine calls and convert the group*
10358	numbers into offsets. Maintain a small cache so that repeated groups containing
10359	recursions are efficiently handled. /*
10360
10361	#define RSCAN_CACHE_SIZE 8
10362
10363	if (errorcode == `0` && cb.had_recurse)
10364	{
10365	PCRE2_UCHAR *rcode;
10366	PCRE2_SPTR rgroup;
10367	unsigned int ccount = `0`;
10368	int start = RSCAN_CACHE_SIZE;
10369	recurse_cache rc[RSCAN_CACHE_SIZE];
10370
10371	for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
10372	rcode != NULL;
10373	rcode = (PCRE2_UCHAR *)find_recurse(rcode + `1` + LINK_SIZE, utf))
10374	{
10375	int p, groupnumber;
10376
10377	groupnumber = (int)GET(rcode, `1`);
10378	if (groupnumber == `0`) rgroup = codestart; else
10379	{
10380	PCRE2_SPTR search_from = codestart;
10381	rgroup = NULL;
10382	for (i = `0`, p = start; i < ccount; i++, p = (p + `1`) & `7`)
10383	{
10384	if (groupnumber == rc[p].groupnumber)
10385	{
10386	rgroup = rc[p].group;
10387	break;
10388	}
10389
10390	/ Group n+1 must always start to the right of group n, so we can save*
10391	search time below when the new group number is greater than any of the
10392	previously found groups. /*
10393
10394	if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
10395	}
10396
10397	if (rgroup == NULL)
10398	{
10399	rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
10400	if (rgroup == NULL)
10401	{
10402	errorcode = ERR53;
10403	break;
10404	}
10405	if (--start < `0`) start = RSCAN_CACHE_SIZE - `1`;
10406	rc[start].groupnumber = groupnumber;
10407	rc[start].group = rgroup;
10408	if (ccount < RSCAN_CACHE_SIZE) ccount++;
10409	}
10410	}
10411
10412	PUT(rcode, `1`, rgroup - codestart);
10413	}
10414	}
10415
10416	/ In rare debugging situations we sometimes need to look at the compiled code*
10417	at this stage. /*
10418
10419	#ifdef DEBUG_CALL_PRINTINT
10420	pcre2_printint(re, stderr, TRUE);
10421	fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
10422	#endif
10423
10424	/ Unless disabled, check whether any single character iterators can be*
10425	auto-possessified. The function overwrites the appropriate opcode values, so
10426	the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
10427	used in this code because at least one compiler gives a warning about loss of
10428	"const" attribute if the cast (PCRE2_UCHAR )codestart is used directly in the*
10429	function call. /*
10430
10431	if (errorcode == `0` && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == `0`)
10432	{
10433	PCRE2_UCHAR temp = (PCRE2_UCHAR )codestart;
10434	if (PRIV(auto_possessify)(temp, &cb) != `0`) errorcode = ERR80;
10435	}
10436
10437	/ Failed to compile, or error while post-processing. /
10438
10439	if (errorcode != `0`) goto HAD_CB_ERROR;
10440
10441	/ Successful compile. If the anchored option was not passed, set it if*
10442	we can determine that the pattern is anchored by virtue of ^ characters or \A
10443	or anything else, such as starting with non-atomic . when DOTALL is set and*
10444	there are no occurrences of PRUNE or SKIP (though there is an option to
10445	disable this case). /*
10446
10447	if ((re->overall_options & PCRE2_ANCHORED) == `0` &&
10448	is_anchored(codestart, `0`, &cb, `0`, FALSE))
10449	re->overall_options \|= PCRE2_ANCHORED;
10450
10451	/ Set up the first code unit or startline flag, the required code unit, and*
10452	then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
10453	is set, as the data it would create will not be used. Note that a first code
10454	unit (but not the startline flag) is useful for anchored patterns because it
10455	can still give a quick "no match" and also avoid searching for a last code
10456	unit. /*
10457
10458	if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == `0`)
10459	{
10460	int minminlength = `0`; / For minimal minlength from first/required CU /
10461
10462	/ If we do not have a first code unit, see if there is one that is asserted*
10463	(these are not saved during the compile because they can cause conflicts with
10464	actual literals that follow). /*
10465
10466	if (firstcuflags >= REQ_NONE)
10467	firstcu = find_firstassertedcu(codestart, &firstcuflags, `0`);
10468
10469	/ Save the data for a first code unit. The existence of one means the*
10470	minimum length must be at least 1. /*
10471
10472	if (firstcuflags < REQ_NONE)
10473	{
10474	re->first_codeunit = firstcu;
10475	re->flags \|= PCRE2_FIRSTSET;
10476	minminlength++;
10477
10478	/ Handle caseless first code units. /
10479
10480	if ((firstcuflags & REQ_CASELESS) != `0`)
10481	{
10482	if (firstcu < `128` \|\| (!utf && !ucp && firstcu < `255`))
10483	{
10484	if (cb.fcc[firstcu] != firstcu) re->flags \|= PCRE2_FIRSTCASELESS;
10485	}
10486
10487	/ The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise.*
10488	In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
10489	points and cannot have another case, but if UCP is set they may do. /*
10490
10491	#ifdef SUPPORT_UNICODE
10492	#if PCRE2_CODE_UNIT_WIDTH == 8
10493	else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
10494	re->flags \|= PCRE2_FIRSTCASELESS;
10495	#else
10496	else if ((utf \|\| ucp) && firstcu <= MAX_UTF_CODE_POINT &&
10497	UCD_OTHERCASE(firstcu) != firstcu)
10498	re->flags \|= PCRE2_FIRSTCASELESS;
10499	#endif
10500	#endif /* SUPPORT_UNICODE */
10501	}
10502	}
10503
10504	/* When there is no first code unit, for non-anchored patterns, see if we can
10505	set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
10506	branches start with ^ and also when all branches start with non-atomic .* for
10507	non-DOTALL matches when *PRUNE and *SKIP are not present. (There is an option
10508	that disables this case.) */
10509
10510	else if ((re->overall_options & PCRE2_ANCHORED) == `0` &&
10511	is_startline(codestart, `0`, &cb, `0`, FALSE))
10512	re->flags \|= PCRE2_STARTLINE;
10513
10514	/* Handle the "required code unit", if one is set. In the UTF case we can
10515	increment the minimum minimum length only if we are sure this really is a
10516	different character and not a non-starting code unit of the first character,
10517	because the minimum length count is in characters, not code units. */
10518
10519	if (reqcuflags < REQ_NONE)
10520	{
10521	#if PCRE2_CODE_UNIT_WIDTH == 16
10522	if ((re->overall_options & PCRE2_UTF) == `0` \|\| / Not UTF /
10523	firstcuflags >= REQ_NONE \|\| / First not set /
10524	(firstcu & `0xf800`) != `0xd800` \|\| / First not surrogate /
10525	(reqcu & `0xfc00`) != `0xdc00`) / Req not low surrogate /
10526	#elif PCRE2_CODE_UNIT_WIDTH == 8
10527	if ((re->overall_options & PCRE2_UTF) == `0` \|\| / Not UTF /
10528	firstcuflags >= REQ_NONE \|\| / First not set /
10529	(firstcu & `0x80`) == `0` \|\| / First is ASCII /
10530	(reqcu & `0x80`) == `0`) / Req is ASCII /
10531	#endif
10532	{
10533	minminlength++;
10534	}
10535
10536	/* In the case of an anchored pattern, set up the value only if it follows
10537	a variable length item in the pattern. */
10538
10539	if ((re->overall_options & PCRE2_ANCHORED) == `0` \|\|
10540	(reqcuflags & REQ_VARY) != `0`)
10541	{
10542	re->last_codeunit = reqcu;
10543	re->flags \|= PCRE2_LASTSET;
10544
10545	/* Handle caseless required code units as for first code units (above). */
10546
10547	if ((reqcuflags & REQ_CASELESS) != `0`)
10548	{
10549	if (reqcu < `128` \|\| (!utf && !ucp && reqcu < `255`))
10550	{
10551	if (cb.fcc[reqcu] != reqcu) re->flags \|= PCRE2_LASTCASELESS;
10552	}
10553	#ifdef SUPPORT_UNICODE
10554	#if PCRE2_CODE_UNIT_WIDTH == 8
10555	else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
10556	re->flags \|= PCRE2_LASTCASELESS;
10557	#else
10558	else if ((utf \|\| ucp) && reqcu <= MAX_UTF_CODE_POINT &&
10559	UCD_OTHERCASE(reqcu) != reqcu)
10560	re->flags \|= PCRE2_LASTCASELESS;
10561	#endif
10562	#endif /* SUPPORT_UNICODE */
10563	}
10564	}
10565	}
10566
10567	/* Study the compiled pattern to set up information such as a bitmap of
10568	starting code units and a minimum matching length. */
10569
10570	if (PRIV(study)(re) != `0`)
10571	{
10572	errorcode = ERR31;
10573	goto HAD_CB_ERROR;
10574	}
10575
10576	/* If study() set a bitmap of starting code units, it implies a minimum
10577	length of at least one. */
10578
10579	if ((re->flags & PCRE2_FIRSTMAPSET) != `0` && minminlength == `0`)
10580	minminlength = `1`;
10581
10582	/* If the minimum length set (or not set) by study() is less than the minimum
10583	implied by required code units, override it. */
10584
10585	if (re->minlength < minminlength) re->minlength = minminlength;
10586	} / End of start-of-match optimizations. /
10587
10588	/* Control ends up here in all cases. When running under valgrind, make a
10589	pattern's terminating zero defined again. If memory was obtained for the parsed
10590	version of the pattern, free it before returning. Also free the list of named
10591	groups if a larger one had to be obtained, and likewise the group information
10592	vector. */
10593
10594	EXIT:
10595	#ifdef SUPPORT_VALGRIND
10596	if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(`1`));
10597	#endif
10598	if (cb.parsed_pattern != stack_parsed_pattern)
10599	ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
10600	if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
10601	ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
10602	if (cb.groupinfo != stack_groupinfo)
10603	ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
10604	return re; / Will be NULL after an error /
10605
10606	/* Errors discovered in parse_regex() set the offset value in the compile
10607	block. Errors discovered before it is called must compute it from the ptr
10608	value. After parse_regex() is called, the offset in the compile block is set to
10609	the end of the pattern, but certain errors in compile_regex() may reset it if
10610	an offset is available in the parsed pattern. */
10611
10612	HAD_CB_ERROR:
10613	ptr = pattern + cb.erroroffset;
10614
10615	HAD_EARLY_ERROR:
10616	*erroroffset = ptr - pattern;
10617
10618	HAD_ERROR:
10619	*errorptr = errorcode;
10620	pcre2_code_free(re);
10621	re = NULL;
10622	goto EXIT;
10623	}
10624
10625	/* These #undefs are here to enable unity builds with CMake. */
10626
10627	#undef NLBLOCK /* Block containing newline information */
10628	#undef PSSTART /* Field containing processed string start */
10629	#undef PSEND /* Field containing processed string end */
10630
10631	/* End of pcre2_compile.c */
10632

Browse the source code of Godot/thirdparty/pcre2/src/pcre2_compile.c