1/*************************************************
2* Perl-Compatible Regular Expressions *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2018 University of Cambridge
10
11-----------------------------------------------------------------------------
12Redistribution and use in source and binary forms, with or without
13modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36POSSIBILITY OF SUCH DAMAGE.
37-----------------------------------------------------------------------------
38*/
39
/* Warning suppressions for the Microsoft compiler. The "#pragma warning"
syntax and the warning numbers are specific to that compiler, so the pragmas
are applied only when _MSC_VER is defined; other compilers would report them
as unknown pragmas or give the numbers a different meaning. */

#ifdef _MSC_VER
#pragma warning( disable : 4018) /* '<' : signed/unsigned mismatch */
#pragma warning( disable : 4127) /* conditional expression is constant */
#pragma warning( disable : 4244) /* conversion from 'int' to 'unsigned short', possible loss of data */
#pragma warning( disable : 4701) /* local variable 'othercase' may be used without having been initialized */
#pragma warning( disable : 4702) /* unreachable code */
#endif
45
46/* This module contains the external function pcre_compile(), along with
47supporting internal functions that are not used by other modules. */
48
49#include "pcre_config.h"
50
51#define NLBLOCK cd /* Block containing newline information */
52#define PSSTART start_pattern /* Field containing pattern start */
53#define PSEND end_pattern /* Field containing pattern end */
54
55#include "pcre_internal.h"
56
57
58/* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
59is also used by pcretest. PCRE_DEBUG is not defined when building a production
60library. We do not need to select pcre16_printint.c specially, because the
61COMPILE_PCREx macro will already be appropriately set. */
62
63#ifdef PCRE_DEBUG
64/* pcre_printint.c should not include any headers */
65#define PCRE_INCLUDED
66#include "pcre_printint.c"
67#undef PCRE_INCLUDED
68#endif
69
70
/* Macro for setting individual bits in class bitmaps. Bit number (b) is set in
the byte array (a), which holds eight bits per byte with the lowest-numbered
bit in the least significant position. The array argument and the whole
expansion are parenthesized so that the macro is a single expression and works
when (a) is itself an expression such as "map + offset". Note that (b) is
evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)&7)))
74
75/* Maximum length value to check against when making sure that the integer that
76holds the compiled pattern length does not overflow. We make it a bit less than
77INT_MAX to allow for adding in group terminating bytes, so that we don't have
78to check them every time. */
79
80#define OFLOW_MAX (INT_MAX - 20) /* 20 below INT_MAX: headroom for group terminating bytes */
81
82/* Definitions to allow mutual recursion */
83
/* Prototype only, written without parameter names. Because the function is
static, its definition (with named parameters) has to appear elsewhere in this
translation unit. */
84static int
85 add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
86 const pcre_uint32 *, unsigned int);
87
/* Prototype only, written without parameter names; as above, the definition
has to appear elsewhere in this translation unit. */
88static BOOL
89 compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
90 pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
91 compile_data *, int *);
92
93
94
95/*************************************************
96* Code parameters and static tables *
97*************************************************/
98
99/* This value specifies the size of stack workspace that is used during the
100first pre-compile phase that determines how much memory is required. The regex
101is partly compiled into this space, but the compiled parts are discarded as
102soon as they can be, so that hopefully there will never be an overrun. The code
103does, however, check for an overrun. The largest amount I've seen used is 218,
104so this number is very generous.
105
106The same workspace is used during the second, actual compile phase for
107remembering forward references to groups so that they can be filled in at the
108end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
109is 4 there is plenty of room for most patterns. However, the memory can get
110filled up by repetitions of forward references, for example patterns like
111/(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
112that the workspace is expanded using malloc() in this situation. The value
113below is therefore a minimum, and we put a maximum on it for safety. The
114minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
115kicks in at the same number of forward references in all cases. */
116
117#define COMPILE_WORK_SIZE (2048*LINK_SIZE) /* Minimum workspace size; scales with LINK_SIZE */
118#define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE) /* Safety cap: 100 times the minimum */
119
120/* This value determines the size of the initial vector that is used for
121remembering named groups during the pre-compile. It is allocated on the stack,
122but if it is too small, it is expanded using malloc(), in a similar way to the
123workspace. The value is the number of slots in the list. */
124
125#define NAMED_GROUP_LIST_SIZE 20 /* Counted in list slots, not bytes */
126
127/* The overrun tests check for a slightly smaller size so that they detect the
128overrun before it actually does run off the end of the data block. */
129
130#define WORK_SIZE_SAFETY_MARGIN (100) /* Slack that lets the overrun tests trigger before the end of the block */
131
/* Private flags added to firstchar and reqchar. */

#define REQ_CASELESS (1 << 0) /* Indicates caselessness */
#define REQ_VARY (1 << 1) /* Reqchar followed non-literal item */
/* Negative values for the firstchar and reqchar flags */
#define REQ_UNSET (-2)
#define REQ_NONE (-1)

/* Repeated character flags. The constant is of type long; the suffix is
written as an upper case L because a lower case one is easily misread as the
digit 1. */

#define UTF_LENGTH 0x10000000L /* The char contains its length. */
143
144/* Table for handling escaped characters in the range '0'-'z'. Positive returns
145are simple data values; negative values are for special things like \d and so
146on. Zero means further processing is needed (for things like \x), or the escape
147is invalid. */
148
149#ifndef EBCDIC
150
151/* This is the "normal" table for ASCII systems or for EBCDIC systems running
152in UTF-8 mode. */
153
/* One entry per character from '0' to 'z', two entries per source line. The
comment at the end of each line names the characters that the line covers. */
154static const short int escapes[] = {
155 0, 0, /* '0' '1' */
156 0, 0, /* '2' '3' */
157 0, 0, /* '4' '5' */
158 0, 0, /* '6' '7' */
159 0, 0, /* '8' '9' */
160 CHAR_COLON, CHAR_SEMICOLON, /* ':' ';' */
161 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, /* '<' '=' */
162 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, /* '>' '?' */
163 CHAR_COMMERCIAL_AT, -ESC_A, /* '@' 'A' */
164 -ESC_B, -ESC_C, /* 'B' 'C' */
165 -ESC_D, -ESC_E, /* 'D' 'E' */
166 0, -ESC_G, /* 'F' 'G' */
167 -ESC_H, 0, /* 'H' 'I' */
168 0, -ESC_K, /* 'J' 'K' */
169 0, 0, /* 'L' 'M' */
170 -ESC_N, 0, /* 'N' 'O' */
171 -ESC_P, -ESC_Q, /* 'P' 'Q' */
172 -ESC_R, -ESC_S, /* 'R' 'S' */
173 0, 0, /* 'T' 'U' */
174 -ESC_V, -ESC_W, /* 'V' 'W' */
175 -ESC_X, 0, /* 'X' 'Y' */
176 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, /* 'Z' '[' */
177 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, /* backslash ']' */
178 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, /* '^' '_' */
179 CHAR_GRAVE_ACCENT, ESC_a, /* '`' 'a' */
180 -ESC_b, 0, /* 'b' 'c' */
181 -ESC_d, ESC_e, /* 'd' 'e' */
182 ESC_f, 0, /* 'f' 'g' */
183 -ESC_h, 0, /* 'h' 'i' */
184 0, -ESC_k, /* 'j' 'k' */
185 0, 0, /* 'l' 'm' */
186 ESC_n, 0, /* 'n' 'o' */
187 -ESC_p, 0, /* 'p' 'q' */
188 ESC_r, -ESC_s, /* 'r' 's' */
189 ESC_tee, 0, /* 't' 'u' */
190 -ESC_v, -ESC_w, /* 'v' 'w' */
191 0, 0, /* 'x' 'y' */
192 -ESC_z /* 'z' */
193};
194
195#else
196
197/* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
198
/* Eight entries per row. The comment at the start of each row is the
hexadecimal code of the row's first entry, so the table covers codes 0x48 to
0xFF. */
199static const short int escapes[] = {
200/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
201/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
202/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
203/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
204/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
205/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
206/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
207/* 80 */ 0, ESC_a, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
208/* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
209/* 90 */ 0, 0, -ESC_k, 0, 0, ESC_n, 0, -ESC_p,
210/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
211/* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
212/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
213/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
214/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
215/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
216/* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
217/* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
218/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
219/* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
220/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
221/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
222/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
223};
224
/* We also need a table of characters that may follow \c in an EBCDIC
environment for characters 0-31. The character at index i is the one that
yields value i, so the string holds 32 characters plus its terminating zero.
The table is only ever read, so it is declared const. */

static const unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
229
230#endif
231
232
233/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
234searched linearly. Put all the names into a single string, in order to reduce
235the number of relocations when a shared library is dynamically linked. The
236string is built from string macros so that it works in UTF-8 mode on EBCDIC
237platforms. */
238
/* One row of the verb table: the length of a verb's name together with the
opcodes to use for it, depending on whether an argument is present. */

struct verbitem {
  int len;      /* Number of characters in the verb name */
  int op;       /* Opcode when there is no argument; -1 if one is mandatory */
  int op_arg;   /* Opcode when there is an argument; -1 if none is allowed */
};

typedef struct verbitem verbitem;
244
245static const char verbnames[] =
246 "\0" /* Empty name is a shorthand for MARK */
247 STRING_MARK0
248 STRING_ACCEPT0
249 STRING_COMMIT0
250 STRING_F0
251 STRING_FAIL0
252 STRING_PRUNE0
253 STRING_SKIP0
254 STRING_THEN;
255
256static const verbitem verbs[] = {
257 { 0, -1, OP_MARK },
258 { 4, -1, OP_MARK },
259 { 6, OP_ACCEPT, -1 },
260 { 6, OP_COMMIT, -1 },
261 { 1, OP_FAIL, -1 },
262 { 4, OP_FAIL, -1 },
263 { 5, OP_PRUNE, OP_PRUNE_ARG },
264 { 4, OP_SKIP, OP_SKIP_ARG },
265 { 4, OP_THEN, OP_THEN_ARG }
266};
267
268static const int verbcount = sizeof(verbs)/sizeof(verbitem);
269
270
271/* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
272another regex library. */
273
274static const pcre_uchar sub_start_of_word[] = {
275 CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
276 CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
277
278static const pcre_uchar sub_end_of_word[] = {
279 CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
280 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
281 CHAR_RIGHT_PARENTHESIS, '\0' };
282
283
284/* Tables of names of POSIX character classes and their lengths. The names are
285now all in a single string, to reduce the number of relocations when a shared
286library is dynamically loaded. The list of lengths is terminated by a zero
287length entry. The first three must be alpha, lower, upper, as this is assumed
288for handling case independence. The indices for graph, print, and punct are
289needed, so identify them. */
290
291static const char posix_names[] =
292 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
293 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
294 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
295 STRING_word0 STRING_xdigit;
296
297static const pcre_uint8 posix_name_lengths[] = {
298 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
299
300#define PC_GRAPH 8
301#define PC_PRINT 9
302#define PC_PUNCT 10
303
304
305/* Table of class bit maps for each POSIX class. Each class is formed from a
306base map, with an optional addition or removal of another map. Then, for some
307classes, there is some additional tweaking: for [:blank:] the vertical space
308characters are removed, and for [:alpha:] and [:alnum:] the underscore
309character is removed. The triples in the table consist of the base map offset,
310second map offset or -1 if no second map, and a non-negative value for map
311addition or a negative value for map subtraction (if there are two maps). The
312absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
313remove vertical space characters, 2 => remove underscore. */
314
/* Three values per class: base map, second map (or -1), and the combined
add/subtract-and-tweak code. Rows are in the same order as the names in
posix_names. */
315static const int posix_class_maps[] = {
316 cbit_word, cbit_digit, -2, /* alpha - word minus digit, underscore removed */
317 cbit_lower, -1, 0, /* lower */
318 cbit_upper, -1, 0, /* upper */
319 cbit_word, -1, 2, /* alnum - word without underscore */
320 cbit_print, cbit_cntrl, 0, /* ascii - print plus cntrl */
321 cbit_space, -1, 1, /* blank - a GNU extension; space without vertical space */
322 cbit_cntrl, -1, 0, /* cntrl */
323 cbit_digit, -1, 0, /* digit */
324 cbit_graph, -1, 0, /* graph */
325 cbit_print, -1, 0, /* print */
326 cbit_punct, -1, 0, /* punct */
327 cbit_space, -1, 0, /* space */
328 cbit_word, -1, 0, /* word - a Perl extension */
329 cbit_xdigit,-1, 0 /* xdigit */
330};
331
332/* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
333Unicode property escapes. */
334
335#ifdef SUPPORT_UCP
/* Zero-terminated pattern fragments; the comment after each definition shows
the text that the characters spell. */
336static const pcre_uchar string_PNd[] = {
337 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
338 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \P{Nd} */
339static const pcre_uchar string_pNd[] = {
340 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
341 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \p{Nd} */
342static const pcre_uchar string_PXsp[] = {
343 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
344 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \P{Xsp} */
345static const pcre_uchar string_pXsp[] = {
346 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
347 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \p{Xsp} */
348static const pcre_uchar string_PXwd[] = {
349 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
350 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \P{Xwd} */
351static const pcre_uchar string_pXwd[] = {
352 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
353 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \p{Xwd} */
354
355static const pcre_uchar *substitutes[] = {
356 string_PNd, /* \D */
357 string_pNd, /* \d */
358 string_PXsp, /* \S */ /* Xsp is Perl space, but from 8.34, Perl */
359 string_pXsp, /* \s */ /* space and POSIX space are the same. */
360 string_PXwd, /* \W */
361 string_pXwd /* \w */
362};
363
364/* The POSIX class substitutes must be in the order of the POSIX class names,
365defined above, and there are both positive and negative cases. NULL means no
366general substitute of a Unicode property escape (\p or \P). However, for some
367POSIX classes (e.g. graph, print, punct) a special property code is compiled
368directly. */
369
/* Zero-terminated pattern fragments; the comment after each definition shows
the text that the characters spell. */
370static const pcre_uchar string_pL[] = {
371 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
372 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \p{L} */
373static const pcre_uchar string_pLl[] = {
374 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
375 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \p{Ll} */
376static const pcre_uchar string_pLu[] = {
377 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
378 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \p{Lu} */
379static const pcre_uchar string_pXan[] = {
380 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
381 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \p{Xan} */
382static const pcre_uchar string_h[] = {
383 CHAR_BACKSLASH, CHAR_h, '\0' }; /* \h */
384static const pcre_uchar string_pXps[] = {
385 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
386 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \p{Xps} */
387static const pcre_uchar string_PL[] = {
388 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
389 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \P{L} */
390static const pcre_uchar string_PLl[] = {
391 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
392 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \P{Ll} */
393static const pcre_uchar string_PLu[] = {
394 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
395 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \P{Lu} */
396static const pcre_uchar string_PXan[] = {
397 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
398 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \P{Xan} */
399static const pcre_uchar string_H[] = {
400 CHAR_BACKSLASH, CHAR_H, '\0' }; /* \H */
401static const pcre_uchar string_PXps[] = {
402 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
403 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' }; /* \P{Xps} */
404
405static const pcre_uchar *posix_substitutes[] = {
406 string_pL, /* alpha */
407 string_pLl, /* lower */
408 string_pLu, /* upper */
409 string_pXan, /* alnum */
410 NULL, /* ascii */
411 string_h, /* blank */
412 NULL, /* cntrl */
413 string_pNd, /* digit */
414 NULL, /* graph */
415 NULL, /* print */
416 NULL, /* punct */
417 string_pXps, /* space */ /* Xps is POSIX space, but from 8.34 */
418 string_pXwd, /* word */ /* Perl and POSIX space are the same */
419 NULL, /* xdigit */
420 /* Negated cases */
421 string_PL, /* ^alpha */
422 string_PLl, /* ^lower */
423 string_PLu, /* ^upper */
424 string_PXan, /* ^alnum */
425 NULL, /* ^ascii */
426 string_H, /* ^blank */
427 NULL, /* ^cntrl */
428 string_PNd, /* ^digit */
429 NULL, /* ^graph */
430 NULL, /* ^print */
431 NULL, /* ^punct */
432 string_PXps, /* ^space */ /* Xps is POSIX space, but from 8.34 */
433 string_PXwd, /* ^word */ /* Perl and POSIX space are the same */
434 NULL /* ^xdigit */
435};
436#define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
437#endif
438
/* Two-level stringizing. STRING() turns its argument into a string literal
exactly as written; XSTRING() lets its argument be macro-expanded first and
then stringizes the result. */

#define STRING(text) #text
#define XSTRING(macro) STRING(macro)
441
442/* The texts of compile-time error messages. These are "char *" because they
443are passed to the outside world. Do not ever re-use any error number, because
444they are documented. Always add a new error instead. Messages marked DEAD below
445are no longer used. This used to be a table of strings, but in order to reduce
446the number of relocations needed when a shared library is loaded dynamically,
447it is now one long string. We cannot use a table of offsets, because the
448lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
449simply count through to the one we want - this isn't a performance issue
450because these strings are used only when there is a compilation error.
451
452Each substring ends with \0 to insert a null character. This includes the final
453substring, so that the whole string ends with \0\0, which can be detected when
454counting through. */
455
/* Entry N, counting from zero, is the text for error number N. The numbered
marker comments flag every fifth entry. */
456static const char error_texts[] =
457 "no error\0"
458 "\\ at end of pattern\0"
459 "\\c at end of pattern\0"
460 "unrecognized character follows \\\0"
461 "numbers out of order in {} quantifier\0"
462 /* 5 */
463 "number too big in {} quantifier\0"
464 "missing terminating ] for character class\0"
465 "invalid escape sequence in character class\0"
466 "range out of order in character class\0"
467 "nothing to repeat\0"
468 /* 10 */
469 "internal error: invalid forward reference offset\0"
470 "internal error: unexpected repeat\0"
471 "unrecognized character after (? or (?-\0"
472 "POSIX named classes are supported only within a class\0"
473 "missing )\0"
474 /* 15 */
475 "reference to non-existent subpattern\0"
476 "erroffset passed as NULL\0"
477 "unknown option bit(s) set\0"
478 "missing ) after comment\0"
479 "parentheses nested too deeply\0" /** DEAD **/
480 /* 20 */
481 "regular expression is too large\0"
482 "failed to get memory\0"
483 "unmatched parentheses\0"
484 "internal error: code overflow\0"
485 "unrecognized character after (?<\0"
486 /* 25 */
487 "lookbehind assertion is not fixed length\0"
488 "malformed number or name after (?(\0"
489 "conditional group contains more than two branches\0"
490 "assertion expected after (?( or (?(?C)\0"
491 "(?R or (?[+-]digits must be followed by )\0"
492 /* 30 */
493 "unknown POSIX class name\0"
494 "POSIX collating elements are not supported\0"
495 "this version of PCRE is compiled without UTF support\0"
496 "spare error\0" /** DEAD **/
497 "character value in \\x{} or \\o{} is too large\0"
498 /* 35 */
499 "invalid condition (?(0)\0"
500 "\\C not allowed in lookbehind assertion\0"
501 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
502 "number after (?C is > 255\0"
503 "closing ) for (?C expected\0"
504 /* 40 */
505 "recursive call could loop indefinitely\0"
506 "unrecognized character after (?P\0"
507 "syntax error in subpattern name (missing terminator)\0"
508 "two named subpatterns have the same name\0"
509 "invalid UTF-8 string\0"
510 /* 45 */
511 "support for \\P, \\p, and \\X has not been compiled\0"
512 "malformed \\P or \\p sequence\0"
513 "unknown property name after \\P or \\p\0"
514 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
515 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
516 /* 50 */
517 "repeated subpattern is too long\0" /** DEAD **/
518 "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
519 "internal error: overran compiling workspace\0"
520 "internal error: previously-checked referenced subpattern not found\0"
521 "DEFINE group contains more than one branch\0"
522 /* 55 */
523 "repeating a DEFINE group is not allowed\0" /** DEAD **/
524 "inconsistent NEWLINE options\0"
525 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
526 "a numbered reference must not be zero\0"
527 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
528 /* 60 */
529 "(*VERB) not recognized or malformed\0"
530 "number is too big\0"
531 "subpattern name expected\0"
532 "digit expected after (?+\0"
533 "] is an invalid data character in JavaScript compatibility mode\0"
534 /* 65 */
535 "different names for subpatterns of the same number are not allowed\0"
536 "(*MARK) must have an argument\0"
537 "this version of PCRE is not compiled with Unicode property support\0"
/* Error 68: exactly one of the two texts below is compiled, so the numbering
of the following entries is the same in both builds. */
538#ifndef EBCDIC
539 "\\c must be followed by an ASCII character\0"
540#else
541 "\\c must be followed by a letter or one of [\\]^_?\0"
542#endif
543 "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
544 /* 70 */
545 "internal error: unknown opcode in find_fixedlength()\0"
546 "\\N is not supported in a class\0"
547 "too many forward references\0"
548 "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
549 "invalid UTF-16 string\0"
550 /* 75 */
551 "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
552 "character value in \\u.... sequence is too large\0"
553 "invalid UTF-32 string\0"
554 "setting UTF is disabled by the application\0"
555 "non-hex character in \\x{} (closing brace missing?)\0"
556 /* 80 */
557 "non-octal character in \\o{} (closing brace missing?)\0"
558 "missing opening brace after \\o\0"
559 "parentheses are too deeply nested\0"
560 "invalid range in character class\0"
561 "group name must start with a non-digit\0"
562 /* 85 */
563 "parentheses are too deeply nested (stack check)\0"
564 "digits missing in \\x{} or \\o{}\0"
565 "regular expression is too complicated\0"
566 ;
567
568/* Table to identify digits and hex digits. This is used when compiling
569patterns. Note that the tables in chartables are dependent on the locale, and
570may mark arbitrary characters as digits - but the PCRE compiling code expects
571to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
572a private table here. It costs 256 bytes, but it is a lot faster than doing
573character value tests (at least in some simple cases I timed), and in some
574applications one wants PCRE to compile efficiently as well as match
575efficiently.
576
577For convenience, we use the same bit definitions as in chartables:
578
579 0x04 decimal digit
580 0x08 hexadecimal digit
581
582Then we can use ctype_digit and ctype_xdigit in the code. */
583
/* Test whether x is one of the ten decimal digit characters. A pair of range
comparisons is used rather than a table lookup: it needs no memory read, the
code is simpler, and the compiler can turn it into a subtraction and an
unsigned comparison. Note that x appears twice in the expansion. */

#define IS_DIGIT(x) (CHAR_0 <= (x) && (x) <= CHAR_9)
589
590#ifndef EBCDIC
591
592/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
593UTF-8 mode. */
594
/* 256 entries indexed by character code, eight per row. 0x0c (0x04 | 0x08)
marks '0'-'9' as both decimal and hexadecimal digits; 0x08 marks 'A'-'F' and
'a'-'f' as hexadecimal digits only. Every other entry is zero. */
595static const pcre_uint8 digitab[] =
596 {
597 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
598 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
603 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
604 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
605 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
609 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
615 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
616 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
617 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
618 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
619 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
620 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
621 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
622 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
623 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
624 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
625 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
626 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
627 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
628 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
629
630#else
631
632/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
633
/* 256 entries indexed by EBCDIC character code, eight per row; the last item
in the comment on every second row is the hexadecimal code of that row's
first entry. 0x08 marks the hexadecimal letters at 0x81-0x86 and 0xC1-0xC6,
and 0x0c (0x04 | 0x08) marks the decimal digits at 0xF0-0xF9. */
634static const pcre_uint8 digitab[] =
635 {
636 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
637 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
638 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
639 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
640 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
641 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
643 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
644 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
645 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
646 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
647 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
648 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
649 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
650 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
651 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
652 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
653 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
654 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
655 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
656 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
657 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
658 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
659 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
660 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
661 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
662 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
663 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
664 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
665 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
666 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
667 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
668
/* 256 entries indexed by EBCDIC character code, eight per row; the comment on
each row gives the range of codes it covers.
NOTE(review): the bit values presumably follow the chartables character-type
layout, in which 0x04 is "decimal digit" and 0x08 is "hexadecimal digit"; the
meaning of the other bits (0x01, 0x02, 0x10, 0x80) is not stated in this part
of the file - confirm against the chartables definitions. */
669static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
670 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
671 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
672 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
673 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
674 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
675 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
676 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
677 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
678 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
679 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
680 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
681 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
682 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
683 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
684 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
685 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
686 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
687 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
688 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
689 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
690 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
691 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
692 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
693 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
694 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
695 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
696 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
697 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
698 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
699 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
700 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
701 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
702#endif
703
704
705/* This table is used to check whether auto-possessification is possible
706between adjacent character-type opcodes. The left-hand (repeated) opcode is
707used to select the row, and the right-hand opcode is used to select the column.
708A value of 1 means that auto-possessification is OK. For example, the second
709value in the first row means that \D+\d can be turned into \D++\d.
710
711The Unicode property types (\P and \p) have to be present to fill out the table
712because of what their opcode values are, but the table values should always be
713zero because property types are handled separately in the code. The last four
714columns apply to items that cannot be repeated, so there is no need to have
715rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
716*not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
717
718#define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1) /* Rows: one per left-hand (repeated) opcode */
719#define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1) /* Columns: one per right-hand opcode */
720
/* The initializer supplies 17 rows (\D to \X) of 21 values (\D to $M); the
trailing comment on each row names its left-hand item and the header comment
names the right-hand items. Presumably both indexes are opcode values with
FIRST_AUTOTAB_OP subtracted - the lookup code is not in this part of the file. */
721static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
722/* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */
723 { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */
724 { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */
725 { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */
726 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */
727 { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */
728 { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */
729 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */
730 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */
731 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */
732 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */
733 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */
734 { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */
735 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */
736 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */
737 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */
738 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */
739 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */
740};
741
742
743/* This table is used to check whether auto-possessification is possible
744between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
745left-hand (repeated) opcode is used to select the row, and the right-hand
746opcode is used to select the column. The values are as follows:
747
748 0 Always return FALSE (never auto-possessify)
749 1 Character groups are distinct (possessify if both are OP_PROP)
750 2 Check character categories in the same group (general or particular)
751 3 TRUE if the two opcodes are not the same (PROP vs NOTPROP)
752
753 4 Check left general category vs right particular category
754 5 Check right general category vs left particular category
755
756 6 Left alphanum vs right general category
757 7 Left space vs right general category
758 8 Left word vs right general category
759
760 9 Right alphanum vs left general category
761 10 Right space vs left general category
762 11 Right word vs left general category
763
764 12 Left alphanum vs right particular category
765 13 Left space vs right particular category
766 14 Left word vs right particular category
767
768 15 Right alphanum vs left particular category
769 16 Right space vs left particular category
770 17 Right word vs left particular category
771*/
772
773static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
774/* ANY LAMP GC PC SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
775 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */
776 { 0, 3, 0, 0, 0, 3, 1, 1, 0, 0, 0 }, /* PT_LAMP */
777 { 0, 0, 2, 4, 0, 9, 10, 10, 11, 0, 0 }, /* PT_GC */
778 { 0, 0, 5, 2, 0, 15, 16, 16, 17, 0, 0 }, /* PT_PC */
779 { 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, /* PT_SC */
780 { 0, 3, 6, 12, 0, 3, 1, 1, 0, 0, 0 }, /* PT_ALNUM */
781 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_SPACE */
782 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_PXSPACE */
783 { 0, 0, 8, 14, 0, 0, 1, 1, 3, 0, 0 }, /* PT_WORD */
784 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */
785 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 } /* PT_UCNC */
786};
787
788/* This table is used to check whether auto-possessification is possible
789between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
790specifies a general category and the other specifies a particular category. The
791row is selected by the general category and the column by the particular
792category. The value is 1 if the particular category is not part of the general
793category. */
794
795static const pcre_uint8 catposstab[7][30] = {
796/* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
797 { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */
798 { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */
799 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */
800 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */
801 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */
802 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */
803 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */
804};
805
806/* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
807a general or particular category. The properties in each row are those
808that apply to the character set in question. Duplication means that a little
809unnecessary work is done when checking, but this keeps things much simpler
810because they can all use the same code. For more details see the comment where
811this table is used.
812
813Note: SPACE and PXSPACE used to be different because Perl excluded VT from
814"space", but from Perl 5.18 it's included, so both categories are treated the
815same here. */
816
817static const pcre_uint8 posspropstab[3][4] = {
818 { ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */
819 { ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */
820 { ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */
821};
822
823/* This table is used when converting repeating opcodes into possessified
824versions as a result of an explicit possessive quantifier such as ++. A zero
825value means there is no possessified version - in those cases the item in
826question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
827because all relevant opcodes are less than that. */
828
829static const pcre_uint8 opcode_possessify[] = {
830 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */
831 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */
832
833 0, /* NOTI */
834 OP_POSSTAR, 0, /* STAR, MINSTAR */
835 OP_POSPLUS, 0, /* PLUS, MINPLUS */
836 OP_POSQUERY, 0, /* QUERY, MINQUERY */
837 OP_POSUPTO, 0, /* UPTO, MINUPTO */
838 0, /* EXACT */
839 0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */
840
841 OP_POSSTARI, 0, /* STARI, MINSTARI */
842 OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */
843 OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */
844 OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */
845 0, /* EXACTI */
846 0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */
847
848 OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */
849 OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */
850 OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */
851 OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */
852 0, /* NOTEXACT */
853 0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
854
855 OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */
856 OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */
857 OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */
858 OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */
859 0, /* NOTEXACTI */
860 0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
861
862 OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */
863 OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */
864 OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */
865 OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */
866 0, /* TYPEEXACT */
867 0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
868
869 OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */
870 OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */
871 OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */
872 OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */
873 0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */
874
875 0, 0, 0, /* CLASS, NCLASS, XCLASS */
876 0, 0, /* REF, REFI */
877 0, 0, /* DNREF, DNREFI */
878 0, 0 /* RECURSE, CALLOUT */
879};
880
881
882
883/*************************************************
884* Find an error text *
885*************************************************/
886
887/* The error texts are now all in one long string, to save on relocations. As
888some of the text is of unknown length, we can't use a table of offsets.
889Instead, just count through the strings. This is not a performance issue
890because it happens only when there has been a compilation error.
891
892Argument: the error number
893Returns: pointer to the error string
894*/
895
896static const char *
897find_error_text(int n)
898{
899const char *s = error_texts;
900for (; n > 0; n--)
901 {
902 while (*s++ != CHAR_NULL) {};
903 if (*s == CHAR_NULL) return "Error text not found (please report)";
904 }
905return s;
906}
907
908
909
910/*************************************************
911* Expand the workspace *
912*************************************************/
913
914/* This function is called during the second compiling phase, if the number of
915forward references fills the existing workspace, which is originally a block on
916the stack. A larger block is obtained from malloc() unless the ultimate limit
917has been reached or the increase will be rather small.
918
919Argument: pointer to the compile data block
920Returns: 0 if all went well, else an error number
921*/
922
923static int
924expand_workspace(compile_data *cd)
925{
926pcre_uchar *newspace;
927int newsize = cd->workspace_size * 2;
928
929if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
930if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
931 newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
932 return ERR72;
933
934newspace = (PUBL(malloc))(IN_UCHARS(newsize));
935if (newspace == NULL) return ERR21;
936memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
937cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
938if (cd->workspace_size > COMPILE_WORK_SIZE)
939 (PUBL(free))((void *)cd->start_workspace);
940cd->start_workspace = newspace;
941cd->workspace_size = newsize;
942return 0;
943}
944
945
946
947/*************************************************
948* Check for counted repeat *
949*************************************************/
950
951/* This function is called when a '{' is encountered in a place where it might
952start a quantifier. It looks ahead to see if it really is a quantifier or not.
953It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
954where the ddds are digits.
955
956Arguments:
957 p pointer to the first char after '{'
958
959Returns: TRUE or FALSE
960*/
961
962static BOOL
963is_counted_repeat(const pcre_uchar *p)
964{
965if (!IS_DIGIT(*p)) return FALSE;
966p++;
967while (IS_DIGIT(*p)) p++;
968if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
969
970if (*p++ != CHAR_COMMA) return FALSE;
971if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
972
973if (!IS_DIGIT(*p)) return FALSE;
974p++;
975while (IS_DIGIT(*p)) p++;
976
977return (*p == CHAR_RIGHT_CURLY_BRACKET);
978}
979
980
981
982/*************************************************
983* Handle escapes *
984*************************************************/
985
986/* This function is called when a \ has been encountered. It either returns a
987positive value for a simple escape such as \n, or 0 for a data character which
988will be placed in chptr. A backreference to group n is returned as negative n.
989When UTF-8 is enabled, a positive value greater than 255 may be returned in
990chptr. On entry, ptr is pointing at the \. On exit, it is on the final
991character of the escape sequence.
992
993Arguments:
994 ptrptr points to the pattern position pointer
995 chptr points to a returned data character
996 errorcodeptr points to the errorcode variable
997 bracount number of previous extracting brackets
998 options the options bits
999 isclass TRUE if inside a character class
1000
1001Returns: zero => a data character
1002 positive => a special escape sequence
1003 negative => a back reference
1004 on error, errorcodeptr is set
1005*/
1006
1007static int
1008check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
1009 int bracount, int options, BOOL isclass)
1010{
1011/* PCRE_UTF16 has the same value as PCRE_UTF8. */
1012BOOL utf = (options & PCRE_UTF8) != 0;
1013const pcre_uchar *ptr = *ptrptr + 1;
1014pcre_uint32 c;
1015int escape = 0;
1016int i;
1017
1018GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
1019ptr--; /* Set pointer back to the last byte */
1020
1021/* If backslash is at the end of the pattern, it's an error. */
1022
1023if (c == CHAR_NULL) *errorcodeptr = ERR1;
1024
1025/* Non-alphanumerics are literals. For digits or letters, do an initial lookup
1026in a table. A non-zero result is something that can be returned immediately.
1027Otherwise further processing may be required. */
1028
1029#ifndef EBCDIC /* ASCII/UTF-8 coding */
1030/* Not alphanumeric */
1031else if (c < CHAR_0 || c > CHAR_z) {}
1032else if ((i = escapes[c - CHAR_0]) != 0)
1033 { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1034
1035#else /* EBCDIC coding */
1036/* Not alphanumeric */
1037else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
1038else if ((i = escapes[c - 0x48]) != 0) { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1039#endif
1040
1041/* Escapes that need further processing, or are illegal. */
1042
1043else
1044 {
1045 const pcre_uchar *oldptr;
1046 BOOL braced, negated, overflow;
1047 int s;
1048
1049 switch (c)
1050 {
1051 /* A number of Perl escapes are not handled by PCRE. We give an explicit
1052 error. */
1053
1054 case CHAR_l:
1055 case CHAR_L:
1056 *errorcodeptr = ERR37;
1057 break;
1058
1059 case CHAR_u:
1060 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1061 {
1062 /* In JavaScript, \u must be followed by four hexadecimal numbers.
1063 Otherwise it is a lowercase u letter. */
1064 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1065 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
1066 && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
1067 && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
1068 {
1069 c = 0;
1070 for (i = 0; i < 4; ++i)
1071 {
1072 register pcre_uint32 cc = *(++ptr);
1073#ifndef EBCDIC /* ASCII/UTF-8 coding */
1074 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1075 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1076#else /* EBCDIC coding */
1077 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1078 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1079#endif
1080 }
1081
1082#if defined COMPILE_PCRE8
1083 if (c > (utf ? 0x10ffffU : 0xffU))
1084#elif defined COMPILE_PCRE16
1085 if (c > (utf ? 0x10ffffU : 0xffffU))
1086#elif defined COMPILE_PCRE32
1087 if (utf && c > 0x10ffffU)
1088#endif
1089 {
1090 *errorcodeptr = ERR76;
1091 }
1092 else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1093 }
1094 }
1095 else
1096 *errorcodeptr = ERR37;
1097 break;
1098
1099 case CHAR_U:
1100 /* In JavaScript, \U is an uppercase U letter. */
1101 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
1102 break;
1103
1104 /* In a character class, \g is just a literal "g". Outside a character
1105 class, \g must be followed by one of a number of specific things:
1106
1107 (1) A number, either plain or braced. If positive, it is an absolute
1108 backreference. If negative, it is a relative backreference. This is a Perl
1109 5.10 feature.
1110
1111 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1112 is part of Perl's movement towards a unified syntax for back references. As
1113 this is synonymous with \k{name}, we fudge it up by pretending it really
1114 was \k.
1115
1116 (3) For Oniguruma compatibility we also support \g followed by a name or a
1117 number either in angle brackets or in single quotes. However, these are
1118 (possibly recursive) subroutine calls, _not_ backreferences. Just return
1119 the ESC_g code (cf \k). */
1120
1121 case CHAR_g:
1122 if (isclass) break;
1123 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
1124 {
1125 escape = ESC_g;
1126 break;
1127 }
1128
1129 /* Handle the Perl-compatible cases */
1130
1131 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1132 {
1133 const pcre_uchar *p;
1134 for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1135 if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1136 if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1137 {
1138 escape = ESC_k;
1139 break;
1140 }
1141 braced = TRUE;
1142 ptr++;
1143 }
1144 else braced = FALSE;
1145
1146 if (ptr[1] == CHAR_MINUS)
1147 {
1148 negated = TRUE;
1149 ptr++;
1150 }
1151 else negated = FALSE;
1152
1153 /* The integer range is limited by the machine's int representation. */
1154 s = 0;
1155 overflow = FALSE;
1156 while (IS_DIGIT(ptr[1]))
1157 {
1158 if (s > INT_MAX / 10 - 1) /* Integer overflow */
1159 {
1160 overflow = TRUE;
1161 break;
1162 }
1163 s = s * 10 + (int)(*(++ptr) - CHAR_0);
1164 }
1165 if (overflow) /* Integer overflow */
1166 {
1167 while (IS_DIGIT(ptr[1]))
1168 ptr++;
1169 *errorcodeptr = ERR61;
1170 break;
1171 }
1172
1173 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
1174 {
1175 *errorcodeptr = ERR57;
1176 break;
1177 }
1178
1179 if (s == 0)
1180 {
1181 *errorcodeptr = ERR58;
1182 break;
1183 }
1184
1185 if (negated)
1186 {
1187 if (s > bracount)
1188 {
1189 *errorcodeptr = ERR15;
1190 break;
1191 }
1192 s = bracount - (s - 1);
1193 }
1194
1195 escape = -s;
1196 break;
1197
1198 /* The handling of escape sequences consisting of a string of digits
1199 starting with one that is not zero is not straightforward. Perl has changed
1200 over the years. Nowadays \g{} for backreferences and \o{} for octal are
1201 recommended to avoid the ambiguities in the old syntax.
1202
1203 Outside a character class, the digits are read as a decimal number. If the
1204 number is less than 8 (used to be 10), or if there are that many previous
1205 extracting left brackets, then it is a back reference. Otherwise, up to
1206 three octal digits are read to form an escaped byte. Thus \123 is likely to
1207 be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1208 the octal value is greater than 377, the least significant 8 bits are
1209 taken. \8 and \9 are treated as the literal characters 8 and 9.
1210
1211 Inside a character class, \ followed by a digit is always either a literal
1212 8 or 9 or an octal number. */
1213
1214 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1215 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1216
1217 if (!isclass)
1218 {
1219 oldptr = ptr;
1220 /* The integer range is limited by the machine's int representation. */
1221 s = (int)(c -CHAR_0);
1222 overflow = FALSE;
1223 while (IS_DIGIT(ptr[1]))
1224 {
1225 if (s > INT_MAX / 10 - 1) /* Integer overflow */
1226 {
1227 overflow = TRUE;
1228 break;
1229 }
1230 s = s * 10 + (int)(*(++ptr) - CHAR_0);
1231 }
1232 if (overflow) /* Integer overflow */
1233 {
1234 while (IS_DIGIT(ptr[1]))
1235 ptr++;
1236 *errorcodeptr = ERR61;
1237 break;
1238 }
1239 if (s < 8 || s <= bracount) /* Check for back reference */
1240 {
1241 escape = -s;
1242 break;
1243 }
1244 ptr = oldptr; /* Put the pointer back and fall through */
1245 }
1246
1247 /* Handle a digit following \ when the number is not a back reference. If
1248 the first digit is 8 or 9, Perl used to generate a binary zero byte and
1249 then treat the digit as a following literal. At least by Perl 5.18 this
1250 changed so as not to insert the binary zero. */
1251
1252 if ((c = *ptr) >= CHAR_8) break;
1253
1254 /* Fall through with a digit less than 8 */
1255
1256 /* \0 always starts an octal number, but we may drop through to here with a
1257 larger first octal digit. The original code used just to take the least
1258 significant 8 bits of octal numbers (I think this is what early Perls used
1259 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1260 but no more than 3 octal digits. */
1261
1262 case CHAR_0:
1263 c -= CHAR_0;
1264 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1265 c = c * 8 + *(++ptr) - CHAR_0;
1266#ifdef COMPILE_PCRE8
1267 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1268#endif
1269 break;
1270
1271 /* \o is a relatively new Perl feature, supporting a more general way of
1272 specifying character codes in octal. The only supported form is \o{ddd}. */
1273
1274 case CHAR_o:
1275 if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1276 if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
1277 {
1278 ptr += 2;
1279 c = 0;
1280 overflow = FALSE;
1281 while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1282 {
1283 register pcre_uint32 cc = *ptr++;
1284 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1285#ifdef COMPILE_PCRE32
1286 if (c >= 0x20000000l) { overflow = TRUE; break; }
1287#endif
1288 c = (c << 3) + cc - CHAR_0 ;
1289#if defined COMPILE_PCRE8
1290 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1291#elif defined COMPILE_PCRE16
1292 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1293#elif defined COMPILE_PCRE32
1294 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1295#endif
1296 }
1297 if (overflow)
1298 {
1299 while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1300 *errorcodeptr = ERR34;
1301 }
1302 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1303 {
1304 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1305 }
1306 else *errorcodeptr = ERR80;
1307 }
1308 break;
1309
1310 /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1311 numbers. Otherwise it is a lowercase x letter. */
1312
1313 case CHAR_x:
1314 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1315 {
1316 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1317 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1318 {
1319 c = 0;
1320 for (i = 0; i < 2; ++i)
1321 {
1322 register pcre_uint32 cc = *(++ptr);
1323#ifndef EBCDIC /* ASCII/UTF-8 coding */
1324 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1325 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1326#else /* EBCDIC coding */
1327 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1328 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1329#endif
1330 }
1331 }
1332 } /* End JavaScript handling */
1333
1334 /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1335 greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1336 digits. If not, { used to be treated as a data character. However, Perl
1337 seems to read hex digits up to the first non-such, and ignore the rest, so
1338 that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1339 now gives an error. */
1340
1341 else
1342 {
1343 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1344 {
1345 ptr += 2;
1346 if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1347 {
1348 *errorcodeptr = ERR86;
1349 break;
1350 }
1351 c = 0;
1352 overflow = FALSE;
1353 while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1354 {
1355 register pcre_uint32 cc = *ptr++;
1356 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1357
1358#ifdef COMPILE_PCRE32
1359 if (c >= 0x10000000l) { overflow = TRUE; break; }
1360#endif
1361
1362#ifndef EBCDIC /* ASCII/UTF-8 coding */
1363 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1364 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1365#else /* EBCDIC coding */
1366 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1367 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1368#endif
1369
1370#if defined COMPILE_PCRE8
1371 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1372#elif defined COMPILE_PCRE16
1373 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1374#elif defined COMPILE_PCRE32
1375 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1376#endif
1377 }
1378
1379 if (overflow)
1380 {
1381 while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1382 *errorcodeptr = ERR34;
1383 }
1384
1385 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1386 {
1387 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1388 }
1389
1390 /* If the sequence of hex digits does not end with '}', give an error.
1391 We used just to recognize this construct and fall through to the normal
1392 \x handling, but nowadays Perl gives an error, which seems much more
1393 sensible, so we do too. */
1394
1395 else *errorcodeptr = ERR79;
1396 } /* End of \x{} processing */
1397
1398 /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1399
1400 else
1401 {
1402 c = 0;
1403 while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1404 {
1405 pcre_uint32 cc; /* Some compilers don't like */
1406 cc = *(++ptr); /* ++ in initializers */
1407#ifndef EBCDIC /* ASCII/UTF-8 coding */
1408 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1409 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1410#else /* EBCDIC coding */
1411 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
1412 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1413#endif
1414 }
1415 } /* End of \xdd handling */
1416 } /* End of Perl-style \x handling */
1417 break;
1418
1419 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1420 An error is given if the byte following \c is not an ASCII character. This
1421 coding is ASCII-specific, but then the whole concept of \cx is
1422 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1423
1424 case CHAR_c:
1425 c = *(++ptr);
1426 if (c == CHAR_NULL)
1427 {
1428 *errorcodeptr = ERR2;
1429 break;
1430 }
1431#ifndef EBCDIC /* ASCII/UTF-8 coding */
1432 if (c > 127) /* Excludes all non-ASCII in either mode */
1433 {
1434 *errorcodeptr = ERR68;
1435 break;
1436 }
1437 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1438 c ^= 0x40;
1439#else /* EBCDIC coding */
1440 if (c >= CHAR_a && c <= CHAR_z) c += 64;
1441 if (c == CHAR_QUESTION_MARK)
1442 c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
1443 else
1444 {
1445 for (i = 0; i < 32; i++)
1446 {
1447 if (c == ebcdic_escape_c[i]) break;
1448 }
1449 if (i < 32) c = i; else *errorcodeptr = ERR68;
1450 }
1451#endif
1452 break;
1453
1454 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1455 other alphanumeric following \ is an error if PCRE_EXTRA was set;
1456 otherwise, for Perl compatibility, it is a literal. This code looks a bit
1457 odd, but there used to be some cases other than the default, and there may
1458 be again in future, so I haven't "optimized" it. */
1459
1460 default:
1461 if ((options & PCRE_EXTRA) != 0) switch(c)
1462 {
1463 default:
1464 *errorcodeptr = ERR3;
1465 break;
1466 }
1467 break;
1468 }
1469 }
1470
1471/* Perl supports \N{name} for character names, as well as plain \N for "not
1472newline". PCRE does not support \N{name}. However, it does support
1473quantification such as \N{2,3}. */
1474
1475if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1476 !is_counted_repeat(ptr+2))
1477 *errorcodeptr = ERR37;
1478
1479/* If PCRE_UCP is set, we change the values for \d etc. */
1480
1481if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1482 escape += (ESC_DU - ESC_D);
1483
1484/* Set the pointer to the final character before returning. */
1485
1486*ptrptr = ptr;
1487*chptr = c;
1488return escape;
1489}
1490
1491
1492
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Called after \P or \p has been encountered, provided that PCRE is compiled
with support for Unicode properties. The property name is either the single
character that follows, or a name in curly brackets that may start with ^ to
request negation. The name is then looked up in the table of known property
names. On entry, ptrptr is pointing at the P or p. On exit, it is pointing at
the final character of the escape sequence, or at the point where scanning
stopped if there was an error.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  ptypeptr       points to an unsigned int that is set to the type value
  pdataptr       points to an unsigned int that is set to the detailed property value
  errorcodeptr   points to the error code variable (set to ERR46 for a
                   malformed sequence, ERR47 for an unknown property name)

Returns:         TRUE if the type value was found, or FALSE for an invalid type
*/

static BOOL
get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
  unsigned int *pdataptr, int *errorcodeptr)
{
const pcre_uchar *ptr = *ptrptr;
pcre_uchar name[32];
pcre_uchar c;
int lo, hi;
int failure = ERR46;     /* Until the name has been read, errors are syntax errors */

ptr++;
c = *ptr;
if (c == CHAR_NULL) goto FAILED;

*negptr = FALSE;

/* The simple form has just one following character. */

if (c != CHAR_LEFT_CURLY_BRACKET)
  {
  name[0] = c;
  name[1] = 0;
  }

/* Otherwise the name is in {}, optionally preceded by ^ for negation. One
slot of the buffer is kept back for the terminating zero, and a name that has
not reached its closing bracket by then is rejected. */

else
  {
  int len = 0;
  const int maxlen = (int)(sizeof(name) / sizeof(pcre_uchar)) - 1;

  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
    {
    *negptr = TRUE;
    ptr++;
    }

  while (len < maxlen)
    {
    c = *(++ptr);
    if (c == CHAR_NULL) goto FAILED;
    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
    name[len++] = c;
    }

  if (c != CHAR_RIGHT_CURLY_BRACKET) goto FAILED;
  name[len] = 0;
  }

*ptrptr = ptr;

/* Search for a recognized property name using binary chop. From here on a
failure means that the name is unknown. */

failure = ERR47;
lo = 0;
hi = PRIV(utt_size);

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[mid].name_offset);

  if (r > 0) lo = mid + 1;
  else if (r < 0) hi = mid;
  else
    {
    *ptypeptr = PRIV(utt)[mid].type;
    *pdataptr = PRIV(utt)[mid].value;
    return TRUE;
    }
  }

FAILED:
*errorcodeptr = failure;
*ptrptr = ptr;
return FALSE;
}
#endif
1587
1588
1589
1590/*************************************************
1591* Read repeat counts *
1592*************************************************/
1593
1594/* Read an item of the form {n,m} and return the values. This is called only
1595after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1596so the syntax is guaranteed to be correct, but we need to check the values.
1597
1598Arguments:
1599 p pointer to first char after '{'
1600 minp pointer to int for min
1601 maxp pointer to int for max
1602 returned as -1 if no max
1603 errorcodeptr points to error code variable
1604
1605Returns: pointer to '}' on success;
1606 current ptr on error, with errorcodeptr set non-zero
1607*/
1608
1609static const pcre_uchar *
1610read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1611{
1612int min = 0;
1613int max = -1;
1614
1615while (IS_DIGIT(*p))
1616 {
1617 min = min * 10 + (int)(*p++ - CHAR_0);
1618 if (min > 65535)
1619 {
1620 *errorcodeptr = ERR5;
1621 return p;
1622 }
1623 }
1624
1625if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1626 {
1627 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1628 {
1629 max = 0;
1630 while(IS_DIGIT(*p))
1631 {
1632 max = max * 10 + (int)(*p++ - CHAR_0);
1633 if (max > 65535)
1634 {
1635 *errorcodeptr = ERR5;
1636 return p;
1637 }
1638 }
1639 if (max < min)
1640 {
1641 *errorcodeptr = ERR4;
1642 return p;
1643 }
1644 }
1645 }
1646
1647*minp = min;
1648*maxp = max;
1649return p;
1650}
1651
1652
1653
1654/*************************************************
1655* Find first significant op code *
1656*************************************************/
1657
1658/* This is called by several functions that scan a compiled expression looking
1659for a fixed first character, or an anchoring op code etc. It skips over things
1660that do not influence this. For some calls, it makes sense to skip negative
1661forward and all backward assertions, and also the \b assertion; for others it
1662does not.
1663
1664Arguments:
1665 code pointer to the start of the group
1666 skipassert TRUE if certain assertions are to be skipped
1667
1668Returns: pointer to the first significant opcode
1669*/
1670
1671static const pcre_uchar*
1672first_significant_code(const pcre_uchar *code, BOOL skipassert)
1673{
1674for (;;)
1675 {
1676 switch ((int)*code)
1677 {
1678 case OP_ASSERT_NOT:
1679 case OP_ASSERTBACK:
1680 case OP_ASSERTBACK_NOT:
1681 if (!skipassert) return code;
1682 do code += GET(code, 1); while (*code == OP_ALT);
1683 code += PRIV(OP_lengths)[*code];
1684 break;
1685
1686 case OP_WORD_BOUNDARY:
1687 case OP_NOT_WORD_BOUNDARY:
1688 if (!skipassert) return code;
1689 /* Fall through */
1690
1691 case OP_CALLOUT:
1692 case OP_CREF:
1693 case OP_DNCREF:
1694 case OP_RREF:
1695 case OP_DNRREF:
1696 case OP_DEF:
1697 code += PRIV(OP_lengths)[*code];
1698 break;
1699
1700 default:
1701 return code;
1702 }
1703 }
1704/* Control never reaches here */
1705}
1706
1707
1708
1709/*************************************************
1710* Find the fixed length of a branch *
1711*************************************************/
1712
1713/* Scan a branch and compute the fixed length of subject that will match it,
1714if the length is fixed. This is needed for dealing with backward assertions.
1715In UTF8 mode, the result is in characters rather than bytes. The branch is
1716temporarily terminated with OP_END when this function is called.
1717
1718This function is called when a backward assertion is encountered, so that if it
1719fails, the error message can point to the correct place in the pattern.
1720However, we cannot do this when the assertion contains subroutine calls,
1721because they can be forward references. We solve this by remembering this case
1722and doing the check at the end; a flag specifies which mode we are running in.
1723
1724Arguments:
1725 code points to the start of the pattern (the bracket)
1726 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1727 atend TRUE if called when the pattern is complete
1728 cd the "compile data" structure
1729 recurses chain of recurse_check to catch mutual recursion
1730
1731Returns: the fixed length,
1732 or -1 if there is no fixed length,
1733 or -2 if \C was encountered (in UTF-8 mode only)
1734 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1735 or -4 if an unknown opcode was encountered (internal error)
1736*/
1737
1738static int
1739find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd,
1740 recurse_check *recurses)
1741{
1742int length = -1;
1743recurse_check this_recurse;
1744register int branchlength = 0;
1745register pcre_uchar *cc = code + 1 + LINK_SIZE;
1746
1747/* Scan along the opcodes for this branch. If we get to the end of the
1748branch, check the length against that of the other branches. */
1749
1750for (;;)
1751 {
1752 int d;
1753 pcre_uchar *ce, *cs;
1754 register pcre_uchar op = *cc;
1755
1756 switch (op)
1757 {
1758 /* We only need to continue for OP_CBRA (normal capturing bracket) and
1759 OP_BRA (normal non-capturing bracket) because the other variants of these
1760 opcodes are all concerned with unlimited repeated groups, which of course
1761 are not of fixed length. */
1762
1763 case OP_CBRA:
1764 case OP_BRA:
1765 case OP_ONCE:
1766 case OP_ONCE_NC:
1767 case OP_COND:
1768 d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd,
1769 recurses);
1770 if (d < 0) return d;
1771 branchlength += d;
1772 do cc += GET(cc, 1); while (*cc == OP_ALT);
1773 cc += 1 + LINK_SIZE;
1774 break;
1775
1776 /* Reached end of a branch; if it's a ket it is the end of a nested call.
1777 If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1778 an ALT. If it is END it's the end of the outer call. All can be handled by
1779 the same code. Note that we must not include the OP_KETRxxx opcodes here,
1780 because they all imply an unlimited repeat. */
1781
1782 case OP_ALT:
1783 case OP_KET:
1784 case OP_END:
1785 case OP_ACCEPT:
1786 case OP_ASSERT_ACCEPT:
1787 if (length < 0) length = branchlength;
1788 else if (length != branchlength) return -1;
1789 if (*cc != OP_ALT) return length;
1790 cc += 1 + LINK_SIZE;
1791 branchlength = 0;
1792 break;
1793
1794 /* A true recursion implies not fixed length, but a subroutine call may
1795 be OK. If the subroutine is a forward reference, we can't deal with
1796 it until the end of the pattern, so return -3. */
1797
1798 case OP_RECURSE:
1799 if (!atend) return -3;
1800 cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1801 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1802 if (cc > cs && cc < ce) return -1; /* Recursion */
1803 else /* Check for mutual recursion */
1804 {
1805 recurse_check *r = recurses;
1806 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
1807 if (r != NULL) return -1; /* Mutual recursion */
1808 }
1809 this_recurse.prev = recurses;
1810 this_recurse.group = cs;
1811 d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd, &this_recurse);
1812 if (d < 0) return d;
1813 branchlength += d;
1814 cc += 1 + LINK_SIZE;
1815 break;
1816
1817 /* Skip over assertive subpatterns */
1818
1819 case OP_ASSERT:
1820 case OP_ASSERT_NOT:
1821 case OP_ASSERTBACK:
1822 case OP_ASSERTBACK_NOT:
1823 do cc += GET(cc, 1); while (*cc == OP_ALT);
1824 cc += 1 + LINK_SIZE;
1825 break;
1826
1827 /* Skip over things that don't match chars */
1828
1829 case OP_MARK:
1830 case OP_PRUNE_ARG:
1831 case OP_SKIP_ARG:
1832 case OP_THEN_ARG:
1833 cc += cc[1] + PRIV(OP_lengths)[*cc];
1834 break;
1835
1836 case OP_CALLOUT:
1837 case OP_CIRC:
1838 case OP_CIRCM:
1839 case OP_CLOSE:
1840 case OP_COMMIT:
1841 case OP_CREF:
1842 case OP_DEF:
1843 case OP_DNCREF:
1844 case OP_DNRREF:
1845 case OP_DOLL:
1846 case OP_DOLLM:
1847 case OP_EOD:
1848 case OP_EODN:
1849 case OP_FAIL:
1850 case OP_NOT_WORD_BOUNDARY:
1851 case OP_PRUNE:
1852 case OP_REVERSE:
1853 case OP_RREF:
1854 case OP_SET_SOM:
1855 case OP_SKIP:
1856 case OP_SOD:
1857 case OP_SOM:
1858 case OP_THEN:
1859 case OP_WORD_BOUNDARY:
1860 cc += PRIV(OP_lengths)[*cc];
1861 break;
1862
1863 /* Handle literal characters */
1864
1865 case OP_CHAR:
1866 case OP_CHARI:
1867 case OP_NOT:
1868 case OP_NOTI:
1869 branchlength++;
1870 cc += 2;
1871#ifdef SUPPORT_UTF
1872 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1873#endif
1874 break;
1875
1876 /* Handle exact repetitions. The count is already in characters, but we
1877 need to skip over a multibyte character in UTF8 mode. */
1878
1879 case OP_EXACT:
1880 case OP_EXACTI:
1881 case OP_NOTEXACT:
1882 case OP_NOTEXACTI:
1883 branchlength += (int)GET2(cc,1);
1884 cc += 2 + IMM2_SIZE;
1885#ifdef SUPPORT_UTF
1886 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1887#endif
1888 break;
1889
1890 case OP_TYPEEXACT:
1891 branchlength += GET2(cc,1);
1892 if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1893 cc += 2;
1894 cc += 1 + IMM2_SIZE + 1;
1895 break;
1896
1897 /* Handle single-char matchers */
1898
1899 case OP_PROP:
1900 case OP_NOTPROP:
1901 cc += 2;
1902 /* Fall through */
1903
1904 case OP_HSPACE:
1905 case OP_VSPACE:
1906 case OP_NOT_HSPACE:
1907 case OP_NOT_VSPACE:
1908 case OP_NOT_DIGIT:
1909 case OP_DIGIT:
1910 case OP_NOT_WHITESPACE:
1911 case OP_WHITESPACE:
1912 case OP_NOT_WORDCHAR:
1913 case OP_WORDCHAR:
1914 case OP_ANY:
1915 case OP_ALLANY:
1916 branchlength++;
1917 cc++;
1918 break;
1919
1920 /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1921 otherwise \C is coded as OP_ALLANY. */
1922
1923 case OP_ANYBYTE:
1924 return -2;
1925
1926 /* Check a class for variable quantification */
1927
1928 case OP_CLASS:
1929 case OP_NCLASS:
1930#if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1931 case OP_XCLASS:
1932 /* The original code caused an unsigned overflow in 64 bit systems,
1933 so now we use a conditional statement. */
1934 if (op == OP_XCLASS)
1935 cc += GET(cc, 1);
1936 else
1937 cc += PRIV(OP_lengths)[OP_CLASS];
1938#else
1939 cc += PRIV(OP_lengths)[OP_CLASS];
1940#endif
1941
1942 switch (*cc)
1943 {
1944 case OP_CRSTAR:
1945 case OP_CRMINSTAR:
1946 case OP_CRPLUS:
1947 case OP_CRMINPLUS:
1948 case OP_CRQUERY:
1949 case OP_CRMINQUERY:
1950 case OP_CRPOSSTAR:
1951 case OP_CRPOSPLUS:
1952 case OP_CRPOSQUERY:
1953 return -1;
1954
1955 case OP_CRRANGE:
1956 case OP_CRMINRANGE:
1957 case OP_CRPOSRANGE:
1958 if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1959 branchlength += (int)GET2(cc,1);
1960 cc += 1 + 2 * IMM2_SIZE;
1961 break;
1962
1963 default:
1964 branchlength++;
1965 }
1966 break;
1967
1968 /* Anything else is variable length */
1969
1970 case OP_ANYNL:
1971 case OP_BRAMINZERO:
1972 case OP_BRAPOS:
1973 case OP_BRAPOSZERO:
1974 case OP_BRAZERO:
1975 case OP_CBRAPOS:
1976 case OP_EXTUNI:
1977 case OP_KETRMAX:
1978 case OP_KETRMIN:
1979 case OP_KETRPOS:
1980 case OP_MINPLUS:
1981 case OP_MINPLUSI:
1982 case OP_MINQUERY:
1983 case OP_MINQUERYI:
1984 case OP_MINSTAR:
1985 case OP_MINSTARI:
1986 case OP_MINUPTO:
1987 case OP_MINUPTOI:
1988 case OP_NOTMINPLUS:
1989 case OP_NOTMINPLUSI:
1990 case OP_NOTMINQUERY:
1991 case OP_NOTMINQUERYI:
1992 case OP_NOTMINSTAR:
1993 case OP_NOTMINSTARI:
1994 case OP_NOTMINUPTO:
1995 case OP_NOTMINUPTOI:
1996 case OP_NOTPLUS:
1997 case OP_NOTPLUSI:
1998 case OP_NOTPOSPLUS:
1999 case OP_NOTPOSPLUSI:
2000 case OP_NOTPOSQUERY:
2001 case OP_NOTPOSQUERYI:
2002 case OP_NOTPOSSTAR:
2003 case OP_NOTPOSSTARI:
2004 case OP_NOTPOSUPTO:
2005 case OP_NOTPOSUPTOI:
2006 case OP_NOTQUERY:
2007 case OP_NOTQUERYI:
2008 case OP_NOTSTAR:
2009 case OP_NOTSTARI:
2010 case OP_NOTUPTO:
2011 case OP_NOTUPTOI:
2012 case OP_PLUS:
2013 case OP_PLUSI:
2014 case OP_POSPLUS:
2015 case OP_POSPLUSI:
2016 case OP_POSQUERY:
2017 case OP_POSQUERYI:
2018 case OP_POSSTAR:
2019 case OP_POSSTARI:
2020 case OP_POSUPTO:
2021 case OP_POSUPTOI:
2022 case OP_QUERY:
2023 case OP_QUERYI:
2024 case OP_REF:
2025 case OP_REFI:
2026 case OP_DNREF:
2027 case OP_DNREFI:
2028 case OP_SBRA:
2029 case OP_SBRAPOS:
2030 case OP_SCBRA:
2031 case OP_SCBRAPOS:
2032 case OP_SCOND:
2033 case OP_SKIPZERO:
2034 case OP_STAR:
2035 case OP_STARI:
2036 case OP_TYPEMINPLUS:
2037 case OP_TYPEMINQUERY:
2038 case OP_TYPEMINSTAR:
2039 case OP_TYPEMINUPTO:
2040 case OP_TYPEPLUS:
2041 case OP_TYPEPOSPLUS:
2042 case OP_TYPEPOSQUERY:
2043 case OP_TYPEPOSSTAR:
2044 case OP_TYPEPOSUPTO:
2045 case OP_TYPEQUERY:
2046 case OP_TYPESTAR:
2047 case OP_TYPEUPTO:
2048 case OP_UPTO:
2049 case OP_UPTOI:
2050 return -1;
2051
2052 /* Catch unrecognized opcodes so that when new ones are added they
2053 are not forgotten, as has happened in the past. */
2054
2055 default:
2056 return -4;
2057 }
2058 }
2059/* Control never gets here */
2060}
2061
2062
2063
2064/*************************************************
2065* Scan compiled regex for specific bracket *
2066*************************************************/
2067
2068/* This little function scans through a compiled pattern until it finds a
2069capturing bracket with the given number, or, if the number is negative, an
2070instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2071so that it can be called from pcre_study() when finding the minimum matching
2072length.
2073
2074Arguments:
2075 code points to start of expression
2076 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2077 number the required bracket number or negative to find a lookbehind
2078
2079Returns: pointer to the opcode for the bracket, or NULL if not found
2080*/
2081
2082const pcre_uchar *
2083PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2084{
2085for (;;)
2086 {
2087 register pcre_uchar c = *code;
2088
2089 if (c == OP_END) return NULL;
2090
2091 /* XCLASS is used for classes that cannot be represented just by a bit
2092 map. This includes negated single high-valued characters. The length in
2093 the table is zero; the actual length is stored in the compiled code. */
2094
2095 if (c == OP_XCLASS) code += GET(code, 1);
2096
2097 /* Handle recursion */
2098
2099 else if (c == OP_REVERSE)
2100 {
2101 if (number < 0) return (pcre_uchar *)code;
2102 code += PRIV(OP_lengths)[c];
2103 }
2104
2105 /* Handle capturing bracket */
2106
2107 else if (c == OP_CBRA || c == OP_SCBRA ||
2108 c == OP_CBRAPOS || c == OP_SCBRAPOS)
2109 {
2110 int n = (int)GET2(code, 1+LINK_SIZE);
2111 if (n == number) return (pcre_uchar *)code;
2112 code += PRIV(OP_lengths)[c];
2113 }
2114
2115 /* Otherwise, we can get the item's length from the table, except that for
2116 repeated character types, we have to test for \p and \P, which have an extra
2117 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2118 must add in its length. */
2119
2120 else
2121 {
2122 switch(c)
2123 {
2124 case OP_TYPESTAR:
2125 case OP_TYPEMINSTAR:
2126 case OP_TYPEPLUS:
2127 case OP_TYPEMINPLUS:
2128 case OP_TYPEQUERY:
2129 case OP_TYPEMINQUERY:
2130 case OP_TYPEPOSSTAR:
2131 case OP_TYPEPOSPLUS:
2132 case OP_TYPEPOSQUERY:
2133 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2134 break;
2135
2136 case OP_TYPEUPTO:
2137 case OP_TYPEMINUPTO:
2138 case OP_TYPEEXACT:
2139 case OP_TYPEPOSUPTO:
2140 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2141 code += 2;
2142 break;
2143
2144 case OP_MARK:
2145 case OP_PRUNE_ARG:
2146 case OP_SKIP_ARG:
2147 case OP_THEN_ARG:
2148 code += code[1];
2149 break;
2150 }
2151
2152 /* Add in the fixed length from the table */
2153
2154 code += PRIV(OP_lengths)[c];
2155
2156 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2157 a multi-byte character. The length in the table is a minimum, so we have to
2158 arrange to skip the extra bytes. */
2159
2160#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2161 if (utf) switch(c)
2162 {
2163 case OP_CHAR:
2164 case OP_CHARI:
2165 case OP_NOT:
2166 case OP_NOTI:
2167 case OP_EXACT:
2168 case OP_EXACTI:
2169 case OP_NOTEXACT:
2170 case OP_NOTEXACTI:
2171 case OP_UPTO:
2172 case OP_UPTOI:
2173 case OP_NOTUPTO:
2174 case OP_NOTUPTOI:
2175 case OP_MINUPTO:
2176 case OP_MINUPTOI:
2177 case OP_NOTMINUPTO:
2178 case OP_NOTMINUPTOI:
2179 case OP_POSUPTO:
2180 case OP_POSUPTOI:
2181 case OP_NOTPOSUPTO:
2182 case OP_NOTPOSUPTOI:
2183 case OP_STAR:
2184 case OP_STARI:
2185 case OP_NOTSTAR:
2186 case OP_NOTSTARI:
2187 case OP_MINSTAR:
2188 case OP_MINSTARI:
2189 case OP_NOTMINSTAR:
2190 case OP_NOTMINSTARI:
2191 case OP_POSSTAR:
2192 case OP_POSSTARI:
2193 case OP_NOTPOSSTAR:
2194 case OP_NOTPOSSTARI:
2195 case OP_PLUS:
2196 case OP_PLUSI:
2197 case OP_NOTPLUS:
2198 case OP_NOTPLUSI:
2199 case OP_MINPLUS:
2200 case OP_MINPLUSI:
2201 case OP_NOTMINPLUS:
2202 case OP_NOTMINPLUSI:
2203 case OP_POSPLUS:
2204 case OP_POSPLUSI:
2205 case OP_NOTPOSPLUS:
2206 case OP_NOTPOSPLUSI:
2207 case OP_QUERY:
2208 case OP_QUERYI:
2209 case OP_NOTQUERY:
2210 case OP_NOTQUERYI:
2211 case OP_MINQUERY:
2212 case OP_MINQUERYI:
2213 case OP_NOTMINQUERY:
2214 case OP_NOTMINQUERYI:
2215 case OP_POSQUERY:
2216 case OP_POSQUERYI:
2217 case OP_NOTPOSQUERY:
2218 case OP_NOTPOSQUERYI:
2219 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2220 break;
2221 }
2222#else
2223 (void)(utf); /* Keep compiler happy by referencing function argument */
2224#endif
2225 }
2226 }
2227}
2228
2229
2230
2231/*************************************************
2232* Scan compiled regex for recursion reference *
2233*************************************************/
2234
2235/* This little function scans through a compiled pattern until it finds an
2236instance of OP_RECURSE.
2237
2238Arguments:
2239 code points to start of expression
2240 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2241
2242Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
2243*/
2244
2245static const pcre_uchar *
2246find_recurse(const pcre_uchar *code, BOOL utf)
2247{
2248for (;;)
2249 {
2250 register pcre_uchar c = *code;
2251 if (c == OP_END) return NULL;
2252 if (c == OP_RECURSE) return code;
2253
2254 /* XCLASS is used for classes that cannot be represented just by a bit
2255 map. This includes negated single high-valued characters. The length in
2256 the table is zero; the actual length is stored in the compiled code. */
2257
2258 if (c == OP_XCLASS) code += GET(code, 1);
2259
2260 /* Otherwise, we can get the item's length from the table, except that for
2261 repeated character types, we have to test for \p and \P, which have an extra
2262 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2263 must add in its length. */
2264
2265 else
2266 {
2267 switch(c)
2268 {
2269 case OP_TYPESTAR:
2270 case OP_TYPEMINSTAR:
2271 case OP_TYPEPLUS:
2272 case OP_TYPEMINPLUS:
2273 case OP_TYPEQUERY:
2274 case OP_TYPEMINQUERY:
2275 case OP_TYPEPOSSTAR:
2276 case OP_TYPEPOSPLUS:
2277 case OP_TYPEPOSQUERY:
2278 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2279 break;
2280
2281 case OP_TYPEPOSUPTO:
2282 case OP_TYPEUPTO:
2283 case OP_TYPEMINUPTO:
2284 case OP_TYPEEXACT:
2285 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2286 code += 2;
2287 break;
2288
2289 case OP_MARK:
2290 case OP_PRUNE_ARG:
2291 case OP_SKIP_ARG:
2292 case OP_THEN_ARG:
2293 code += code[1];
2294 break;
2295 }
2296
2297 /* Add in the fixed length from the table */
2298
2299 code += PRIV(OP_lengths)[c];
2300
2301 /* In UTF-8 mode, opcodes that are followed by a character may be followed
2302 by a multi-byte character. The length in the table is a minimum, so we have
2303 to arrange to skip the extra bytes. */
2304
2305#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2306 if (utf) switch(c)
2307 {
2308 case OP_CHAR:
2309 case OP_CHARI:
2310 case OP_NOT:
2311 case OP_NOTI:
2312 case OP_EXACT:
2313 case OP_EXACTI:
2314 case OP_NOTEXACT:
2315 case OP_NOTEXACTI:
2316 case OP_UPTO:
2317 case OP_UPTOI:
2318 case OP_NOTUPTO:
2319 case OP_NOTUPTOI:
2320 case OP_MINUPTO:
2321 case OP_MINUPTOI:
2322 case OP_NOTMINUPTO:
2323 case OP_NOTMINUPTOI:
2324 case OP_POSUPTO:
2325 case OP_POSUPTOI:
2326 case OP_NOTPOSUPTO:
2327 case OP_NOTPOSUPTOI:
2328 case OP_STAR:
2329 case OP_STARI:
2330 case OP_NOTSTAR:
2331 case OP_NOTSTARI:
2332 case OP_MINSTAR:
2333 case OP_MINSTARI:
2334 case OP_NOTMINSTAR:
2335 case OP_NOTMINSTARI:
2336 case OP_POSSTAR:
2337 case OP_POSSTARI:
2338 case OP_NOTPOSSTAR:
2339 case OP_NOTPOSSTARI:
2340 case OP_PLUS:
2341 case OP_PLUSI:
2342 case OP_NOTPLUS:
2343 case OP_NOTPLUSI:
2344 case OP_MINPLUS:
2345 case OP_MINPLUSI:
2346 case OP_NOTMINPLUS:
2347 case OP_NOTMINPLUSI:
2348 case OP_POSPLUS:
2349 case OP_POSPLUSI:
2350 case OP_NOTPOSPLUS:
2351 case OP_NOTPOSPLUSI:
2352 case OP_QUERY:
2353 case OP_QUERYI:
2354 case OP_NOTQUERY:
2355 case OP_NOTQUERYI:
2356 case OP_MINQUERY:
2357 case OP_MINQUERYI:
2358 case OP_NOTMINQUERY:
2359 case OP_NOTMINQUERYI:
2360 case OP_POSQUERY:
2361 case OP_POSQUERYI:
2362 case OP_NOTPOSQUERY:
2363 case OP_NOTPOSQUERYI:
2364 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2365 break;
2366 }
2367#else
2368 (void)(utf); /* Keep compiler happy by referencing function argument */
2369#endif
2370 }
2371 }
2372}
2373
2374
2375
2376/*************************************************
2377* Scan compiled branch for non-emptiness *
2378*************************************************/
2379
2380/* This function scans through a branch of a compiled pattern to see whether it
2381can match the empty string or not. It is called from could_be_empty()
2382below and from compile_branch() when checking for an unlimited repeat of a
2383group that can match nothing. Note that first_significant_code() skips over
2384backward and negative forward assertions when its final argument is TRUE. If we
2385hit an unclosed bracket, we return "empty" - this means we've struck an inner
2386bracket whose current branch will already have been scanned.
2387
2388Arguments:
2389 code points to start of search
2390 endcode points to where to stop
2391 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2392 cd contains pointers to tables etc.
2393 recurses chain of recurse_check to catch mutual recursion
2394
2395Returns: TRUE if what is matched could be empty
2396*/
2397
2398static BOOL
2399could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2400 BOOL utf, compile_data *cd, recurse_check *recurses)
2401{
2402register pcre_uchar c;
2403recurse_check this_recurse;
2404
2405for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2406 code < endcode;
2407 code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2408 {
2409 const pcre_uchar *ccode;
2410
2411 c = *code;
2412
2413 /* Skip over forward assertions; the other assertions are skipped by
2414 first_significant_code() with a TRUE final argument. */
2415
2416 if (c == OP_ASSERT)
2417 {
2418 do code += GET(code, 1); while (*code == OP_ALT);
2419 c = *code;
2420 continue;
2421 }
2422
2423 /* For a recursion/subroutine call, if its end has been reached, which
2424 implies a backward reference subroutine call, we can scan it. If it's a
2425 forward reference subroutine call, we can't. To detect forward reference
2426 we have to scan up the list that is kept in the workspace. This function is
2427 called only when doing the real compile, not during the pre-compile that
2428 measures the size of the compiled pattern. */
2429
2430 if (c == OP_RECURSE)
2431 {
2432 const pcre_uchar *scode = cd->start_code + GET(code, 1);
2433 const pcre_uchar *endgroup = scode;
2434 BOOL empty_branch;
2435
2436 /* Test for forward reference or uncompleted reference. This is disabled
2437 when called to scan a completed pattern by setting cd->start_workspace to
2438 NULL. */
2439
2440 if (cd->start_workspace != NULL)
2441 {
2442 const pcre_uchar *tcode;
2443 for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2444 if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2445 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
2446 }
2447
2448 /* If the reference is to a completed group, we need to detect whether this
2449 is a recursive call, as otherwise there will be an infinite loop. If it is
2450 a recursion, just skip over it. Simple recursions are easily detected. For
2451 mutual recursions we keep a chain on the stack. */
2452
2453 do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2454 if (code >= scode && code <= endgroup) continue; /* Simple recursion */
2455 else
2456 {
2457 recurse_check *r = recurses;
2458 for (r = recurses; r != NULL; r = r->prev)
2459 if (r->group == scode) break;
2460 if (r != NULL) continue; /* Mutual recursion */
2461 }
2462
2463 /* Completed reference; scan the referenced group, remembering it on the
2464 stack chain to detect mutual recursions. */
2465
2466 empty_branch = FALSE;
2467 this_recurse.prev = recurses;
2468 this_recurse.group = scode;
2469
2470 do
2471 {
2472 if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2473 {
2474 empty_branch = TRUE;
2475 break;
2476 }
2477 scode += GET(scode, 1);
2478 }
2479 while (*scode == OP_ALT);
2480
2481 if (!empty_branch) return FALSE; /* All branches are non-empty */
2482 continue;
2483 }
2484
2485 /* Groups with zero repeats can of course be empty; skip them. */
2486
2487 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2488 c == OP_BRAPOSZERO)
2489 {
2490 code += PRIV(OP_lengths)[c];
2491 do code += GET(code, 1); while (*code == OP_ALT);
2492 c = *code;
2493 continue;
2494 }
2495
2496 /* A nested group that is already marked as "could be empty" can just be
2497 skipped. */
2498
2499 if (c == OP_SBRA || c == OP_SBRAPOS ||
2500 c == OP_SCBRA || c == OP_SCBRAPOS)
2501 {
2502 do code += GET(code, 1); while (*code == OP_ALT);
2503 c = *code;
2504 continue;
2505 }
2506
2507 /* For other groups, scan the branches. */
2508
2509 if (c == OP_BRA || c == OP_BRAPOS ||
2510 c == OP_CBRA || c == OP_CBRAPOS ||
2511 c == OP_ONCE || c == OP_ONCE_NC ||
2512 c == OP_COND || c == OP_SCOND)
2513 {
2514 BOOL empty_branch;
2515 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2516
2517 /* If a conditional group has only one branch, there is a second, implied,
2518 empty branch, so just skip over the conditional, because it could be empty.
2519 Otherwise, scan the individual branches of the group. */
2520
2521 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2522 code += GET(code, 1);
2523 else
2524 {
2525 empty_branch = FALSE;
2526 do
2527 {
2528 if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd,
2529 recurses)) empty_branch = TRUE;
2530 code += GET(code, 1);
2531 }
2532 while (*code == OP_ALT);
2533 if (!empty_branch) return FALSE; /* All branches are non-empty */
2534 }
2535
2536 c = *code;
2537 continue;
2538 }
2539
2540 /* Handle the other opcodes */
2541
2542 switch (c)
2543 {
2544 /* Check for quantifiers after a class. XCLASS is used for classes that
2545 cannot be represented just by a bit map. This includes negated single
2546 high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2547 actual length is stored in the compiled code, so we must update "code"
2548 here. */
2549
2550#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2551 case OP_XCLASS:
2552 ccode = code += GET(code, 1);
2553 goto CHECK_CLASS_REPEAT;
2554#endif
2555
2556 case OP_CLASS:
2557 case OP_NCLASS:
2558 ccode = code + PRIV(OP_lengths)[OP_CLASS];
2559
2560#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2561 CHECK_CLASS_REPEAT:
2562#endif
2563
2564 switch (*ccode)
2565 {
2566 case OP_CRSTAR: /* These could be empty; continue */
2567 case OP_CRMINSTAR:
2568 case OP_CRQUERY:
2569 case OP_CRMINQUERY:
2570 case OP_CRPOSSTAR:
2571 case OP_CRPOSQUERY:
2572 break;
2573
2574 default: /* Non-repeat => class must match */
2575 case OP_CRPLUS: /* These repeats aren't empty */
2576 case OP_CRMINPLUS:
2577 case OP_CRPOSPLUS:
2578 return FALSE;
2579
2580 case OP_CRRANGE:
2581 case OP_CRMINRANGE:
2582 case OP_CRPOSRANGE:
2583 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2584 break;
2585 }
2586 break;
2587
2588 /* Opcodes that must match a character */
2589
2590 case OP_ANY:
2591 case OP_ALLANY:
2592 case OP_ANYBYTE:
2593
2594 case OP_PROP:
2595 case OP_NOTPROP:
2596 case OP_ANYNL:
2597
2598 case OP_NOT_HSPACE:
2599 case OP_HSPACE:
2600 case OP_NOT_VSPACE:
2601 case OP_VSPACE:
2602 case OP_EXTUNI:
2603
2604 case OP_NOT_DIGIT:
2605 case OP_DIGIT:
2606 case OP_NOT_WHITESPACE:
2607 case OP_WHITESPACE:
2608 case OP_NOT_WORDCHAR:
2609 case OP_WORDCHAR:
2610
2611 case OP_CHAR:
2612 case OP_CHARI:
2613 case OP_NOT:
2614 case OP_NOTI:
2615
2616 case OP_PLUS:
2617 case OP_PLUSI:
2618 case OP_MINPLUS:
2619 case OP_MINPLUSI:
2620
2621 case OP_NOTPLUS:
2622 case OP_NOTPLUSI:
2623 case OP_NOTMINPLUS:
2624 case OP_NOTMINPLUSI:
2625
2626 case OP_POSPLUS:
2627 case OP_POSPLUSI:
2628 case OP_NOTPOSPLUS:
2629 case OP_NOTPOSPLUSI:
2630
2631 case OP_EXACT:
2632 case OP_EXACTI:
2633 case OP_NOTEXACT:
2634 case OP_NOTEXACTI:
2635
2636 case OP_TYPEPLUS:
2637 case OP_TYPEMINPLUS:
2638 case OP_TYPEPOSPLUS:
2639 case OP_TYPEEXACT:
2640
2641 return FALSE;
2642
2643 /* These are going to continue, as they may be empty, but we have to
2644 fudge the length for the \p and \P cases. */
2645
2646 case OP_TYPESTAR:
2647 case OP_TYPEMINSTAR:
2648 case OP_TYPEPOSSTAR:
2649 case OP_TYPEQUERY:
2650 case OP_TYPEMINQUERY:
2651 case OP_TYPEPOSQUERY:
2652 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2653 break;
2654
2655 /* Same for these */
2656
2657 case OP_TYPEUPTO:
2658 case OP_TYPEMINUPTO:
2659 case OP_TYPEPOSUPTO:
2660 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2661 code += 2;
2662 break;
2663
2664 /* End of branch */
2665
2666 case OP_KET:
2667 case OP_KETRMAX:
2668 case OP_KETRMIN:
2669 case OP_KETRPOS:
2670 case OP_ALT:
2671 return TRUE;
2672
2673 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2674 MINUPTO, and POSUPTO and their caseless and negative versions may be
2675 followed by a multibyte character. */
2676
2677#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2678 case OP_STAR:
2679 case OP_STARI:
2680 case OP_NOTSTAR:
2681 case OP_NOTSTARI:
2682
2683 case OP_MINSTAR:
2684 case OP_MINSTARI:
2685 case OP_NOTMINSTAR:
2686 case OP_NOTMINSTARI:
2687
2688 case OP_POSSTAR:
2689 case OP_POSSTARI:
2690 case OP_NOTPOSSTAR:
2691 case OP_NOTPOSSTARI:
2692
2693 case OP_QUERY:
2694 case OP_QUERYI:
2695 case OP_NOTQUERY:
2696 case OP_NOTQUERYI:
2697
2698 case OP_MINQUERY:
2699 case OP_MINQUERYI:
2700 case OP_NOTMINQUERY:
2701 case OP_NOTMINQUERYI:
2702
2703 case OP_POSQUERY:
2704 case OP_POSQUERYI:
2705 case OP_NOTPOSQUERY:
2706 case OP_NOTPOSQUERYI:
2707
2708 if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2709 break;
2710
2711 case OP_UPTO:
2712 case OP_UPTOI:
2713 case OP_NOTUPTO:
2714 case OP_NOTUPTOI:
2715
2716 case OP_MINUPTO:
2717 case OP_MINUPTOI:
2718 case OP_NOTMINUPTO:
2719 case OP_NOTMINUPTOI:
2720
2721 case OP_POSUPTO:
2722 case OP_POSUPTOI:
2723 case OP_NOTPOSUPTO:
2724 case OP_NOTPOSUPTOI:
2725
2726 if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2727 break;
2728#endif
2729
2730 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2731 string. */
2732
2733 case OP_MARK:
2734 case OP_PRUNE_ARG:
2735 case OP_SKIP_ARG:
2736 case OP_THEN_ARG:
2737 code += code[1];
2738 break;
2739
2740 /* None of the remaining opcodes are required to match a character. */
2741
2742 default:
2743 break;
2744 }
2745 }
2746
2747return TRUE;
2748}
2749
2750
2751
2752/*************************************************
2753* Scan compiled regex for non-emptiness *
2754*************************************************/
2755
2756/* This function is called to check for left recursive calls. We want to check
2757the current branch of the current pattern to see if it could match the empty
2758string. If it could, we must look outwards for branches at other levels,
2759stopping when we pass beyond the bracket which is the subject of the recursion.
2760This function is called only during the real compile, not during the
2761pre-compile.
2762
2763Arguments:
2764 code points to start of the recursion
2765 endcode points to where to stop (current RECURSE item)
2766 bcptr points to the chain of current (unclosed) branch starts
2767 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2768 cd pointers to tables etc
2769
2770Returns: TRUE if what is matched could be empty
2771*/
2772
2773static BOOL
2774could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2775 branch_chain *bcptr, BOOL utf, compile_data *cd)
2776{
2777while (bcptr != NULL && bcptr->current_branch >= code)
2778 {
2779 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2780 return FALSE;
2781 bcptr = bcptr->outer;
2782 }
2783return TRUE;
2784}
2785
2786
2787
2788/*************************************************
2789* Base opcode of repeated opcodes *
2790*************************************************/
2791
2792/* Returns the base opcode for repeated single character type opcodes. If the
2793opcode is not a repeated character type, it returns with the original value.
2794
2795Arguments: c opcode
2796Returns: base opcode for the type
2797*/
2798
2799static pcre_uchar
2800get_repeat_base(pcre_uchar c)
2801{
2802return (c > OP_TYPEPOSUPTO)? c :
2803 (c >= OP_TYPESTAR)? OP_TYPESTAR :
2804 (c >= OP_NOTSTARI)? OP_NOTSTARI :
2805 (c >= OP_NOTSTAR)? OP_NOTSTAR :
2806 (c >= OP_STARI)? OP_STARI :
2807 OP_STAR;
2808}
2809
2810
2811
#ifdef SUPPORT_UCP
/*************************************************
*        Check a character and a property        *
*************************************************/

/* Called, according to the file's existing notes, by check_auto_possessive()
when a property item is adjacent to a fixed character. The character is tested
against the property, and the outcome is compared with the "negated" flag: the
result is TRUE when a positive item is not satisfied by the character, or when
a negated item is. A property type that is not handled here gives FALSE.

Arguments:
  c            the character
  ptype        the property type
  pdata        the data for the type
  negated      TRUE if it's a negated property (\P or \p{^)

Returns:       TRUE if auto-possessifying is OK
*/

static BOOL
check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
  BOOL negated)
{
const ucd_record *prop = GET_UCD(c);
const pcre_uint32 *set;
BOOL has_prop;

switch(ptype)
  {
  case PT_LAMP:
  has_prop = prop->chartype == ucp_Lu ||
             prop->chartype == ucp_Ll ||
             prop->chartype == ucp_Lt;
  break;

  case PT_GC:
  has_prop = pdata == PRIV(ucp_gentype)[prop->chartype];
  break;

  case PT_PC:
  has_prop = pdata == prop->chartype;
  break;

  case PT_SC:
  has_prop = pdata == prop->script;
  break;

  /* The remaining types are specials. */

  case PT_ALNUM:
  has_prop = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
             PRIV(ucp_gentype)[prop->chartype] == ucp_N;
  break;

  /* Perl space once left out VT but has included it since Perl 5.18, which
  makes Perl space and POSIX space the same thing. PCRE followed suit at
  release 8.34. The explicitly listed space characters are answered
  directly; any other character is judged by its general category. */

  case PT_SPACE:    /* Perl space */
  case PT_PXSPACE:  /* POSIX space */
  switch(c)
    {
    HSPACE_CASES:
    VSPACE_CASES:
    return negated;

    default:
    break;
    }
  has_prop = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
  break;

  case PT_WORD:
  has_prop = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
             PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
             c == CHAR_UNDERSCORE;
  break;

  /* Search the caseless set for the character. The walk stops at the first
  entry that is not below c, so it presumably relies on the set being in
  ascending order and ending with a value no character exceeds - confirm
  against the table definition. */

  case PT_CLIST:
  set = PRIV(ucd_caseless_sets) + prop->caseset;
  while (c > *set) set++;
  return (c == *set)? negated : !negated;

  default:
  return FALSE;
  }

return has_prop == negated;
}
#endif  /* SUPPORT_UCP */
2893
2894
2895
2896/*************************************************
2897* Fill the character property list *
2898*************************************************/
2899
2900/* Checks whether the code points to an opcode that can take part in auto-
2901possessification, and if so, fills a list with its properties.
2902
2903Arguments:
2904 code points to start of expression
2905 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2906 fcc points to case-flipping table
2907 list points to output list
2908 list[0] will be filled with the opcode
2909 list[1] will be non-zero if this opcode
2910 can match an empty character string
2911 list[2..7] depends on the opcode
2912
2913Returns: points to the start of the next opcode if *code is accepted
2914 NULL if *code is not accepted
2915*/
2916
2917static const pcre_uchar *
2918get_chr_property_list(const pcre_uchar *code, BOOL utf,
2919 const pcre_uint8 *fcc, pcre_uint32 *list)
2920{
2921pcre_uchar c = *code;
2922pcre_uchar base;
2923const pcre_uchar *end;
2924pcre_uint32 chr;
2925
2926#ifdef SUPPORT_UCP
2927pcre_uint32 *clist_dest;
2928const pcre_uint32 *clist_src;
2929#else
2930utf = utf; /* Suppress "unused parameter" compiler warning */
2931#endif
2932
2933list[0] = c;
2934list[1] = FALSE;
2935code++;
2936
2937if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2938 {
2939 base = get_repeat_base(c);
2940 c -= (base - OP_STAR);
2941
2942 if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2943 code += IMM2_SIZE;
2944
2945 list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2946
2947 switch(base)
2948 {
2949 case OP_STAR:
2950 list[0] = OP_CHAR;
2951 break;
2952
2953 case OP_STARI:
2954 list[0] = OP_CHARI;
2955 break;
2956
2957 case OP_NOTSTAR:
2958 list[0] = OP_NOT;
2959 break;
2960
2961 case OP_NOTSTARI:
2962 list[0] = OP_NOTI;
2963 break;
2964
2965 case OP_TYPESTAR:
2966 list[0] = *code;
2967 code++;
2968 break;
2969 }
2970 c = list[0];
2971 }
2972
2973switch(c)
2974 {
2975 case OP_NOT_DIGIT:
2976 case OP_DIGIT:
2977 case OP_NOT_WHITESPACE:
2978 case OP_WHITESPACE:
2979 case OP_NOT_WORDCHAR:
2980 case OP_WORDCHAR:
2981 case OP_ANY:
2982 case OP_ALLANY:
2983 case OP_ANYNL:
2984 case OP_NOT_HSPACE:
2985 case OP_HSPACE:
2986 case OP_NOT_VSPACE:
2987 case OP_VSPACE:
2988 case OP_EXTUNI:
2989 case OP_EODN:
2990 case OP_EOD:
2991 case OP_DOLL:
2992 case OP_DOLLM:
2993 return code;
2994
2995 case OP_CHAR:
2996 case OP_NOT:
2997 GETCHARINCTEST(chr, code);
2998 list[2] = chr;
2999 list[3] = NOTACHAR;
3000 return code;
3001
3002 case OP_CHARI:
3003 case OP_NOTI:
3004 list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
3005 GETCHARINCTEST(chr, code);
3006 list[2] = chr;
3007
3008#ifdef SUPPORT_UCP
3009 if (chr < 128 || (chr < 256 && !utf))
3010 list[3] = fcc[chr];
3011 else
3012 list[3] = UCD_OTHERCASE(chr);
3013#elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
3014 list[3] = (chr < 256) ? fcc[chr] : chr;
3015#else
3016 list[3] = fcc[chr];
3017#endif
3018
3019 /* The othercase might be the same value. */
3020
3021 if (chr == list[3])
3022 list[3] = NOTACHAR;
3023 else
3024 list[4] = NOTACHAR;
3025 return code;
3026
3027#ifdef SUPPORT_UCP
3028 case OP_PROP:
3029 case OP_NOTPROP:
3030 if (code[0] != PT_CLIST)
3031 {
3032 list[2] = code[0];
3033 list[3] = code[1];
3034 return code + 2;
3035 }
3036
3037 /* Convert only if we have enough space. */
3038
3039 clist_src = PRIV(ucd_caseless_sets) + code[1];
3040 clist_dest = list + 2;
3041 code += 2;
3042
3043 do {
3044 if (clist_dest >= list + 8)
3045 {
3046 /* Early return if there is not enough space. This should never
3047 happen, since all clists are shorter than 5 character now. */
3048 list[2] = code[0];
3049 list[3] = code[1];
3050 return code;
3051 }
3052 *clist_dest++ = *clist_src;
3053 }
3054 while(*clist_src++ != NOTACHAR);
3055
3056 /* All characters are stored. The terminating NOTACHAR
3057 is copied form the clist itself. */
3058
3059 list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
3060 return code;
3061#endif
3062
3063 case OP_NCLASS:
3064 case OP_CLASS:
3065#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3066 case OP_XCLASS:
3067 if (c == OP_XCLASS)
3068 end = code + GET(code, 0) - 1;
3069 else
3070#endif
3071 end = code + 32 / sizeof(pcre_uchar);
3072
3073 switch(*end)
3074 {
3075 case OP_CRSTAR:
3076 case OP_CRMINSTAR:
3077 case OP_CRQUERY:
3078 case OP_CRMINQUERY:
3079 case OP_CRPOSSTAR:
3080 case OP_CRPOSQUERY:
3081 list[1] = TRUE;
3082 end++;
3083 break;
3084
3085 case OP_CRPLUS:
3086 case OP_CRMINPLUS:
3087 case OP_CRPOSPLUS:
3088 end++;
3089 break;
3090
3091 case OP_CRRANGE:
3092 case OP_CRMINRANGE:
3093 case OP_CRPOSRANGE:
3094 list[1] = (GET2(end, 1) == 0);
3095 end += 1 + 2 * IMM2_SIZE;
3096 break;
3097 }
3098 list[2] = (pcre_uint32)(end - code);
3099 return end;
3100 }
3101return NULL; /* Opcode not accepted */
3102}
3103
3104
3105
3106/*************************************************
3107* Scan further character sets for match *
3108*************************************************/
3109
3110/* Checks whether the base and the current opcode have a common character, in
3111which case the base cannot be possessified.
3112
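Arguments:
  code        points to the byte code
  utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
  cd          static compile data
  base_list   the data list of the base opcode
  base_end    points to the end of the base opcode's item (used to locate
                its class data)
  rec_limit   points to a counter that is decremented on every call; when it
                has reached zero the function gives up and returns FALSE

Returns:      TRUE if the auto-possessification is possible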
3120*/
3121
3122static BOOL
3123compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3124 const pcre_uint32 *base_list, const pcre_uchar *base_end, int *rec_limit)
3125{
3126pcre_uchar c;
3127pcre_uint32 list[8];
3128const pcre_uint32 *chr_ptr;
3129const pcre_uint32 *ochr_ptr;
3130const pcre_uint32 *list_ptr;
3131const pcre_uchar *next_code;
3132#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3133const pcre_uchar *xclass_flags;
3134#endif
3135const pcre_uint8 *class_bitset;
3136const pcre_uint8 *set1, *set2, *set_end;
3137pcre_uint32 chr;
3138BOOL accepted, invert_bits;
3139BOOL entered_a_group = FALSE;
3140
3141if (*rec_limit == 0) return FALSE;
3142--(*rec_limit);
3143
3144/* Note: the base_list[1] contains whether the current opcode has greedy
3145(represented by a non-zero value) quantifier. This is a different from
3146other character type lists, which stores here that the character iterator
3147matches to an empty string (also represented by a non-zero value). */
3148
3149for(;;)
3150 {
3151 /* All operations move the code pointer forward.
3152 Therefore infinite recursions are not possible. */
3153
3154 c = *code;
3155
3156 /* Skip over callouts */
3157
3158 if (c == OP_CALLOUT)
3159 {
3160 code += PRIV(OP_lengths)[c];
3161 continue;
3162 }
3163
3164 if (c == OP_ALT)
3165 {
3166 do code += GET(code, 1); while (*code == OP_ALT);
3167 c = *code;
3168 }
3169
3170 switch(c)
3171 {
3172 case OP_END:
3173 case OP_KETRPOS:
3174 /* TRUE only in greedy case. The non-greedy case could be replaced by
3175 an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3176 uses more memory, which we cannot get at this stage.) */
3177
3178 return base_list[1] != 0;
3179
3180 case OP_KET:
3181 /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3182 it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3183 cannot be converted to a possessive form. */
3184
3185 if (base_list[1] == 0) return FALSE;
3186
3187 switch(*(code - GET(code, 1)))
3188 {
3189 case OP_ASSERT:
3190 case OP_ASSERT_NOT:
3191 case OP_ASSERTBACK:
3192 case OP_ASSERTBACK_NOT:
3193 case OP_ONCE:
3194 case OP_ONCE_NC:
3195 /* Atomic sub-patterns and assertions can always auto-possessify their
3196 last iterator. However, if the group was entered as a result of checking
3197 a previous iterator, this is not possible. */
3198
3199 return !entered_a_group;
3200 }
3201
3202 code += PRIV(OP_lengths)[c];
3203 continue;
3204
3205 case OP_ONCE:
3206 case OP_ONCE_NC:
3207 case OP_BRA:
3208 case OP_CBRA:
3209 next_code = code + GET(code, 1);
3210 code += PRIV(OP_lengths)[c];
3211
3212 while (*next_code == OP_ALT)
3213 {
3214 if (!compare_opcodes(code, utf, cd, base_list, base_end, rec_limit))
3215 return FALSE;
3216 code = next_code + 1 + LINK_SIZE;
3217 next_code += GET(next_code, 1);
3218 }
3219
3220 entered_a_group = TRUE;
3221 continue;
3222
3223 case OP_BRAZERO:
3224 case OP_BRAMINZERO:
3225
3226 next_code = code + 1;
3227 if (*next_code != OP_BRA && *next_code != OP_CBRA
3228 && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3229
3230 do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3231
3232 /* The bracket content will be checked by the
3233 OP_BRA/OP_CBRA case above. */
3234 next_code += 1 + LINK_SIZE;
3235 if (!compare_opcodes(next_code, utf, cd, base_list, base_end, rec_limit))
3236 return FALSE;
3237
3238 code += PRIV(OP_lengths)[c];
3239 continue;
3240
3241 default:
3242 break;
3243 }
3244
3245 /* Check for a supported opcode, and load its properties. */
3246
3247 code = get_chr_property_list(code, utf, cd->fcc, list);
3248 if (code == NULL) return FALSE; /* Unsupported */
3249
3250 /* If either opcode is a small character list, set pointers for comparing
3251 characters from that list with another list, or with a property. */
3252
3253 if (base_list[0] == OP_CHAR)
3254 {
3255 chr_ptr = base_list + 2;
3256 list_ptr = list;
3257 }
3258 else if (list[0] == OP_CHAR)
3259 {
3260 chr_ptr = list + 2;
3261 list_ptr = base_list;
3262 }
3263
3264 /* Character bitsets can also be compared to certain opcodes. */
3265
3266 else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3267#ifdef COMPILE_PCRE8
3268 /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3269 || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3270#endif
3271 )
3272 {
3273#ifdef COMPILE_PCRE8
3274 if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3275#else
3276 if (base_list[0] == OP_CLASS)
3277#endif
3278 {
3279 set1 = (pcre_uint8 *)(base_end - base_list[2]);
3280 list_ptr = list;
3281 }
3282 else
3283 {
3284 set1 = (pcre_uint8 *)(code - list[2]);
3285 list_ptr = base_list;
3286 }
3287
3288 invert_bits = FALSE;
3289 switch(list_ptr[0])
3290 {
3291 case OP_CLASS:
3292 case OP_NCLASS:
3293 set2 = (pcre_uint8 *)
3294 ((list_ptr == list ? code : base_end) - list_ptr[2]);
3295 break;
3296
3297#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3298 case OP_XCLASS:
3299 xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
3300 if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
3301 if ((*xclass_flags & XCL_MAP) == 0)
3302 {
3303 /* No bits are set for characters < 256. */
3304 if (list[1] == 0) return TRUE;
3305 /* Might be an empty repeat. */
3306 continue;
3307 }
3308 set2 = (pcre_uint8 *)(xclass_flags + 1);
3309 break;
3310#endif
3311
3312 case OP_NOT_DIGIT:
3313 invert_bits = TRUE;
3314 /* Fall through */
3315 case OP_DIGIT:
3316 set2 = (pcre_uint8 *)(cd->cbits + cbit_digit);
3317 break;
3318
3319 case OP_NOT_WHITESPACE:
3320 invert_bits = TRUE;
3321 /* Fall through */
3322 case OP_WHITESPACE:
3323 set2 = (pcre_uint8 *)(cd->cbits + cbit_space);
3324 break;
3325
3326 case OP_NOT_WORDCHAR:
3327 invert_bits = TRUE;
3328 /* Fall through */
3329 case OP_WORDCHAR:
3330 set2 = (pcre_uint8 *)(cd->cbits + cbit_word);
3331 break;
3332
3333 default:
3334 return FALSE;
3335 }
3336
3337 /* Because the sets are unaligned, we need
3338 to perform byte comparison here. */
3339 set_end = set1 + 32;
3340 if (invert_bits)
3341 {
3342 do
3343 {
3344 if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3345 }
3346 while (set1 < set_end);
3347 }
3348 else
3349 {
3350 do
3351 {
3352 if ((*set1++ & *set2++) != 0) return FALSE;
3353 }
3354 while (set1 < set_end);
3355 }
3356
3357 if (list[1] == 0) return TRUE;
3358 /* Might be an empty repeat. */
3359 continue;
3360 }
3361
3362 /* Some property combinations also acceptable. Unicode property opcodes are
3363 processed specially; the rest can be handled with a lookup table. */
3364
3365 else
3366 {
3367 pcre_uint32 leftop, rightop;
3368
3369 leftop = base_list[0];
3370 rightop = list[0];
3371
3372#ifdef SUPPORT_UCP
3373 accepted = FALSE; /* Always set in non-unicode case. */
3374 if (leftop == OP_PROP || leftop == OP_NOTPROP)
3375 {
3376 if (rightop == OP_EOD)
3377 accepted = TRUE;
3378 else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3379 {
3380 int n;
3381 const pcre_uint8 *p;
3382 BOOL same = leftop == rightop;
3383 BOOL lisprop = leftop == OP_PROP;
3384 BOOL risprop = rightop == OP_PROP;
3385 BOOL bothprop = lisprop && risprop;
3386
3387 /* There's a table that specifies how each combination is to be
3388 processed:
3389 0 Always return FALSE (never auto-possessify)
3390 1 Character groups are distinct (possessify if both are OP_PROP)
3391 2 Check character categories in the same group (general or particular)
3392 3 Return TRUE if the two opcodes are not the same
3393 ... see comments below
3394 */
3395
3396 n = propposstab[base_list[2]][list[2]];
3397 switch(n)
3398 {
3399 case 0: break;
3400 case 1: accepted = bothprop; break;
3401 case 2: accepted = (base_list[3] == list[3]) != same; break;
3402 case 3: accepted = !same; break;
3403
3404 case 4: /* Left general category, right particular category */
3405 accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3406 break;
3407
3408 case 5: /* Right general category, left particular category */
3409 accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3410 break;
3411
3412 /* This code is logically tricky. Think hard before fiddling with it.
3413 The posspropstab table has four entries per row. Each row relates to
3414 one of PCRE's special properties such as ALNUM or SPACE or WORD.
3415 Only WORD actually needs all four entries, but using repeats for the
3416 others means they can all use the same code below.
3417
3418 The first two entries in each row are Unicode general categories, and
3419 apply always, because all the characters they include are part of the
3420 PCRE character set. The third and fourth entries are a general and a
3421 particular category, respectively, that include one or more relevant
3422 characters. One or the other is used, depending on whether the check
3423 is for a general or a particular category. However, in both cases the
3424 category contains more characters than the specials that are defined
3425 for the property being tested against. Therefore, it cannot be used
3426 in a NOTPROP case.
3427
3428 Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3429 Underscore is covered by ucp_P or ucp_Po. */
3430
3431 case 6: /* Left alphanum vs right general category */
3432 case 7: /* Left space vs right general category */
3433 case 8: /* Left word vs right general category */
3434 p = posspropstab[n-6];
3435 accepted = risprop && lisprop ==
3436 (list[3] != p[0] &&
3437 list[3] != p[1] &&
3438 (list[3] != p[2] || !lisprop));
3439 break;
3440
3441 case 9: /* Right alphanum vs left general category */
3442 case 10: /* Right space vs left general category */
3443 case 11: /* Right word vs left general category */
3444 p = posspropstab[n-9];
3445 accepted = lisprop && risprop ==
3446 (base_list[3] != p[0] &&
3447 base_list[3] != p[1] &&
3448 (base_list[3] != p[2] || !risprop));
3449 break;
3450
3451 case 12: /* Left alphanum vs right particular category */
3452 case 13: /* Left space vs right particular category */
3453 case 14: /* Left word vs right particular category */
3454 p = posspropstab[n-12];
3455 accepted = risprop && lisprop ==
3456 (catposstab[p[0]][list[3]] &&
3457 catposstab[p[1]][list[3]] &&
3458 (list[3] != p[3] || !lisprop));
3459 break;
3460
3461 case 15: /* Right alphanum vs left particular category */
3462 case 16: /* Right space vs left particular category */
3463 case 17: /* Right word vs left particular category */
3464 p = posspropstab[n-15];
3465 accepted = lisprop && risprop ==
3466 (catposstab[p[0]][base_list[3]] &&
3467 catposstab[p[1]][base_list[3]] &&
3468 (base_list[3] != p[3] || !risprop));
3469 break;
3470 }
3471 }
3472 }
3473
3474 else
3475#endif /* SUPPORT_UCP */
3476
3477 accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3478 rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3479 autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3480
3481 if (!accepted) return FALSE;
3482
3483 if (list[1] == 0) return TRUE;
3484 /* Might be an empty repeat. */
3485 continue;
3486 }
3487
3488 /* Control reaches here only if one of the items is a small character list.
3489 All characters are checked against the other side. */
3490
3491 do
3492 {
3493 chr = *chr_ptr;
3494
3495 switch(list_ptr[0])
3496 {
3497 case OP_CHAR:
3498 ochr_ptr = list_ptr + 2;
3499 do
3500 {
3501 if (chr == *ochr_ptr) return FALSE;
3502 ochr_ptr++;
3503 }
3504 while(*ochr_ptr != NOTACHAR);
3505 break;
3506
3507 case OP_NOT:
3508 ochr_ptr = list_ptr + 2;
3509 do
3510 {
3511 if (chr == *ochr_ptr)
3512 break;
3513 ochr_ptr++;
3514 }
3515 while(*ochr_ptr != NOTACHAR);
3516 if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */
3517 break;
3518
3519 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3520 set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3521
3522 case OP_DIGIT:
3523 if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3524 break;
3525
3526 case OP_NOT_DIGIT:
3527 if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3528 break;
3529
3530 case OP_WHITESPACE:
3531 if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3532 break;
3533
3534 case OP_NOT_WHITESPACE:
3535 if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3536 break;
3537
3538 case OP_WORDCHAR:
3539 if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3540 break;
3541
3542 case OP_NOT_WORDCHAR:
3543 if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3544 break;
3545
3546 case OP_HSPACE:
3547 switch(chr)
3548 {
3549 HSPACE_CASES: return FALSE;
3550 default: break;
3551 }
3552 break;
3553
3554 case OP_NOT_HSPACE:
3555 switch(chr)
3556 {
3557 HSPACE_CASES: break;
3558 default: return FALSE;
3559 }
3560 break;
3561
3562 case OP_ANYNL:
3563 case OP_VSPACE:
3564 switch(chr)
3565 {
3566 VSPACE_CASES: return FALSE;
3567 default: break;
3568 }
3569 break;
3570
3571 case OP_NOT_VSPACE:
3572 switch(chr)
3573 {
3574 VSPACE_CASES: break;
3575 default: return FALSE;
3576 }
3577 break;
3578
3579 case OP_DOLL:
3580 case OP_EODN:
3581 switch (chr)
3582 {
3583 case CHAR_CR:
3584 case CHAR_LF:
3585 case CHAR_VT:
3586 case CHAR_FF:
3587 case CHAR_NEL:
3588#ifndef EBCDIC
3589 case 0x2028:
3590 case 0x2029:
3591#endif /* Not EBCDIC */
3592 return FALSE;
3593 }
3594 break;
3595
3596 case OP_EOD: /* Can always possessify before \z */
3597 break;
3598
3599#ifdef SUPPORT_UCP
3600 case OP_PROP:
3601 case OP_NOTPROP:
3602 if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3603 list_ptr[0] == OP_NOTPROP))
3604 return FALSE;
3605 break;
3606#endif
3607
3608 case OP_NCLASS:
3609 if (chr > 255) return FALSE;
3610 /* Fall through */
3611
3612 case OP_CLASS:
3613 if (chr > 255) break;
3614 class_bitset = (pcre_uint8 *)
3615 ((list_ptr == list ? code : base_end) - list_ptr[2]);
3616 if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
3617 break;
3618
3619#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3620 case OP_XCLASS:
3621 if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3622 list_ptr[2] + LINK_SIZE, utf)) return FALSE;
3623 break;
3624#endif
3625
3626 default:
3627 return FALSE;
3628 }
3629
3630 chr_ptr++;
3631 }
3632 while(*chr_ptr != NOTACHAR);
3633
3634 /* At least one character must be matched from this opcode. */
3635
3636 if (list[1] == 0) return TRUE;
3637 }
3638
3639/* Control never reaches here. There used to be a fail-save return FALSE; here,
3640but some compilers complain about an unreachable statement. */
3641
3642}
3643
3644
3645
3646/*************************************************
3647* Scan compiled regex for auto-possession *
3648*************************************************/
3649
3650/* Replaces single character iterations with their possessive alternatives
3651if appropriate. This function modifies the compiled opcode!
3652
3653Arguments:
3654 code points to start of the byte code
3655 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3656 cd static compile data
3657
3658Returns: nothing
3659*/
3660
3661static void
3662auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3663{
3664register pcre_uchar c;
3665const pcre_uchar *end;
3666pcre_uchar *repeat_opcode;
3667pcre_uint32 list[8];
3668int rec_limit;
3669
3670for (;;)
3671 {
3672 c = *code;
3673
3674 /* When a pattern with bad UTF-8 encoding is compiled with NO_UTF_CHECK,
3675 it may compile without complaining, but may get into a loop here if the code
3676 pointer points to a bad value. This is, of course a documentated possibility,
3677 when NO_UTF_CHECK is set, so it isn't a bug, but we can detect this case and
3678 just give up on this optimization. */
3679
3680 if (c >= OP_TABLE_LENGTH) return;
3681
3682 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3683 {
3684 c -= get_repeat_base(c) - OP_STAR;
3685 end = (c <= OP_MINUPTO) ?
3686 get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3687 list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3688
3689 rec_limit = 1000;
3690 if (end != NULL && compare_opcodes(end, utf, cd, list, end, &rec_limit))
3691 {
3692 switch(c)
3693 {
3694 case OP_STAR:
3695 *code += OP_POSSTAR - OP_STAR;
3696 break;
3697
3698 case OP_MINSTAR:
3699 *code += OP_POSSTAR - OP_MINSTAR;
3700 break;
3701
3702 case OP_PLUS:
3703 *code += OP_POSPLUS - OP_PLUS;
3704 break;
3705
3706 case OP_MINPLUS:
3707 *code += OP_POSPLUS - OP_MINPLUS;
3708 break;
3709
3710 case OP_QUERY:
3711 *code += OP_POSQUERY - OP_QUERY;
3712 break;
3713
3714 case OP_MINQUERY:
3715 *code += OP_POSQUERY - OP_MINQUERY;
3716 break;
3717
3718 case OP_UPTO:
3719 *code += OP_POSUPTO - OP_UPTO;
3720 break;
3721
3722 case OP_MINUPTO:
3723 *code += OP_POSUPTO - OP_MINUPTO;
3724 break;
3725 }
3726 }
3727 c = *code;
3728 }
3729 else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3730 {
3731#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3732 if (c == OP_XCLASS)
3733 repeat_opcode = code + GET(code, 1);
3734 else
3735#endif
3736 repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3737
3738 c = *repeat_opcode;
3739 if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
3740 {
3741 /* end must not be NULL. */
3742 end = get_chr_property_list(code, utf, cd->fcc, list);
3743
3744 list[1] = (c & 1) == 0;
3745
3746 rec_limit = 1000;
3747 if (compare_opcodes(end, utf, cd, list, end, &rec_limit))
3748 {
3749 switch (c)
3750 {
3751 case OP_CRSTAR:
3752 case OP_CRMINSTAR:
3753 *repeat_opcode = OP_CRPOSSTAR;
3754 break;
3755
3756 case OP_CRPLUS:
3757 case OP_CRMINPLUS:
3758 *repeat_opcode = OP_CRPOSPLUS;
3759 break;
3760
3761 case OP_CRQUERY:
3762 case OP_CRMINQUERY:
3763 *repeat_opcode = OP_CRPOSQUERY;
3764 break;
3765
3766 case OP_CRRANGE:
3767 case OP_CRMINRANGE:
3768 *repeat_opcode = OP_CRPOSRANGE;
3769 break;
3770 }
3771 }
3772 }
3773 c = *code;
3774 }
3775
3776 switch(c)
3777 {
3778 case OP_END:
3779 return;
3780
3781 case OP_TYPESTAR:
3782 case OP_TYPEMINSTAR:
3783 case OP_TYPEPLUS:
3784 case OP_TYPEMINPLUS:
3785 case OP_TYPEQUERY:
3786 case OP_TYPEMINQUERY:
3787 case OP_TYPEPOSSTAR:
3788 case OP_TYPEPOSPLUS:
3789 case OP_TYPEPOSQUERY:
3790 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
3791 break;
3792
3793 case OP_TYPEUPTO:
3794 case OP_TYPEMINUPTO:
3795 case OP_TYPEEXACT:
3796 case OP_TYPEPOSUPTO:
3797 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
3798 code += 2;
3799 break;
3800
3801#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3802 case OP_XCLASS:
3803 code += GET(code, 1);
3804 break;
3805#endif
3806
3807 case OP_MARK:
3808 case OP_PRUNE_ARG:
3809 case OP_SKIP_ARG:
3810 case OP_THEN_ARG:
3811 code += code[1];
3812 break;
3813 }
3814
3815 /* Add in the fixed length from the table */
3816
3817 code += PRIV(OP_lengths)[c];
3818
3819 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3820 a multi-byte character. The length in the table is a minimum, so we have to
3821 arrange to skip the extra bytes. */
3822
3823#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3824 if (utf) switch(c)
3825 {
3826 case OP_CHAR:
3827 case OP_CHARI:
3828 case OP_NOT:
3829 case OP_NOTI:
3830 case OP_STAR:
3831 case OP_MINSTAR:
3832 case OP_PLUS:
3833 case OP_MINPLUS:
3834 case OP_QUERY:
3835 case OP_MINQUERY:
3836 case OP_UPTO:
3837 case OP_MINUPTO:
3838 case OP_EXACT:
3839 case OP_POSSTAR:
3840 case OP_POSPLUS:
3841 case OP_POSQUERY:
3842 case OP_POSUPTO:
3843 case OP_STARI:
3844 case OP_MINSTARI:
3845 case OP_PLUSI:
3846 case OP_MINPLUSI:
3847 case OP_QUERYI:
3848 case OP_MINQUERYI:
3849 case OP_UPTOI:
3850 case OP_MINUPTOI:
3851 case OP_EXACTI:
3852 case OP_POSSTARI:
3853 case OP_POSPLUSI:
3854 case OP_POSQUERYI:
3855 case OP_POSUPTOI:
3856 case OP_NOTSTAR:
3857 case OP_NOTMINSTAR:
3858 case OP_NOTPLUS:
3859 case OP_NOTMINPLUS:
3860 case OP_NOTQUERY:
3861 case OP_NOTMINQUERY:
3862 case OP_NOTUPTO:
3863 case OP_NOTMINUPTO:
3864 case OP_NOTEXACT:
3865 case OP_NOTPOSSTAR:
3866 case OP_NOTPOSPLUS:
3867 case OP_NOTPOSQUERY:
3868 case OP_NOTPOSUPTO:
3869 case OP_NOTSTARI:
3870 case OP_NOTMINSTARI:
3871 case OP_NOTPLUSI:
3872 case OP_NOTMINPLUSI:
3873 case OP_NOTQUERYI:
3874 case OP_NOTMINQUERYI:
3875 case OP_NOTUPTOI:
3876 case OP_NOTMINUPTOI:
3877 case OP_NOTEXACTI:
3878 case OP_NOTPOSSTARI:
3879 case OP_NOTPOSPLUSI:
3880 case OP_NOTPOSQUERYI:
3881 case OP_NOTPOSUPTOI:
3882 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3883 break;
3884 }
3885#else
3886 (void)(utf); /* Keep compiler happy by referencing function argument */
3887#endif
3888 }
3889}
3890
3891
3892
3893/*************************************************
3894* Check for POSIX class syntax *
3895*************************************************/
3896
3897/* This function is called when the sequence "[:" or "[." or "[=" is
3898encountered in a character class. It checks whether this is followed by a
3899sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3900reach an unescaped ']' without the special preceding character, return FALSE.
3901
3902Originally, this function only recognized a sequence of letters between the
3903terminators, but it seems that Perl recognizes any sequence of characters,
3904though of course unknown POSIX names are subsequently rejected. Perl gives an
3905"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3906didn't consider this to be a POSIX class. Likewise for [:1234:].
3907
3908The problem in trying to be exactly like Perl is in the handling of escapes. We
3909have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3910class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3911below handles the special cases \\ and \], but does not try to do any other
3912escape processing. This makes it different from Perl for cases such as
3913[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
3914not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
3915when Perl does, I think.
3916
3917A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3918It seems that the appearance of a nested POSIX class supersedes an apparent
3919external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3920a digit.
3921
3922In Perl, unescaped square brackets may also appear as part of class names. For
3923example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3924[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3925seem right at all. PCRE does not allow closing square brackets in POSIX class
3926names.
3927
3928Arguments:
3929 ptr pointer to the initial [
3930 endptr where to return the end pointer
3931
3932Returns: TRUE or FALSE
3933*/
3934
3935static BOOL
3936check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3937{
3938pcre_uchar terminator; /* Don't combine these lines; the Solaris cc */
3939terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
3940for (++ptr; *ptr != CHAR_NULL; ptr++)
3941 {
3942 if (*ptr == CHAR_BACKSLASH &&
3943 (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET ||
3944 ptr[1] == CHAR_BACKSLASH))
3945 ptr++;
3946 else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
3947 *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3948 else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3949 {
3950 *endptr = ptr;
3951 return TRUE;
3952 }
3953 }
3954return FALSE;
3955}
3956
3957
3958
3959
3960/*************************************************
3961* Check POSIX class name *
3962*************************************************/
3963
3964/* This function is called to check the name given in a POSIX-style class entry
3965such as [:alnum:].
3966
3967Arguments:
3968 ptr points to the first letter
3969 len the length of the name
3970
3971Returns: a value representing the name, or -1 if unknown
3972*/
3973
3974static int
3975check_posix_name(const pcre_uchar *ptr, int len)
3976{
3977const char *pn = posix_names;
3978register int yield = 0;
3979while (posix_name_lengths[yield] != 0)
3980 {
3981 if (len == posix_name_lengths[yield] &&
3982 STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3983 pn += posix_name_lengths[yield] + 1;
3984 yield++;
3985 }
3986return -1;
3987}
3988
3989
3990/*************************************************
3991* Adjust OP_RECURSE items in repeated group *
3992*************************************************/
3993
3994/* OP_RECURSE items contain an offset from the start of the regex to the group
3995that is referenced. This means that groups can be replicated for fixed
3996repetition simply by copying (because the recursion is allowed to refer to
3997earlier groups that are outside the current group). However, when a group is
3998optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3999inserted before it, after it has been compiled. This means that any OP_RECURSE
4000items within it that refer to the group itself or any contained groups have to
have their offsets adjusted. That is one of the jobs of this function. Before it
4002is called, the partially compiled regex must be temporarily terminated with
4003OP_END.
4004
4005This function has been extended to cope with forward references for recursions
4006and subroutine calls. It must check the list of such references for the
4007group we are dealing with. If it finds that one of the recursions in the
4008current group is on this list, it does not adjust the value in the reference
4009(which is a group number). After the group has been scanned, all the offsets in
4010the forward reference list for the group are adjusted.
4011
4012Arguments:
4013 group points to the start of the group
4014 adjust the amount by which the group is to be moved
4015 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
4016 cd contains pointers to tables etc.
4017 save_hwm_offset the hwm forward reference offset at the start of the group
4018
4019Returns: nothing
4020*/
4021
4022static void
4023adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
4024 size_t save_hwm_offset)
4025{
4026int offset;
4027pcre_uchar *hc;
4028pcre_uchar *ptr = group;
4029
4030while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
4031 {
4032 for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4033 hc += LINK_SIZE)
4034 {
4035 offset = (int)GET(hc, 0);
4036 if (cd->start_code + offset == ptr + 1) break;
4037 }
4038
4039 /* If we have not found this recursion on the forward reference list, adjust
4040 the recursion's offset if it's after the start of this group. */
4041
4042 if (hc >= cd->hwm)
4043 {
4044 offset = (int)GET(ptr, 1);
4045 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
4046 }
4047
4048 ptr += 1 + LINK_SIZE;
4049 }
4050
4051/* Now adjust all forward reference offsets for the group. */
4052
4053for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4054 hc += LINK_SIZE)
4055 {
4056 offset = (int)GET(hc, 0);
4057 PUT(hc, 0, offset + adjust);
4058 }
4059}
4060
4061
4062
4063/*************************************************
4064* Insert an automatic callout point *
4065*************************************************/
4066
4067/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
4068callout points before each pattern item.
4069
4070Arguments:
4071 code current code pointer
4072 ptr current pattern pointer
4073 cd pointers to tables etc
4074
4075Returns: new code pointer
4076*/
4077
4078static pcre_uchar *
4079auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
4080{
4081*code++ = OP_CALLOUT;
4082*code++ = 255;
4083PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
4084PUT(code, LINK_SIZE, 0); /* Default length */
4085return code + 2 * LINK_SIZE;
4086}
4087
4088
4089
4090/*************************************************
4091* Complete a callout item *
4092*************************************************/
4093
4094/* A callout item contains the length of the next item in the pattern, which
4095we can't fill in till after we have reached the relevant point. This is used
4096for both automatic and manual callouts.
4097
4098Arguments:
4099 previous_callout points to previous callout item
4100 ptr current pattern pointer
4101 cd pointers to tables etc
4102
4103Returns: nothing
4104*/
4105
4106static void
4107complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
4108{
4109int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
4110PUT(previous_callout, 2 + LINK_SIZE, length);
4111}
4112
4113
4114
4115#ifdef SUPPORT_UCP
4116/*************************************************
4117* Get othercase range *
4118*************************************************/
4119
4120/* This function is passed the start and end of a class range, in UTF-8 mode
4121with UCP support. It searches up the characters, looking for ranges of
4122characters in the "other" case. Each call returns the next one, updating the
4123start address. A character with multiple other cases is returned on its own
4124with a special return value.
4125
4126Arguments:
4127 cptr points to starting character value; updated
4128 d end value
4129 ocptr where to put start of othercase range
4130 odptr where to put end of othercase range
4131
4132Yield: -1 when no more
4133 0 when a range is returned
4134 >0 the CASESET offset for char with multiple other cases
4135 in this case, ocptr contains the original
4136*/
4137
4138static int
4139get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
4140 pcre_uint32 *odptr)
4141{
4142pcre_uint32 c, othercase, next;
4143unsigned int co;
4144
4145/* Find the first character that has an other case. If it has multiple other
4146cases, return its case offset value. */
4147
4148for (c = *cptr; c <= d; c++)
4149 {
4150 if ((co = UCD_CASESET(c)) != 0)
4151 {
4152 *ocptr = c++; /* Character that has the set */
4153 *cptr = c; /* Rest of input range */
4154 return (int)co;
4155 }
4156 if ((othercase = UCD_OTHERCASE(c)) != c) break;
4157 }
4158
4159if (c > d) return -1; /* Reached end of range */
4160
4161/* Found a character that has a single other case. Search for the end of the
4162range, which is either the end of the input range, or a character that has zero
4163or more than one other cases. */
4164
4165*ocptr = othercase;
4166next = othercase + 1;
4167
4168for (++c; c <= d; c++)
4169 {
4170 if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
4171 next++;
4172 }
4173
4174*odptr = next - 1; /* End of othercase range */
4175*cptr = c; /* Rest of input range */
4176return 0;
4177}
4178#endif /* SUPPORT_UCP */
4179
4180
4181
4182/*************************************************
4183* Add a character or range to a class *
4184*************************************************/
4185
4186/* This function packages up the logic of adding a character or range of
4187characters to a class. The character values in the arguments will be within the
4188valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4189mutually recursive with the function immediately below.
4190
4191Arguments:
4192 classbits the bit map for characters < 256
4193 uchardptr points to the pointer for extra data
4194 options the options word
4195 cd contains pointers to tables etc.
4196 start start of range character
4197 end end of range character
4198
4199Returns: the number of < 256 characters added
4200 the pointer to extra data is updated
4201*/
4202
4203static int
4204add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4205 compile_data *cd, pcre_uint32 start, pcre_uint32 end)
4206{
4207pcre_uint32 c;
4208pcre_uint32 classbits_end = (end <= 0xff ? end : 0xff);
4209int n8 = 0;
4210
4211/* If caseless matching is required, scan the range and process alternate
4212cases. In Unicode, there are 8-bit characters that have alternate cases that
4213are greater than 255 and vice-versa. Sometimes we can just extend the original
4214range. */
4215
4216if ((options & PCRE_CASELESS) != 0)
4217 {
4218#ifdef SUPPORT_UCP
4219 if ((options & PCRE_UTF8) != 0)
4220 {
4221 int rc;
4222 pcre_uint32 oc, od;
4223
4224 options &= ~PCRE_CASELESS; /* Remove for recursive calls */
4225 c = start;
4226
4227 while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
4228 {
4229 /* Handle a single character that has more than one other case. */
4230
4231 if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
4232 PRIV(ucd_caseless_sets) + rc, oc);
4233
4234 /* Do nothing if the other case range is within the original range. */
4235
4236 else if (oc >= start && od <= end) continue;
4237
4238 /* Extend the original range if there is overlap, noting that if oc < c, we
4239 can't have od > end because a subrange is always shorter than the basic
4240 range. Otherwise, use a recursive call to add the additional range. */
4241
4242 else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4243 else if (od > end && oc <= end + 1)
4244 {
4245 end = od; /* Extend upwards */
4246 if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
4247 }
4248 else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
4249 }
4250 }
4251 else
4252#endif /* SUPPORT_UCP */
4253
4254 /* Not UTF-mode, or no UCP */
4255
4256 for (c = start; c <= classbits_end; c++)
4257 {
4258 SETBIT(classbits, cd->fcc[c]);
4259 n8++;
4260 }
4261 }
4262
4263/* Now handle the original range. Adjust the final value according to the bit
4264length - this means that the same lists of (e.g.) horizontal spaces can be used
4265in all cases. */
4266
4267#if defined COMPILE_PCRE8
4268#ifdef SUPPORT_UTF
4269 if ((options & PCRE_UTF8) == 0)
4270#endif
4271 if (end > 0xff) end = 0xff;
4272
4273#elif defined COMPILE_PCRE16
4274#ifdef SUPPORT_UTF
4275 if ((options & PCRE_UTF16) == 0)
4276#endif
4277 if (end > 0xffff) end = 0xffff;
4278
4279#endif /* COMPILE_PCRE[8|16] */
4280
4281/* Use the bitmap for characters < 256. Otherwise use extra data.*/
4282
4283for (c = start; c <= classbits_end; c++)
4284 {
4285 /* Regardless of start, c will always be <= 255. */
4286 SETBIT(classbits, c);
4287 n8++;
4288 }
4289
4290#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4291if (start <= 0xff) start = 0xff + 1;
4292
4293if (end >= start)
4294 {
4295 pcre_uchar *uchardata = *uchardptr;
4296#ifdef SUPPORT_UTF
4297 if ((options & PCRE_UTF8) != 0) /* All UTFs use the same flag bit */
4298 {
4299 if (start < end)
4300 {
4301 *uchardata++ = XCL_RANGE;
4302 uchardata += PRIV(ord2utf)(start, uchardata);
4303 uchardata += PRIV(ord2utf)(end, uchardata);
4304 }
4305 else if (start == end)
4306 {
4307 *uchardata++ = XCL_SINGLE;
4308 uchardata += PRIV(ord2utf)(start, uchardata);
4309 }
4310 }
4311 else
4312#endif /* SUPPORT_UTF */
4313
4314 /* Without UTF support, character values are constrained by the bit length,
4315 and can only be > 256 for 16-bit and 32-bit libraries. */
4316
4317#ifdef COMPILE_PCRE8
4318 {}
4319#else
4320 if (start < end)
4321 {
4322 *uchardata++ = XCL_RANGE;
4323 *uchardata++ = start;
4324 *uchardata++ = end;
4325 }
4326 else if (start == end)
4327 {
4328 *uchardata++ = XCL_SINGLE;
4329 *uchardata++ = start;
4330 }
4331#endif
4332
4333 *uchardptr = uchardata; /* Updata extra data pointer */
4334 }
4335#endif /* SUPPORT_UTF || !COMPILE_PCRE8 */
4336
4337return n8; /* Number of 8-bit characters */
4338}
4339
4340
4341
4342
4343/*************************************************
4344* Add a list of characters to a class *
4345*************************************************/
4346
4347/* This function is used for adding a list of case-equivalent characters to a
4348class, and also for adding a list of horizontal or vertical whitespace. If the
4349list is in order (which it should be), ranges of characters are detected and
4350handled appropriately. This function is mutually recursive with the function
4351above.
4352
4353Arguments:
4354 classbits the bit map for characters < 256
4355 uchardptr points to the pointer for extra data
4356 options the options word
4357 cd contains pointers to tables etc.
4358 p points to row of 32-bit values, terminated by NOTACHAR
4359 except character to omit; this is used when adding lists of
4360 case-equivalent characters to avoid including the one we
4361 already know about
4362
4363Returns: the number of < 256 characters added
4364 the pointer to extra data is updated
4365*/
4366
4367static int
4368add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4369 compile_data *cd, const pcre_uint32 *p, unsigned int except)
4370{
4371int n8 = 0;
4372while (p[0] < NOTACHAR)
4373 {
4374 int n = 0;
4375 if (p[0] != except)
4376 {
4377 while(p[n+1] == p[0] + n + 1) n++;
4378 n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
4379 }
4380 p += n + 1;
4381 }
4382return n8;
4383}
4384
4385
4386
4387/*************************************************
4388* Add characters not in a list to a class *
4389*************************************************/
4390
4391/* This function is used for adding the complement of a list of horizontal or
4392vertical whitespace to a class. The list must be in order.
4393
4394Arguments:
4395 classbits the bit map for characters < 256
4396 uchardptr points to the pointer for extra data
4397 options the options word
4398 cd contains pointers to tables etc.
4399 p points to row of 32-bit values, terminated by NOTACHAR
4400
4401Returns: the number of < 256 characters added
4402 the pointer to extra data is updated
4403*/
4404
4405static int
4406add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
4407 int options, compile_data *cd, const pcre_uint32 *p)
4408{
4409BOOL utf = (options & PCRE_UTF8) != 0;
4410int n8 = 0;
4411if (p[0] > 0)
4412 n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
4413while (p[0] < NOTACHAR)
4414 {
4415 while (p[1] == p[0] + 1) p++;
4416 n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
4417 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
4418 p++;
4419 }
4420return n8;
4421}
4422
4423
4424
4425/*************************************************
4426* Compile one branch *
4427*************************************************/
4428
/* Scan the pattern, compiling it into a vector. If the options are
4430changed during the branch, the pointer is used to change the external options
4431bits. This function is used during the pre-compile phase when we are trying
4432to find out the amount of memory needed, as well as during the real compile
4433phase. The value of lengthptr distinguishes the two phases.
4434
4435Arguments:
4436 optionsptr pointer to the option bits
4437 codeptr points to the pointer to the current code point
4438 ptrptr points to the current pattern pointer
4439 errorcodeptr points to error code variable
4440 firstcharptr place to put the first required character
4441 firstcharflagsptr place to put the first character flags, or a negative number
4442 reqcharptr place to put the last required character
4443 reqcharflagsptr place to put the last required character flags, or a negative number
4444 bcptr points to current branch chain
4445 cond_depth conditional nesting depth
4446 cd contains pointers to tables etc.
4447 lengthptr NULL during the real compile phase
4448 points to length accumulator during pre-compile phase
4449
4450Returns: TRUE on success
4451 FALSE, with *errorcodeptr set non-zero on error
4452*/
4453
4454static BOOL
4455compile_branch(int *optionsptr, pcre_uchar **codeptr,
4456 const pcre_uchar **ptrptr, int *errorcodeptr,
4457 pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
4458 pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
4459 branch_chain *bcptr, int cond_depth,
4460 compile_data *cd, int *lengthptr)
4461{
4462int repeat_type, op_type;
4463int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
4464int bravalue = 0;
4465int greedy_default, greedy_non_default;
4466pcre_uint32 firstchar, reqchar;
4467pcre_int32 firstcharflags, reqcharflags;
4468pcre_uint32 zeroreqchar, zerofirstchar;
4469pcre_int32 zeroreqcharflags, zerofirstcharflags;
4470pcre_int32 req_caseopt, reqvary, tempreqvary;
4471int options = *optionsptr; /* May change dynamically */
4472int after_manual_callout = 0;
4473int length_prevgroup = 0;
4474register pcre_uint32 c;
4475int escape;
4476register pcre_uchar *code = *codeptr;
4477pcre_uchar *last_code = code;
4478pcre_uchar *orig_code = code;
4479pcre_uchar *tempcode;
4480BOOL inescq = FALSE;
4481BOOL groupsetfirstchar = FALSE;
4482const pcre_uchar *ptr = *ptrptr;
4483const pcre_uchar *tempptr;
4484const pcre_uchar *nestptr = NULL;
4485pcre_uchar *previous = NULL;
4486pcre_uchar *previous_callout = NULL;
4487size_t item_hwm_offset = 0;
4488pcre_uint8 classbits[32];
4489
4490/* We can fish out the UTF-8 setting once and for all into a BOOL, but we
4491must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4492dynamically as we process the pattern. */
4493
4494#ifdef SUPPORT_UTF
4495/* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4496BOOL utf = (options & PCRE_UTF8) != 0;
4497#ifndef COMPILE_PCRE32
4498pcre_uchar utf_chars[6];
4499#endif
4500#else
4501BOOL utf = FALSE;
4502#endif
4503
4504/* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4505class_uchardata always so that it can be passed to add_to_class() always,
4506though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4507alternative calls for the different cases. */
4508
4509pcre_uchar *class_uchardata;
4510#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4511BOOL xclass;
4512pcre_uchar *class_uchardata_base;
4513#endif
4514
4515#ifdef PCRE_DEBUG
4516if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4517#endif
4518
4519/* Set up the default and non-default settings for greediness */
4520
4521greedy_default = ((options & PCRE_UNGREEDY) != 0);
4522greedy_non_default = greedy_default ^ 1;
4523
4524/* Initialize no first byte, no required byte. REQ_UNSET means "no char
4525matching encountered yet". It gets changed to REQ_NONE if we hit something that
4526matches a non-fixed char first char; reqchar just remains unset if we never
4527find one.
4528
4529When we hit a repeat whose minimum is zero, we may have to adjust these values
4530to take the zero repeat into account. This is implemented by setting them to
4531zerofirstbyte and zeroreqchar when such a repeat is encountered. The individual
4532item types that can be repeated set these backoff variables appropriately. */
4533
4534firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4535firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4536
4537/* The variable req_caseopt contains either the REQ_CASELESS value
4538or zero, according to the current setting of the caseless flag. The
4539REQ_CASELESS leaves the lower 28 bit empty. It is added into the
4540firstchar or reqchar variables to record the case status of the
4541value. This is used only for ASCII characters. */
4542
4543req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
4544
4545/* Switch on next character until the end of the branch */
4546
4547for (;; ptr++)
4548 {
4549 BOOL negate_class;
4550 BOOL should_flip_negation;
4551 BOOL possessive_quantifier;
4552 BOOL is_quantifier;
4553 BOOL is_recurse;
4554 BOOL reset_bracount;
4555 int class_has_8bitchar;
4556 int class_one_char;
4557#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4558 BOOL xclass_has_prop;
4559#endif
4560 int newoptions;
4561 int recno;
4562 int refsign;
4563 int skipbytes;
4564 pcre_uint32 subreqchar, subfirstchar;
4565 pcre_int32 subreqcharflags, subfirstcharflags;
4566 int terminator;
4567 unsigned int mclength;
4568 unsigned int tempbracount;
4569 pcre_uint32 ec;
4570 pcre_uchar mcbuffer[8];
4571
4572 /* Come here to restart the loop without advancing the pointer. */
4573
4574 REDO_LOOP:
4575
4576 /* Get next character in the pattern */
4577
4578 c = *ptr;
4579
4580 /* If we are at the end of a nested substitution, revert to the outer level
4581 string. Nesting only happens one level deep. */
4582
4583 if (c == CHAR_NULL && nestptr != NULL)
4584 {
4585 ptr = nestptr;
4586 nestptr = NULL;
4587 c = *ptr;
4588 }
4589
4590 /* If we are in the pre-compile phase, accumulate the length used for the
4591 previous cycle of this loop. */
4592
4593 if (lengthptr != NULL)
4594 {
4595#ifdef PCRE_DEBUG
4596 if (code > cd->hwm) cd->hwm = code; /* High water info */
4597#endif
4598 if (code > cd->start_workspace + cd->workspace_size -
4599 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
4600 {
4601 *errorcodeptr = (code >= cd->start_workspace + cd->workspace_size)?
4602 ERR52 : ERR87;
4603 goto FAILED;
4604 }
4605
4606 /* There is at least one situation where code goes backwards: this is the
4607 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4608 the class is simply eliminated. However, it is created first, so we have to
4609 allow memory for it. Therefore, don't ever reduce the length at this point.
4610 */
4611
4612 if (code < last_code) code = last_code;
4613
4614 /* Paranoid check for integer overflow */
4615
4616 if (OFLOW_MAX - *lengthptr < code - last_code)
4617 {
4618 *errorcodeptr = ERR20;
4619 goto FAILED;
4620 }
4621
4622 *lengthptr += (int)(code - last_code);
4623 DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4624 (int)(code - last_code), c, c));
4625
4626 /* If "previous" is set and it is not at the start of the work space, move
4627 it back to there, in order to avoid filling up the work space. Otherwise,
4628 if "previous" is NULL, reset the current code pointer to the start. */
4629
4630 if (previous != NULL)
4631 {
4632 if (previous > orig_code)
4633 {
4634 memmove(orig_code, previous, IN_UCHARS(code - previous));
4635 code -= previous - orig_code;
4636 previous = orig_code;
4637 }
4638 }
4639 else code = orig_code;
4640
4641 /* Remember where this code item starts so we can pick up the length
4642 next time round. */
4643
4644 last_code = code;
4645 }
4646
4647 /* In the real compile phase, just check the workspace used by the forward
4648 reference list. */
4649
4650 else if (cd->hwm > cd->start_workspace + cd->workspace_size)
4651 {
4652 *errorcodeptr = ERR52;
4653 goto FAILED;
4654 }
4655
4656 /* If in \Q...\E, check for the end; if not, we have a literal. Otherwise an
4657 isolated \E is ignored. */
4658
4659 if (c != CHAR_NULL)
4660 {
4661 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4662 {
4663 inescq = FALSE;
4664 ptr++;
4665 continue;
4666 }
4667 else if (inescq)
4668 {
4669 if (previous_callout != NULL)
4670 {
4671 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4672 complete_callout(previous_callout, ptr, cd);
4673 previous_callout = NULL;
4674 }
4675 if ((options & PCRE_AUTO_CALLOUT) != 0)
4676 {
4677 previous_callout = code;
4678 code = auto_callout(code, ptr, cd);
4679 }
4680 goto NORMAL_CHAR;
4681 }
4682
4683 /* Check for the start of a \Q...\E sequence. We must do this here rather
4684 than later in case it is immediately followed by \E, which turns it into a
4685 "do nothing" sequence. */
4686
4687 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4688 {
4689 inescq = TRUE;
4690 ptr++;
4691 continue;
4692 }
4693 }
4694
4695 /* In extended mode, skip white space and comments. */
4696
4697 if ((options & PCRE_EXTENDED) != 0)
4698 {
4699 const pcre_uchar *wscptr = ptr;
4700 while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
4701 if (c == CHAR_NUMBER_SIGN)
4702 {
4703 ptr++;
4704 while (*ptr != CHAR_NULL)
4705 {
4706 if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */
4707 { /* IS_NEWLINE sets cd->nllen. */
4708 ptr += cd->nllen;
4709 break;
4710 }
4711 ptr++;
4712#ifdef SUPPORT_UTF
4713 if (utf) FORWARDCHAR(ptr);
4714#endif
4715 }
4716 }
4717
4718 /* If we skipped any characters, restart the loop. Otherwise, we didn't see
4719 a comment. */
4720
4721 if (ptr > wscptr) goto REDO_LOOP;
4722 }
4723
4724 /* Skip over (?# comments. We need to do this here because we want to know if
4725 the next thing is a quantifier, and these comments may come between an item
4726 and its quantifier. */
4727
4728 if (c == CHAR_LEFT_PARENTHESIS && ptr[1] == CHAR_QUESTION_MARK &&
4729 ptr[2] == CHAR_NUMBER_SIGN)
4730 {
4731 ptr += 3;
4732 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4733 if (*ptr == CHAR_NULL)
4734 {
4735 *errorcodeptr = ERR18;
4736 goto FAILED;
4737 }
4738 continue;
4739 }
4740
4741 /* See if the next thing is a quantifier. */
4742
4743 is_quantifier =
4744 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4745 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4746
4747 /* Fill in length of a previous callout, except when the next thing is a
4748 quantifier or when processing a property substitution string in UCP mode. */
4749
4750 if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4751 after_manual_callout-- <= 0)
4752 {
4753 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4754 complete_callout(previous_callout, ptr, cd);
4755 previous_callout = NULL;
4756 }
4757
4758 /* Create auto callout, except for quantifiers, or while processing property
4759 strings that are substituted for \w etc in UCP mode. */
4760
4761 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4762 {
4763 previous_callout = code;
4764 code = auto_callout(code, ptr, cd);
4765 }
4766
4767 /* Process the next pattern item. */
4768
4769 switch(c)
4770 {
4771 /* ===================================================================*/
4772 case CHAR_NULL: /* The branch terminates at string end */
4773 case CHAR_VERTICAL_LINE: /* or | or ) */
4774 case CHAR_RIGHT_PARENTHESIS:
4775 *firstcharptr = firstchar;
4776 *firstcharflagsptr = firstcharflags;
4777 *reqcharptr = reqchar;
4778 *reqcharflagsptr = reqcharflags;
4779 *codeptr = code;
4780 *ptrptr = ptr;
4781 if (lengthptr != NULL)
4782 {
4783 if (OFLOW_MAX - *lengthptr < code - last_code)
4784 {
4785 *errorcodeptr = ERR20;
4786 goto FAILED;
4787 }
4788 *lengthptr += (int)(code - last_code); /* To include callout length */
4789 DPRINTF((">> end branch\n"));
4790 }
4791 return TRUE;
4792
4793
4794 /* ===================================================================*/
4795 /* Handle single-character metacharacters. In multiline mode, ^ disables
4796 the setting of any following char as a first character. */
4797
4798 case CHAR_CIRCUMFLEX_ACCENT:
4799 previous = NULL;
4800 if ((options & PCRE_MULTILINE) != 0)
4801 {
4802 if (firstcharflags == REQ_UNSET)
4803 zerofirstcharflags = firstcharflags = REQ_NONE;
4804 *code++ = OP_CIRCM;
4805 }
4806 else *code++ = OP_CIRC;
4807 break;
4808
4809 case CHAR_DOLLAR_SIGN:
4810 previous = NULL;
4811 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4812 break;
4813
4814 /* There can never be a first char if '.' is first, whatever happens about
4815 repeats. The value of reqchar doesn't change either. */
4816
4817 case CHAR_DOT:
4818 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4819 zerofirstchar = firstchar;
4820 zerofirstcharflags = firstcharflags;
4821 zeroreqchar = reqchar;
4822 zeroreqcharflags = reqcharflags;
4823 previous = code;
4824 item_hwm_offset = cd->hwm - cd->start_workspace;
4825 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4826 break;
4827
4828
4829 /* ===================================================================*/
4830 /* Character classes. If the included characters are all < 256, we build a
4831 32-byte bitmap of the permitted characters, except in the special case
4832 where there is only one such character. For negated classes, we build the
4833 map as usual, then invert it at the end. However, we use a different opcode
4834 so that data characters > 255 can be handled correctly.
4835
4836 If the class contains characters outside the 0-255 range, a different
4837 opcode is compiled. It may optionally have a bit map for characters < 256,
4838 but those above are are explicitly listed afterwards. A flag byte tells
4839 whether the bitmap is present, and whether this is a negated class or not.
4840
4841 In JavaScript compatibility mode, an isolated ']' causes an error. In
4842 default (Perl) mode, it is treated as a data character. */
4843
4844 case CHAR_RIGHT_SQUARE_BRACKET:
4845 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4846 {
4847 *errorcodeptr = ERR64;
4848 goto FAILED;
4849 }
4850 goto NORMAL_CHAR;
4851
4852 /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
4853 used for "start of word" and "end of word". As these are otherwise illegal
4854 sequences, we don't break anything by recognizing them. They are replaced
4855 by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
4856 erroneous and are handled by the normal code below. */
4857
4858 case CHAR_LEFT_SQUARE_BRACKET:
4859 if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
4860 {
4861 nestptr = ptr + 7;
4862 ptr = sub_start_of_word;
4863 goto REDO_LOOP;
4864 }
4865
4866 if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
4867 {
4868 nestptr = ptr + 7;
4869 ptr = sub_end_of_word;
4870 goto REDO_LOOP;
4871 }
4872
4873 /* Handle a real character class. */
4874
4875 previous = code;
4876 item_hwm_offset = cd->hwm - cd->start_workspace;
4877
4878 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4879 they are encountered at the top level, so we'll do that too. */
4880
4881 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4882 ptr[1] == CHAR_EQUALS_SIGN) &&
4883 check_posix_syntax(ptr, &tempptr))
4884 {
4885 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4886 goto FAILED;
4887 }
4888
4889 /* If the first character is '^', set the negation flag and skip it. Also,
4890 if the first few characters (either before or after ^) are \Q\E or \E we
4891 skip them too. This makes for compatibility with Perl. */
4892
4893 negate_class = FALSE;
4894 for (;;)
4895 {
4896 c = *(++ptr);
4897 if (c == CHAR_BACKSLASH)
4898 {
4899 if (ptr[1] == CHAR_E)
4900 ptr++;
4901 else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4902 ptr += 3;
4903 else
4904 break;
4905 }
4906 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4907 negate_class = TRUE;
4908 else break;
4909 }
4910
4911 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4912 an initial ']' is taken as a data character -- the code below handles
4913 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4914 [^] must match any character, so generate OP_ALLANY. */
4915
4916 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4917 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4918 {
4919 *code++ = negate_class? OP_ALLANY : OP_FAIL;
4920 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4921 zerofirstchar = firstchar;
4922 zerofirstcharflags = firstcharflags;
4923 break;
4924 }
4925
4926 /* If a class contains a negative special such as \S, we need to flip the
4927 negation flag at the end, so that support for characters > 255 works
4928 correctly (they are all included in the class). */
4929
4930 should_flip_negation = FALSE;
4931
4932 /* Extended class (xclass) will be used when characters > 255
4933 might match. */
4934
4935#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4936 xclass = FALSE;
4937 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
4938 class_uchardata_base = class_uchardata; /* Save the start */
4939#endif
4940
4941 /* For optimization purposes, we track some properties of the class:
4942 class_has_8bitchar will be non-zero if the class contains at least one <
4943 256 character; class_one_char will be 1 if the class contains just one
4944 character; xclass_has_prop will be TRUE if unicode property checks
4945 are present in the class. */
4946
4947 class_has_8bitchar = 0;
4948 class_one_char = 0;
4949#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4950 xclass_has_prop = FALSE;
4951#endif
4952
4953 /* Initialize the 32-char bit map to all zeros. We build the map in a
4954 temporary bit of memory, in case the class contains fewer than two
4955 8-bit characters because in that case the compiled code doesn't use the bit
4956 map. */
4957
4958 memset(classbits, 0, 32 * sizeof(pcre_uint8));
4959
4960 /* Process characters until ] is reached. By writing this as a "do" it
4961 means that an initial ] is taken as a data character. At the start of the
4962 loop, c contains the first byte of the character. */
4963
4964 if (c != CHAR_NULL) do
4965 {
4966 const pcre_uchar *oldptr;
4967
4968#ifdef SUPPORT_UTF
4969 if (utf && HAS_EXTRALEN(c))
4970 { /* Braces are required because the */
4971 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
4972 }
4973#endif
4974
4975#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4976 /* In the pre-compile phase, accumulate the length of any extra
4977 data and reset the pointer. This is so that very large classes that
4978 contain a zillion > 255 characters no longer overwrite the work space
4979 (which is on the stack). We have to remember that there was XCLASS data,
4980 however. */
4981
4982 if (class_uchardata > class_uchardata_base) xclass = TRUE;
4983
4984 if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4985 {
4986 *lengthptr += (int)(class_uchardata - class_uchardata_base);
4987 class_uchardata = class_uchardata_base;
4988 }
4989#endif
4990
4991 /* Inside \Q...\E everything is literal except \E */
4992
4993 if (inescq)
4994 {
4995 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
4996 {
4997 inescq = FALSE; /* Reset literal state */
4998 ptr++; /* Skip the 'E' */
4999 continue; /* Carry on with next */
5000 }
5001 goto CHECK_RANGE; /* Could be range if \E follows */
5002 }
5003
5004 /* Handle POSIX class names. Perl allows a negation extension of the
5005 form [:^name:]. A square bracket that doesn't match the syntax is
5006 treated as a literal. We also recognize the POSIX constructions
5007 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
5008 5.6 and 5.8 do. */
5009
5010 if (c == CHAR_LEFT_SQUARE_BRACKET &&
5011 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5012 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
5013 {
5014 BOOL local_negate = FALSE;
5015 int posix_class, taboffset, tabopt;
5016 register const pcre_uint8 *cbits = cd->cbits;
5017 pcre_uint8 pbits[32];
5018
5019 if (ptr[1] != CHAR_COLON)
5020 {
5021 *errorcodeptr = ERR31;
5022 goto FAILED;
5023 }
5024
5025 ptr += 2;
5026 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
5027 {
5028 local_negate = TRUE;
5029 should_flip_negation = TRUE; /* Note negative special */
5030 ptr++;
5031 }
5032
5033 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
5034 if (posix_class < 0)
5035 {
5036 *errorcodeptr = ERR30;
5037 goto FAILED;
5038 }
5039
5040 /* If matching is caseless, upper and lower are converted to
5041 alpha. This relies on the fact that the class table starts with
5042 alpha, lower, upper as the first 3 entries. */
5043
5044 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
5045 posix_class = 0;
5046
5047 /* When PCRE_UCP is set, some of the POSIX classes are converted to
5048 different escape sequences that use Unicode properties \p or \P. Others
5049 that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
5050 directly. */
5051
5052#ifdef SUPPORT_UCP
5053 if ((options & PCRE_UCP) != 0)
5054 {
5055 unsigned int ptype = 0;
5056 int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
5057
5058 /* The posix_substitutes table specifies which POSIX classes can be
5059 converted to \p or \P items. */
5060
5061 if (posix_substitutes[pc] != NULL)
5062 {
5063 nestptr = tempptr + 1;
5064 ptr = posix_substitutes[pc] - 1;
5065 continue;
5066 }
5067
5068 /* There are three other classes that generate special property calls
5069 that are recognized only in an XCLASS. */
5070
5071 else switch(posix_class)
5072 {
5073 case PC_GRAPH:
5074 ptype = PT_PXGRAPH;
5075 /* Fall through */
5076 case PC_PRINT:
5077 if (ptype == 0) ptype = PT_PXPRINT;
5078 /* Fall through */
5079 case PC_PUNCT:
5080 if (ptype == 0) ptype = PT_PXPUNCT;
5081 *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5082 *class_uchardata++ = ptype;
5083 *class_uchardata++ = 0;
5084 xclass_has_prop = TRUE;
5085 ptr = tempptr + 1;
5086 continue;
5087
5088 /* For the other POSIX classes (ascii, cntrl, xdigit) we are going
5089 to fall through to the non-UCP case and build a bit map for
5090 characters with code points less than 256. If we are in a negated
5091 POSIX class, characters with code points greater than 255 must
5092 either all match or all not match. In the special case where we
5093 have not yet generated any xclass data, and this is the final item
5094 in the overall class, we need do nothing: later on, the opcode
5095 OP_NCLASS will be used to indicate that characters greater than 255
5096 are acceptable. If we have already seen an xclass item or one may
5097 follow (we have to assume that it might if this is not the end of
5098 the class), explicitly list all wide codepoints, which will then
5099 either not match or match, depending on whether the class is or is
5100 not negated. */
5101
5102 default:
5103 if (local_negate &&
5104 (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET))
5105 {
5106 *class_uchardata++ = XCL_RANGE;
5107 class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5108 class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
5109 }
5110 break;
5111 }
5112 }
5113#endif
5114 /* In the non-UCP case, or when UCP makes no difference, we build the
5115 bit map for the POSIX class in a chunk of local store because we may be
5116 adding and subtracting from it, and we don't want to subtract bits that
5117 may be in the main map already. At the end we or the result into the
5118 bit map that is being built. */
5119
5120 posix_class *= 3;
5121
5122 /* Copy in the first table (always present) */
5123
5124 memcpy(pbits, cbits + posix_class_maps[posix_class],
5125 32 * sizeof(pcre_uint8));
5126
5127 /* If there is a second table, add or remove it as required. */
5128
5129 taboffset = posix_class_maps[posix_class + 1];
5130 tabopt = posix_class_maps[posix_class + 2];
5131
5132 if (taboffset >= 0)
5133 {
5134 if (tabopt >= 0)
5135 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
5136 else
5137 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
5138 }
5139
5140 /* Now see if we need to remove any special characters. An option
5141 value of 1 removes vertical space and 2 removes underscore. */
5142
5143 if (tabopt < 0) tabopt = -tabopt;
5144 if (tabopt == 1) pbits[1] &= ~0x3c;
5145 else if (tabopt == 2) pbits[11] &= 0x7f;
5146
5147 /* Add the POSIX table or its complement into the main table that is
5148 being built and we are done. */
5149
5150 if (local_negate)
5151 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
5152 else
5153 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
5154
5155 ptr = tempptr + 1;
5156 /* Every class contains at least one < 256 character. */
5157 class_has_8bitchar = 1;
5158 /* Every class contains at least two characters. */
5159 class_one_char = 2;
5160 continue; /* End of POSIX syntax handling */
5161 }
5162
5163 /* Backslash may introduce a single character, or it may introduce one
5164 of the specials, which just set a flag. The sequence \b is a special
5165 case. Inside a class (and only there) it is treated as backspace. We
5166 assume that other escapes have more than one character in them, so
5167 speculatively set both class_has_8bitchar and class_one_char bigger
5168 than one. Unrecognized escapes fall through and are either treated
5169 as literal characters (by default), or are faulted if
5170 PCRE_EXTRA is set. */
5171
5172 if (c == CHAR_BACKSLASH)
5173 {
5174 escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
5175 TRUE);
5176 if (*errorcodeptr != 0) goto FAILED;
5177 if (escape == 0) c = ec;
5178 else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
5179 else if (escape == ESC_N) /* \N is not supported in a class */
5180 {
5181 *errorcodeptr = ERR71;
5182 goto FAILED;
5183 }
5184 else if (escape == ESC_Q) /* Handle start of quoted string */
5185 {
5186 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5187 {
5188 ptr += 2; /* avoid empty string */
5189 }
5190 else inescq = TRUE;
5191 continue;
5192 }
5193 else if (escape == ESC_E) continue; /* Ignore orphan \E */
5194
5195 else
5196 {
5197 register const pcre_uint8 *cbits = cd->cbits;
5198 /* Every class contains at least two < 256 characters. */
5199 class_has_8bitchar++;
5200 /* Every class contains at least two characters. */
5201 class_one_char += 2;
5202
5203 switch (escape)
5204 {
5205#ifdef SUPPORT_UCP
5206 case ESC_du: /* These are the values given for \d etc */
5207 case ESC_DU: /* when PCRE_UCP is set. We replace the */
5208 case ESC_wu: /* escape sequence with an appropriate \p */
5209 case ESC_WU: /* or \P to test Unicode properties instead */
5210 case ESC_su: /* of the default ASCII testing. */
5211 case ESC_SU:
5212 nestptr = ptr;
5213 ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
5214 class_has_8bitchar--; /* Undo! */
5215 continue;
5216#endif
5217 case ESC_d:
5218 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
5219 continue;
5220
5221 case ESC_D:
5222 should_flip_negation = TRUE;
5223 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
5224 continue;
5225
5226 case ESC_w:
5227 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
5228 continue;
5229
5230 case ESC_W:
5231 should_flip_negation = TRUE;
5232 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
5233 continue;
5234
5235 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5236 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5237 previously set by something earlier in the character class.
5238 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5239 we could just adjust the appropriate bit. From PCRE 8.34 we no
5240 longer treat \s and \S specially. */
5241
5242 case ESC_s:
5243 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
5244 continue;
5245
5246 case ESC_S:
5247 should_flip_negation = TRUE;
5248 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
5249 continue;
5250
5251 /* The rest apply in both UCP and non-UCP cases. */
5252
5253 case ESC_h:
5254 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5255 PRIV(hspace_list), NOTACHAR);
5256 continue;
5257
5258 case ESC_H:
5259 (void)add_not_list_to_class(classbits, &class_uchardata, options,
5260 cd, PRIV(hspace_list));
5261 continue;
5262
5263 case ESC_v:
5264 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5265 PRIV(vspace_list), NOTACHAR);
5266 continue;
5267
5268 case ESC_V:
5269 (void)add_not_list_to_class(classbits, &class_uchardata, options,
5270 cd, PRIV(vspace_list));
5271 continue;
5272
5273 case ESC_p:
5274 case ESC_P:
5275#ifdef SUPPORT_UCP
5276 {
5277 BOOL negated;
5278 unsigned int ptype = 0, pdata = 0;
5279 if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
5280 goto FAILED;
5281 *class_uchardata++ = ((escape == ESC_p) != negated)?
5282 XCL_PROP : XCL_NOTPROP;
5283 *class_uchardata++ = ptype;
5284 *class_uchardata++ = pdata;
5285 xclass_has_prop = TRUE;
5286 class_has_8bitchar--; /* Undo! */
5287 continue;
5288 }
5289#else
5290 *errorcodeptr = ERR45;
5291 goto FAILED;
5292#endif
5293 /* Unrecognized escapes are faulted if PCRE is running in its
5294 strict mode. By default, for compatibility with Perl, they are
5295 treated as literals. */
5296
5297 default:
5298 if ((options & PCRE_EXTRA) != 0)
5299 {
5300 *errorcodeptr = ERR7;
5301 goto FAILED;
5302 }
5303 class_has_8bitchar--; /* Undo the speculative increase. */
5304 class_one_char -= 2; /* Undo the speculative increase. */
5305 c = *ptr; /* Get the final character and fall through */
5306 break;
5307 }
5308 }
5309
5310 /* Fall through if the escape just defined a single character (c >= 0).
5311 This may be greater than 256. */
5312
5313 escape = 0;
5314
5315 } /* End of backslash handling */
5316
5317 /* A character may be followed by '-' to form a range. However, Perl does
5318 not permit ']' to be the end of the range. A '-' character at the end is
5319 treated as a literal. Perl ignores orphaned \E sequences entirely. The
5320 code for handling \Q and \E is messy. */
5321
5322 CHECK_RANGE:
5323 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5324 {
5325 inescq = FALSE;
5326 ptr += 2;
5327 }
5328 oldptr = ptr;
5329
5330 /* Remember if \r or \n were explicitly used */
5331
5332 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5333
5334 /* Check for range */
5335
5336 if (!inescq && ptr[1] == CHAR_MINUS)
5337 {
5338 pcre_uint32 d;
5339 ptr += 2;
5340 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
5341
5342 /* If we hit \Q (not followed by \E) at this point, go into escaped
5343 mode. */
5344
5345 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
5346 {
5347 ptr += 2;
5348 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
5349 { ptr += 2; continue; }
5350 inescq = TRUE;
5351 break;
5352 }
5353
5354 /* Minus (hyphen) at the end of a class is treated as a literal, so put
5355 back the pointer and jump to handle the character that preceded it. */
5356
5357 if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
5358 {
5359 ptr = oldptr;
5360 goto CLASS_SINGLE_CHARACTER;
5361 }
5362
5363 /* Otherwise, we have a potential range; pick up the next character */
5364
5365#ifdef SUPPORT_UTF
5366 if (utf)
5367 { /* Braces are required because the */
5368 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
5369 }
5370 else
5371#endif
5372 d = *ptr; /* Not UTF-8 mode */
5373
5374 /* The second part of a range can be a single-character escape
5375 sequence, but not any of the other escapes. Perl treats a hyphen as a
5376 literal in such circumstances. However, in Perl's warning mode, a
5377 warning is given, so PCRE now faults it as it is almost certainly a
5378 mistake on the user's part. */
5379
5380 if (!inescq)
5381 {
5382 if (d == CHAR_BACKSLASH)
5383 {
5384 int descape;
5385 descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5386 if (*errorcodeptr != 0) goto FAILED;
5387
5388 /* 0 means a character was put into d; \b is backspace; any other
5389 special causes an error. */
5390
5391 if (descape != 0)
5392 {
5393 if (descape == ESC_b) d = CHAR_BS; else
5394 {
5395 *errorcodeptr = ERR83;
5396 goto FAILED;
5397 }
5398 }
5399 }
5400
5401 /* A hyphen followed by a POSIX class is treated in the same way. */
5402
5403 else if (d == CHAR_LEFT_SQUARE_BRACKET &&
5404 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5405 ptr[1] == CHAR_EQUALS_SIGN) &&
5406 check_posix_syntax(ptr, &tempptr))
5407 {
5408 *errorcodeptr = ERR83;
5409 goto FAILED;
5410 }
5411 }
5412
5413 /* Check that the two values are in the correct order. Optimize
5414 one-character ranges. */
5415
5416 if (d < c)
5417 {
5418 *errorcodeptr = ERR8;
5419 goto FAILED;
5420 }
5421 if (d == c) goto CLASS_SINGLE_CHARACTER; /* A few lines below */
5422
5423 /* We have found a character range, so single character optimizations
5424 cannot be done anymore. Any value greater than 1 indicates that there
5425 is more than one character. */
5426
5427 class_one_char = 2;
5428
5429 /* Remember an explicit \r or \n, and add the range to the class. */
5430
5431 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5432
5433 class_has_8bitchar +=
5434 add_to_class(classbits, &class_uchardata, options, cd, c, d);
5435
5436 continue; /* Go get the next char in the class */
5437 }
5438
5439 /* Handle a single character - we can get here for a normal non-escape
5440 char, or after \ that introduces a single character or for an apparent
5441 range that isn't. Only the value 1 matters for class_one_char, so don't
5442 increase it if it is already 2 or more ... just in case there's a class
5443 with a zillion characters in it. */
5444
5445 CLASS_SINGLE_CHARACTER:
5446 if (class_one_char < 2) class_one_char++;
5447
5448 /* If xclass_has_prop is false and class_one_char is 1, we have the first
5449 single character in the class, and there have been no prior ranges, or
5450 XCLASS items generated by escapes. If this is the final character in the
5451 class, we can optimize by turning the item into a 1-character OP_CHAR[I]
5452 if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
5453 can cause firstchar to be set. Otherwise, there can be no first char if
5454 this item is first, whatever repeat count may follow. In the case of
5455 reqchar, save the previous value for reinstating. */
5456
5457 if (!inescq &&
5458#ifdef SUPPORT_UCP
5459 !xclass_has_prop &&
5460#endif
5461 class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
5462 {
5463 ptr++;
5464 zeroreqchar = reqchar;
5465 zeroreqcharflags = reqcharflags;
5466
5467 if (negate_class)
5468 {
5469#ifdef SUPPORT_UCP
5470 int d;
5471#endif
5472 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5473 zerofirstchar = firstchar;
5474 zerofirstcharflags = firstcharflags;
5475
5476 /* For caseless UTF-8 mode when UCP support is available, check
5477 whether this character has more than one other case. If so, generate
5478 a special OP_NOTPROP item instead of OP_NOTI. */
5479
5480#ifdef SUPPORT_UCP
5481 if (utf && (options & PCRE_CASELESS) != 0 &&
5482 (d = UCD_CASESET(c)) != 0)
5483 {
5484 *code++ = OP_NOTPROP;
5485 *code++ = PT_CLIST;
5486 *code++ = d;
5487 }
5488 else
5489#endif
5490 /* Char has only one other case, or UCP not available */
5491
5492 {
5493 *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
5494#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5495 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5496 code += PRIV(ord2utf)(c, code);
5497 else
5498#endif
5499 *code++ = c;
5500 }
5501
5502 /* We are finished with this character class */
5503
5504 goto END_CLASS;
5505 }
5506
5507 /* For a single, positive character, get the value into mcbuffer, and
5508 then we can handle this with the normal one-character code. */
5509
5510#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5511 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5512 mclength = PRIV(ord2utf)(c, mcbuffer);
5513 else
5514#endif
5515 {
5516 mcbuffer[0] = c;
5517 mclength = 1;
5518 }
5519 goto ONE_CHAR;
5520 } /* End of 1-char optimization */
5521
5522 /* There is more than one character in the class, or an XCLASS item
5523 has been generated. Add this character to the class. */
5524
5525 class_has_8bitchar +=
5526 add_to_class(classbits, &class_uchardata, options, cd, c, c);
5527 }
5528
5529 /* Loop until ']' reached. This "while" is the end of the "do" far above.
5530 If we are at the end of an internal nested string, revert to the outer
5531 string. */
5532
5533 while (((c = *(++ptr)) != CHAR_NULL ||
5534 (nestptr != NULL &&
5535 (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
5536 (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
5537
5538 /* Check for missing terminating ']' */
5539
5540 if (c == CHAR_NULL)
5541 {
5542 *errorcodeptr = ERR6;
5543 goto FAILED;
5544 }
5545
5546 /* We will need an XCLASS if data has been placed in class_uchardata. In
5547 the second phase this is a sufficient test. However, in the pre-compile
5548 phase, class_uchardata gets emptied to prevent workspace overflow, so
5549 only if the very last character in the class needs XCLASS will it contain
5550 anything at this point. For this reason, xclass gets set TRUE above when
5551 class_uchardata is emptied, and that's why this code is the way it is here
5552 instead of just doing a test on class_uchardata below. */
5553
5554#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5555 if (class_uchardata > class_uchardata_base) xclass = TRUE;
5556#endif
5557
5558 /* If this is the first thing in the branch, there can be no first char
5559 setting, whatever the repeat count. Any reqchar setting must remain
5560 unchanged after any kind of repeat. */
5561
5562 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5563 zerofirstchar = firstchar;
5564 zerofirstcharflags = firstcharflags;
5565 zeroreqchar = reqchar;
5566 zeroreqcharflags = reqcharflags;
5567
5568 /* If there are characters with values > 255, we have to compile an
5569 extended class, with its own opcode, unless there was a negated special
5570 such as \S in the class, and PCRE_UCP is not set, because in that case all
5571 characters > 255 are in the class, so any that were explicitly given as
5572 well can be ignored. If (when there are explicit characters > 255 that must
5573 be listed) there are no characters < 256, we can omit the bitmap in the
5574 actual compiled code. */
5575
5576#ifdef SUPPORT_UTF
5577 if (xclass && (xclass_has_prop || !should_flip_negation ||
5578 (options & PCRE_UCP) != 0))
5579#elif !defined COMPILE_PCRE8
5580 if (xclass && (xclass_has_prop || !should_flip_negation))
5581#endif
5582#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5583 {
5584 /* For non-UCP wide characters, in a non-negative class containing \S or
5585 similar (should_flip_negation is set), all characters greater than 255
5586 must be in the class. */
5587
5588 if (
5589#if defined COMPILE_PCRE8
5590 utf &&
5591#endif
5592 should_flip_negation && !negate_class && (options & PCRE_UCP) == 0)
5593 {
5594 *class_uchardata++ = XCL_RANGE;
5595 if (utf) /* Will always be utf in the 8-bit library */
5596 {
5597 class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5598 class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
5599 }
5600 else /* Can only happen for the 16-bit & 32-bit libraries */
5601 {
5602#if defined COMPILE_PCRE16
5603 *class_uchardata++ = 0x100;
5604 *class_uchardata++ = 0xffffu;
5605#elif defined COMPILE_PCRE32
5606 *class_uchardata++ = 0x100;
5607 *class_uchardata++ = 0xffffffffu;
5608#endif
5609 }
5610 }
5611
5612 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
5613 *code++ = OP_XCLASS;
5614 code += LINK_SIZE;
5615 *code = negate_class? XCL_NOT:0;
5616 if (xclass_has_prop) *code |= XCL_HASPROP;
5617
5618 /* If the map is required, move up the extra data to make room for it;
5619 otherwise just move the code pointer to the end of the extra data. */
5620
5621 if (class_has_8bitchar > 0)
5622 {
5623 *code++ |= XCL_MAP;
5624 memmove(code + (32 / sizeof(pcre_uchar)), code,
5625 IN_UCHARS(class_uchardata - code));
5626 if (negate_class && !xclass_has_prop)
5627 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5628 memcpy(code, classbits, 32);
5629 code = class_uchardata + (32 / sizeof(pcre_uchar));
5630 }
5631 else code = class_uchardata;
5632
5633 /* Now fill in the complete length of the item */
5634
5635 PUT(previous, 1, (int)(code - previous));
5636 break; /* End of class handling */
5637 }
5638
5639 /* Even though any XCLASS list is now discarded, we must allow for
5640 its memory. */
5641
5642 if (lengthptr != NULL)
5643 *lengthptr += (int)(class_uchardata - class_uchardata_base);
5644#endif
5645
5646 /* If there are no characters > 255, or they are all to be included or
5647 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5648 whole class was negated and whether there were negative specials such as \S
5649 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5650 negating it if necessary. */
5651
5652 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5653 if (lengthptr == NULL) /* Save time in the pre-compile phase */
5654 {
5655 if (negate_class)
5656 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5657 memcpy(code, classbits, 32);
5658 }
5659 code += 32 / sizeof(pcre_uchar);
5660
5661 END_CLASS:
5662 break;
5663
5664
5665 /* ===================================================================*/
5666 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5667 has been tested above. */
5668
5669 case CHAR_LEFT_CURLY_BRACKET:
5670 if (!is_quantifier) goto NORMAL_CHAR;
5671 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5672 if (*errorcodeptr != 0) goto FAILED;
5673 goto REPEAT;
5674
5675 case CHAR_ASTERISK:
5676 repeat_min = 0;
5677 repeat_max = -1;
5678 goto REPEAT;
5679
5680 case CHAR_PLUS:
5681 repeat_min = 1;
5682 repeat_max = -1;
5683 goto REPEAT;
5684
5685 case CHAR_QUESTION_MARK:
5686 repeat_min = 0;
5687 repeat_max = 1;
5688
5689 REPEAT:
5690 if (previous == NULL)
5691 {
5692 *errorcodeptr = ERR9;
5693 goto FAILED;
5694 }
5695
5696 if (repeat_min == 0)
5697 {
5698 firstchar = zerofirstchar; /* Adjust for zero repeat */
5699 firstcharflags = zerofirstcharflags;
5700 reqchar = zeroreqchar; /* Ditto */
5701 reqcharflags = zeroreqcharflags;
5702 }
5703
5704 /* Remember whether this is a variable length repeat */
5705
5706 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5707
5708 op_type = 0; /* Default single-char op codes */
5709 possessive_quantifier = FALSE; /* Default not possessive quantifier */
5710
5711 /* Save start of previous item, in case we have to move it up in order to
5712 insert something before it. */
5713
5714 tempcode = previous;
5715
5716 /* Before checking for a possessive quantifier, we must skip over
5717 whitespace and comments in extended mode because Perl allows white space at
5718 this point. */
5719
5720 if ((options & PCRE_EXTENDED) != 0)
5721 {
5722 const pcre_uchar *p = ptr + 1;
5723 for (;;)
5724 {
5725 while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++;
5726 if (*p != CHAR_NUMBER_SIGN) break;
5727 p++;
5728 while (*p != CHAR_NULL)
5729 {
5730 if (IS_NEWLINE(p)) /* For non-fixed-length newline cases, */
5731 { /* IS_NEWLINE sets cd->nllen. */
5732 p += cd->nllen;
5733 break;
5734 }
5735 p++;
5736#ifdef SUPPORT_UTF
5737 if (utf) FORWARDCHAR(p);
5738#endif
5739 } /* Loop for comment characters */
5740 } /* Loop for multiple comments */
5741 ptr = p - 1; /* Character before the next significant one. */
5742 }
5743
5744 /* We also need to skip over (?# comments, which are not dependent on
5745 extended mode. */
5746
5747 if (ptr[1] == CHAR_LEFT_PARENTHESIS && ptr[2] == CHAR_QUESTION_MARK &&
5748 ptr[3] == CHAR_NUMBER_SIGN)
5749 {
5750 ptr += 4;
5751 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5752 if (*ptr == CHAR_NULL)
5753 {
5754 *errorcodeptr = ERR18;
5755 goto FAILED;
5756 }
5757 }
5758
5759 /* If the next character is '+', we have a possessive quantifier. This
5760 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5761 If the next character is '?' this is a minimizing repeat, by default,
5762 but if PCRE_UNGREEDY is set, it works the other way round. We change the
5763 repeat type to the non-default. */
5764
5765 if (ptr[1] == CHAR_PLUS)
5766 {
5767 repeat_type = 0; /* Force greedy */
5768 possessive_quantifier = TRUE;
5769 ptr++;
5770 }
5771 else if (ptr[1] == CHAR_QUESTION_MARK)
5772 {
5773 repeat_type = greedy_non_default;
5774 ptr++;
5775 }
5776 else repeat_type = greedy_default;
5777
5778 /* If previous was a recursion call, wrap it in atomic brackets so that
5779 previous becomes the atomic group. All recursions were so wrapped in the
5780 past, but it no longer happens for non-repeated recursions. In fact, the
5781 repeated ones could be re-implemented independently so as not to need this,
5782 but for the moment we rely on the code for repeating groups. */
5783
5784 if (*previous == OP_RECURSE)
5785 {
5786 memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
5787 *previous = OP_ONCE;
5788 PUT(previous, 1, 2 + 2*LINK_SIZE);
5789 previous[2 + 2*LINK_SIZE] = OP_KET;
5790 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5791 code += 2 + 2 * LINK_SIZE;
5792 length_prevgroup = 3 + 3*LINK_SIZE;
5793
5794 /* When actually compiling, we need to check whether this was a forward
5795 reference, and if so, adjust the offset. */
5796
5797 if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
5798 {
5799 int offset = GET(cd->hwm, -LINK_SIZE);
5800 if (offset == previous + 1 - cd->start_code)
5801 PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
5802 }
5803 }
5804
5805 /* Now handle repetition for the different types of item. */
5806
5807 /* If previous was a character or negated character match, abolish the item
5808 and generate a repeat item instead. If a char item has a minimum of more
5809 than one, ensure that it is set in reqchar - it might not be if a sequence
5810 such as x{3} is the first thing in a branch because the x will have gone
5811 into firstchar instead. */
5812
5813 if (*previous == OP_CHAR || *previous == OP_CHARI
5814 || *previous == OP_NOT || *previous == OP_NOTI)
5815 {
5816 switch (*previous)
5817 {
5818 default: /* Make compiler happy. */
5819 case OP_CHAR: op_type = OP_STAR - OP_STAR; break;
5820 case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5821 case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break;
5822 case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break;
5823 }
5824
5825 /* Deal with UTF characters that take up more than one character. It's
5826 easier to write this out separately than try to macrify it. Use c to
5827 hold the length of the character in bytes, plus UTF_LENGTH to flag that
5828 it's a length rather than a small character. */
5829
5830#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5831 if (utf && NOT_FIRSTCHAR(code[-1]))
5832 {
5833 pcre_uchar *lastchar = code - 1;
5834 BACKCHAR(lastchar);
5835 c = (int)(code - lastchar); /* Length of UTF-8 character */
5836 memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
5837 c |= UTF_LENGTH; /* Flag c as a length */
5838 }
5839 else
5840#endif /* SUPPORT_UTF */
5841
5842 /* Handle the case of a single character - either with no UTF support, or
5843 with UTF disabled, or for a single character UTF character. */
5844 {
5845 c = code[-1];
5846 if (*previous <= OP_CHARI && repeat_min > 1)
5847 {
5848 reqchar = c;
5849 reqcharflags = req_caseopt | cd->req_varyopt;
5850 }
5851 }
5852
5853 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
5854 }
5855
5856 /* If previous was a character type match (\d or similar), abolish it and
5857 create a suitable repeat item. The code is shared with single-character
5858 repeats by setting op_type to add a suitable offset into repeat_type. Note
5859 that the Unicode property types will be present only when SUPPORT_UCP is
5860 defined, but we don't wrap the little bits of code here because it just
5861 makes it horribly messy. */
5862
5863 else if (*previous < OP_EODN)
5864 {
5865 pcre_uchar *oldcode;
5866 int prop_type, prop_value;
5867 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
5868 c = *previous;
5869
5870 OUTPUT_SINGLE_REPEAT:
5871 if (*previous == OP_PROP || *previous == OP_NOTPROP)
5872 {
5873 prop_type = previous[1];
5874 prop_value = previous[2];
5875 }
5876 else prop_type = prop_value = -1;
5877
5878 oldcode = code;
5879 code = previous; /* Usually overwrite previous item */
5880
5881 /* If the maximum is zero then the minimum must also be zero; Perl allows
5882 this case, so we do too - by simply omitting the item altogether. */
5883
5884 if (repeat_max == 0) goto END_REPEAT;
5885
5886 /* Combine the op_type with the repeat_type */
5887
5888 repeat_type += op_type;
5889
5890 /* A minimum of zero is handled either as the special case * or ?, or as
5891 an UPTO, with the maximum given. */
5892
5893 if (repeat_min == 0)
5894 {
5895 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5896 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5897 else
5898 {
5899 *code++ = OP_UPTO + repeat_type;
5900 PUT2INC(code, 0, repeat_max);
5901 }
5902 }
5903
5904 /* A repeat minimum of 1 is optimized into some special cases. If the
5905 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5906 left in place and, if the maximum is greater than 1, we use OP_UPTO with
5907 one less than the maximum. */
5908
5909 else if (repeat_min == 1)
5910 {
5911 if (repeat_max == -1)
5912 *code++ = OP_PLUS + repeat_type;
5913 else
5914 {
5915 code = oldcode; /* leave previous item in place */
5916 if (repeat_max == 1) goto END_REPEAT;
5917 *code++ = OP_UPTO + repeat_type;
5918 PUT2INC(code, 0, repeat_max - 1);
5919 }
5920 }
5921
5922 /* The case {n,n} is just an EXACT, while the general case {n,m} is
5923 handled as an EXACT followed by an UPTO. */
5924
5925 else
5926 {
5927 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
5928 PUT2INC(code, 0, repeat_min);
5929
5930 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
5931 we have to insert the character for the previous code. For a repeated
5932 Unicode property match, there are two extra bytes that define the
5933 required property. In UTF-8 mode, long characters have their length in
5934 c, with the UTF_LENGTH bit as a flag. */
5935
5936 if (repeat_max < 0)
5937 {
5938#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5939 if (utf && (c & UTF_LENGTH) != 0)
5940 {
5941 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5942 code += c & 7;
5943 }
5944 else
5945#endif
5946 {
5947 *code++ = c;
5948 if (prop_type >= 0)
5949 {
5950 *code++ = prop_type;
5951 *code++ = prop_value;
5952 }
5953 }
5954 *code++ = OP_STAR + repeat_type;
5955 }
5956
5957 /* Else insert an UPTO if the max is greater than the min, again
5958 preceded by the character, for the previously inserted code. If the
5959 UPTO is just for 1 instance, we can use QUERY instead. */
5960
5961 else if (repeat_max != repeat_min)
5962 {
5963#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5964 if (utf && (c & UTF_LENGTH) != 0)
5965 {
5966 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5967 code += c & 7;
5968 }
5969 else
5970#endif
5971 *code++ = c;
5972 if (prop_type >= 0)
5973 {
5974 *code++ = prop_type;
5975 *code++ = prop_value;
5976 }
5977 repeat_max -= repeat_min;
5978
5979 if (repeat_max == 1)
5980 {
5981 *code++ = OP_QUERY + repeat_type;
5982 }
5983 else
5984 {
5985 *code++ = OP_UPTO + repeat_type;
5986 PUT2INC(code, 0, repeat_max);
5987 }
5988 }
5989 }
5990
5991 /* The character or character type itself comes last in all cases. */
5992
5993#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5994 if (utf && (c & UTF_LENGTH) != 0)
5995 {
5996 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5997 code += c & 7;
5998 }
5999 else
6000#endif
6001 *code++ = c;
6002
6003 /* For a repeated Unicode property match, there are two extra bytes that
6004 define the required property. */
6005
6006#ifdef SUPPORT_UCP
6007 if (prop_type >= 0)
6008 {
6009 *code++ = prop_type;
6010 *code++ = prop_value;
6011 }
6012#endif
6013 }
6014
6015 /* If previous was a character class or a back reference, we put the repeat
6016 stuff after it, but just skip the item if the repeat was {0,0}. */
6017
6018 else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
6019#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6020 *previous == OP_XCLASS ||
6021#endif
6022 *previous == OP_REF || *previous == OP_REFI ||
6023 *previous == OP_DNREF || *previous == OP_DNREFI)
6024 {
6025 if (repeat_max == 0)
6026 {
6027 code = previous;
6028 goto END_REPEAT;
6029 }
6030
6031 if (repeat_min == 0 && repeat_max == -1)
6032 *code++ = OP_CRSTAR + repeat_type;
6033 else if (repeat_min == 1 && repeat_max == -1)
6034 *code++ = OP_CRPLUS + repeat_type;
6035 else if (repeat_min == 0 && repeat_max == 1)
6036 *code++ = OP_CRQUERY + repeat_type;
6037 else
6038 {
6039 *code++ = OP_CRRANGE + repeat_type;
6040 PUT2INC(code, 0, repeat_min);
6041 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
6042 PUT2INC(code, 0, repeat_max);
6043 }
6044 }
6045
6046 /* If previous was a bracket group, we may have to replicate it in certain
6047 cases. Note that at this point we can encounter only the "basic" bracket
6048 opcodes such as BRA and CBRA, as this is the place where they get converted
6049 into the more special varieties such as BRAPOS and SBRA. A test for >=
6050 OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
6051 ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
6052 Originally, PCRE did not allow repetition of assertions, but now it does,
6053 for Perl compatibility. */
6054
6055 else if (*previous >= OP_ASSERT && *previous <= OP_COND)
6056 {
6057 register int i;
6058 int len = (int)(code - previous);
6059 size_t base_hwm_offset = item_hwm_offset;
6060 pcre_uchar *bralink = NULL;
6061 pcre_uchar *brazeroptr = NULL;
6062
6063 /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
6064 we just ignore the repeat. */
6065
6066 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
6067 goto END_REPEAT;
6068
6069 /* There is no sense in actually repeating assertions. The only potential
6070 use of repetition is in cases when the assertion is optional. Therefore,
6071 if the minimum is greater than zero, just ignore the repeat. If the
6072 maximum is not zero or one, set it to 1. */
6073
6074 if (*previous < OP_ONCE) /* Assertion */
6075 {
6076 if (repeat_min > 0) goto END_REPEAT;
6077 if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
6078 }
6079
6080 /* The case of a zero minimum is special because of the need to stick
6081 OP_BRAZERO in front of it, and because the group appears once in the
6082 data, whereas in other cases it appears the minimum number of times. For
6083 this reason, it is simplest to treat this case separately, as otherwise
6084 the code gets far too messy. There are several special subcases when the
6085 minimum is zero. */
6086
6087 if (repeat_min == 0)
6088 {
6089 /* If the maximum is also zero, we used to just omit the group from the
6090 output altogether, like this:
6091
6092 ** if (repeat_max == 0)
6093 ** {
6094 ** code = previous;
6095 ** goto END_REPEAT;
6096 ** }
6097
6098 However, that fails when a group or a subgroup within it is referenced
6099 as a subroutine from elsewhere in the pattern, so now we stick in
6100 OP_SKIPZERO in front of it so that it is skipped on execution. As we
6101 don't have a list of which groups are referenced, we cannot do this
6102 selectively.
6103
6104 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
6105 and do no more at this point. However, we do need to adjust any
6106 OP_RECURSE calls inside the group that refer to the group itself or any
6107 internal or forward referenced group, because the offset is from the
6108 start of the whole regex. Temporarily terminate the pattern while doing
6109 this. */
6110
6111 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
6112 {
6113 *code = OP_END;
6114 adjust_recurse(previous, 1, utf, cd, item_hwm_offset);
6115 memmove(previous + 1, previous, IN_UCHARS(len));
6116 code++;
6117 if (repeat_max == 0)
6118 {
6119 *previous++ = OP_SKIPZERO;
6120 goto END_REPEAT;
6121 }
6122 brazeroptr = previous; /* Save for possessive optimizing */
6123 *previous++ = OP_BRAZERO + repeat_type;
6124 }
6125
6126 /* If the maximum is greater than 1 and limited, we have to replicate
6127 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
6128 The first one has to be handled carefully because it's the original
6129 copy, which has to be moved up. The remainder can be handled by code
6130 that is common with the non-zero minimum case below. We have to
6131 adjust the value of repeat_max, since one less copy is required. Once
6132 again, we may have to adjust any OP_RECURSE calls inside the group. */
6133
6134 else
6135 {
6136 int offset;
6137 *code = OP_END;
6138 adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, item_hwm_offset);
6139 memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
6140 code += 2 + LINK_SIZE;
6141 *previous++ = OP_BRAZERO + repeat_type;
6142 *previous++ = OP_BRA;
6143
6144 /* We chain together the bracket offset fields that have to be
6145 filled in later when the ends of the brackets are reached. */
6146
6147 offset = (bralink == NULL)? 0 : (int)(previous - bralink);
6148 bralink = previous;
6149 PUTINC(previous, 0, offset);
6150 }
6151
6152 repeat_max--;
6153 }
6154
6155 /* If the minimum is greater than zero, replicate the group as many
6156 times as necessary, and adjust the maximum to the number of subsequent
6157 copies that we need. If we set a first char from the group, and didn't
6158 set a required char, copy the latter from the former. If there are any
6159 forward reference subroutine calls in the group, there will be entries on
6160 the workspace list; replicate these with an appropriate increment. */
6161
6162 else
6163 {
6164 if (repeat_min > 1)
6165 {
6166 /* In the pre-compile phase, we don't actually do the replication. We
6167 just adjust the length as if we had. Do some paranoid checks for
6168 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
6169 integer type when available, otherwise double. */
6170
6171 if (lengthptr != NULL)
6172 {
6173 int delta = (repeat_min - 1)*length_prevgroup;
6174 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
6175 (INT64_OR_DOUBLE)length_prevgroup >
6176 (INT64_OR_DOUBLE)INT_MAX ||
6177 OFLOW_MAX - *lengthptr < delta)
6178 {
6179 *errorcodeptr = ERR20;
6180 goto FAILED;
6181 }
6182 *lengthptr += delta;
6183 }
6184
6185 /* This is compiling for real. If there is a set first byte for
6186 the group, and we have not yet set a "required byte", set it. Make
6187 sure there is enough workspace for copying forward references before
6188 doing the copy. */
6189
6190 else
6191 {
6192 if (groupsetfirstchar && reqcharflags < 0)
6193 {
6194 reqchar = firstchar;
6195 reqcharflags = firstcharflags;
6196 }
6197
6198 for (i = 1; i < repeat_min; i++)
6199 {
6200 pcre_uchar *hc;
6201 size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6202 memcpy(code, previous, IN_UCHARS(len));
6203
6204 while (cd->hwm > cd->start_workspace + cd->workspace_size -
6205 WORK_SIZE_SAFETY_MARGIN -
6206 (this_hwm_offset - base_hwm_offset))
6207 {
6208 *errorcodeptr = expand_workspace(cd);
6209 if (*errorcodeptr != 0) goto FAILED;
6210 }
6211
6212 for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6213 hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6214 hc += LINK_SIZE)
6215 {
6216 PUT(cd->hwm, 0, GET(hc, 0) + len);
6217 cd->hwm += LINK_SIZE;
6218 }
6219 base_hwm_offset = this_hwm_offset;
6220 code += len;
6221 }
6222 }
6223 }
6224
6225 if (repeat_max > 0) repeat_max -= repeat_min;
6226 }
6227
6228 /* This code is common to both the zero and non-zero minimum cases. If
6229 the maximum is limited, it replicates the group in a nested fashion,
6230 remembering the bracket starts on a stack. In the case of a zero minimum,
6231 the first one was set up above. In all cases the repeat_max now specifies
6232 the number of additional copies needed. Again, we must remember to
6233 replicate entries on the forward reference list. */
6234
6235 if (repeat_max >= 0)
6236 {
6237 /* In the pre-compile phase, we don't actually do the replication. We
6238 just adjust the length as if we had. For each repetition we must add 1
6239 to the length for BRAZERO and for all but the last repetition we must
6240 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
6241 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
6242 a 64-bit integer type when available, otherwise double. */
6243
6244 if (lengthptr != NULL && repeat_max > 0)
6245 {
6246 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
6247 2 - 2*LINK_SIZE; /* Last one doesn't nest */
6248 if ((INT64_OR_DOUBLE)repeat_max *
6249 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
6250 > (INT64_OR_DOUBLE)INT_MAX ||
6251 OFLOW_MAX - *lengthptr < delta)
6252 {
6253 *errorcodeptr = ERR20;
6254 goto FAILED;
6255 }
6256 *lengthptr += delta;
6257 }
6258
6259 /* This is compiling for real */
6260
6261 else for (i = repeat_max - 1; i >= 0; i--)
6262 {
6263 pcre_uchar *hc;
6264 size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6265
6266 *code++ = OP_BRAZERO + repeat_type;
6267
6268 /* All but the final copy start a new nesting, maintaining the
6269 chain of brackets outstanding. */
6270
6271 if (i != 0)
6272 {
6273 int offset;
6274 *code++ = OP_BRA;
6275 offset = (bralink == NULL)? 0 : (int)(code - bralink);
6276 bralink = code;
6277 PUTINC(code, 0, offset);
6278 }
6279
6280 memcpy(code, previous, IN_UCHARS(len));
6281
6282 /* Ensure there is enough workspace for forward references before
6283 copying them. */
6284
6285 while (cd->hwm > cd->start_workspace + cd->workspace_size -
6286 WORK_SIZE_SAFETY_MARGIN -
6287 (this_hwm_offset - base_hwm_offset))
6288 {
6289 *errorcodeptr = expand_workspace(cd);
6290 if (*errorcodeptr != 0) goto FAILED;
6291 }
6292
6293 for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6294 hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6295 hc += LINK_SIZE)
6296 {
6297 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
6298 cd->hwm += LINK_SIZE;
6299 }
6300 base_hwm_offset = this_hwm_offset;
6301 code += len;
6302 }
6303
6304 /* Now chain through the pending brackets, and fill in their length
6305 fields (which are holding the chain links pro tem). */
6306
6307 while (bralink != NULL)
6308 {
6309 int oldlinkoffset;
6310 int offset = (int)(code - bralink + 1);
6311 pcre_uchar *bra = code - offset;
6312 oldlinkoffset = GET(bra, 1);
6313 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
6314 *code++ = OP_KET;
6315 PUTINC(code, 0, offset);
6316 PUT(bra, 1, offset);
6317 }
6318 }
6319
6320 /* If the maximum is unlimited, set a repeater in the final copy. For
6321 ONCE brackets, that's all we need to do. However, possessively repeated
6322 ONCE brackets can be converted into non-capturing brackets, as the
6323 behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
6324 deal with possessive ONCEs specially.
6325
6326 Otherwise, when we are doing the actual compile phase, check to see
6327 whether this group is one that could match an empty string. If so,
6328 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
6329 that runtime checking can be done. [This check is also applied to ONCE
6330 groups at runtime, but in a different way.]
6331
6332 Then, if the quantifier was possessive and the bracket is not a
6333 conditional, we convert the BRA code to the POS form, and the KET code to
6334 KETRPOS. (It turns out to be convenient at runtime to detect this kind of
6335 subpattern at both the start and at the end.) The use of special opcodes
6336 makes it possible to reduce greatly the stack usage in pcre_exec(). If
6337 the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
6338
6339 Then, if the minimum number of matches is 1 or 0, cancel the possessive
6340 flag so that the default action below, of wrapping everything inside
6341 atomic brackets, does not happen. When the minimum is greater than 1,
6342 there will be earlier copies of the group, and so we still have to wrap
6343 the whole thing. */
6344
6345 else
6346 {
6347 pcre_uchar *ketcode = code - 1 - LINK_SIZE;
6348 pcre_uchar *bracode = ketcode - GET(ketcode, 1);
6349
6350 /* Convert possessive ONCE brackets to non-capturing */
6351
6352 if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
6353 possessive_quantifier) *bracode = OP_BRA;
6354
6355 /* For non-possessive ONCE brackets, all we need to do is to
6356 set the KET. */
6357
6358 if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
6359 *ketcode = OP_KETRMAX + repeat_type;
6360
6361 /* Handle non-ONCE brackets and possessive ONCEs (which have been
6362 converted to non-capturing above). */
6363
6364 else
6365 {
6366 /* In the compile phase, check for empty string matching. */
6367
6368 if (lengthptr == NULL)
6369 {
6370 pcre_uchar *scode = bracode;
6371 do
6372 {
6373 if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
6374 {
6375 *bracode += OP_SBRA - OP_BRA;
6376 break;
6377 }
6378 scode += GET(scode, 1);
6379 }
6380 while (*scode == OP_ALT);
6381 }
6382
6383 /* A conditional group with only one branch has an implicit empty
6384 alternative branch. */
6385
6386 if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
6387 *bracode = OP_SCOND;
6388
6389 /* Handle possessive quantifiers. */
6390
6391 if (possessive_quantifier)
6392 {
6393 /* For COND brackets, we wrap the whole thing in a possessively
6394 repeated non-capturing bracket, because we have not invented POS
6395 versions of the COND opcodes. Because we are moving code along, we
6396 must ensure that any pending recursive references are updated. */
6397
6398 if (*bracode == OP_COND || *bracode == OP_SCOND)
6399 {
6400 int nlen = (int)(code - bracode);
6401 *code = OP_END;
6402 adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6403 memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
6404 code += 1 + LINK_SIZE;
6405 nlen += 1 + LINK_SIZE;
6406 *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
6407 *code++ = OP_KETRPOS;
6408 PUTINC(code, 0, nlen);
6409 PUT(bracode, 1, nlen);
6410 }
6411
6412 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
6413
6414 else
6415 {
6416 *bracode += 1; /* Switch to xxxPOS opcodes */
6417 *ketcode = OP_KETRPOS;
6418 }
6419
6420 /* If the minimum is zero, mark it as possessive, then unset the
6421 possessive flag when the minimum is 0 or 1. */
6422
6423 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
6424 if (repeat_min < 2) possessive_quantifier = FALSE;
6425 }
6426
6427 /* Non-possessive quantifier */
6428
6429 else *ketcode = OP_KETRMAX + repeat_type;
6430 }
6431 }
6432 }
6433
6434 /* If previous is OP_FAIL, it was generated by an empty class [] in
6435 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
6436 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
6437 error above. We can just ignore the repeat in JS case. */
6438
6439 else if (*previous == OP_FAIL) goto END_REPEAT;
6440
6441 /* Else there's some kind of shambles */
6442
6443 else
6444 {
6445 *errorcodeptr = ERR11;
6446 goto FAILED;
6447 }
6448
6449 /* If the character following a repeat is '+', possessive_quantifier is
6450 TRUE. For some opcodes, there are special alternative opcodes for this
6451 case. For anything else, we wrap the entire repeated item inside OP_ONCE
6452 brackets. Logically, the '+' notation is just syntactic sugar, taken from
6453 Sun's Java package, but the special opcodes can optimize it.
6454
6455 Some (but not all) possessively repeated subpatterns have already been
6456 completely handled in the code just above. For them, possessive_quantifier
6457 is always FALSE at this stage. Note that the repeated item starts at
6458 tempcode, not at previous, which might be the first part of a string whose
6459 (former) last char we repeated. */
6460
6461 if (possessive_quantifier)
6462 {
6463 int len;
6464
6465 /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6466 However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6467 {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6468 remains is greater than zero, there's a further opcode that can be
6469 handled. If not, do nothing, leaving the EXACT alone. */
6470
6471 switch(*tempcode)
6472 {
6473 case OP_TYPEEXACT:
6474 tempcode += PRIV(OP_lengths)[*tempcode] +
6475 ((tempcode[1 + IMM2_SIZE] == OP_PROP
6476 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6477 break;
6478
6479 /* CHAR opcodes are used for exacts whose count is 1. */
6480
6481 case OP_CHAR:
6482 case OP_CHARI:
6483 case OP_NOT:
6484 case OP_NOTI:
6485 case OP_EXACT:
6486 case OP_EXACTI:
6487 case OP_NOTEXACT:
6488 case OP_NOTEXACTI:
6489 tempcode += PRIV(OP_lengths)[*tempcode];
6490#ifdef SUPPORT_UTF
6491 if (utf && HAS_EXTRALEN(tempcode[-1]))
6492 tempcode += GET_EXTRALEN(tempcode[-1]);
6493#endif
6494 break;
6495
6496 /* For the class opcodes, the repeat operator appears at the end;
6497 adjust tempcode to point to it. */
6498
6499 case OP_CLASS:
6500 case OP_NCLASS:
6501 tempcode += 1 + 32/sizeof(pcre_uchar);
6502 break;
6503
6504#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6505 case OP_XCLASS:
6506 tempcode += GET(tempcode, 1);
6507 break;
6508#endif
6509 }
6510
6511 /* If tempcode is equal to code (which points to the end of the repeated
6512 item), it means we have skipped an EXACT item but there is no following
6513 QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6514 all other cases, tempcode will be pointing to the repeat opcode, and will
6515 be less than code, so the value of len will be greater than 0. */
6516
6517 len = (int)(code - tempcode);
6518 if (len > 0)
6519 {
6520 unsigned int repcode = *tempcode;
6521
6522 /* There is a table for possessifying opcodes, all of which are less
6523 than OP_CALLOUT. A zero entry means there is no possessified version.
6524 */
6525
6526 if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6527 *tempcode = opcode_possessify[repcode];
6528
6529 /* For an opcode without a special possessified version, wrap the item in
6530 ONCE brackets. Because we are moving code along, we must ensure that any
6531 pending recursive references are updated. */
6532
6533 else
6534 {
6535 *code = OP_END;
6536 adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6537 memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6538 code += 1 + LINK_SIZE;
6539 len += 1 + LINK_SIZE;
6540 tempcode[0] = OP_ONCE;
6541 *code++ = OP_KET;
6542 PUTINC(code, 0, len);
6543 PUT(tempcode, 1, len);
6544 }
6545 }
6546
6547#ifdef NEVER
6548 if (len > 0) switch (*tempcode)
6549 {
6550 case OP_STAR: *tempcode = OP_POSSTAR; break;
6551 case OP_PLUS: *tempcode = OP_POSPLUS; break;
6552 case OP_QUERY: *tempcode = OP_POSQUERY; break;
6553 case OP_UPTO: *tempcode = OP_POSUPTO; break;
6554
6555 case OP_STARI: *tempcode = OP_POSSTARI; break;
6556 case OP_PLUSI: *tempcode = OP_POSPLUSI; break;
6557 case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
6558 case OP_UPTOI: *tempcode = OP_POSUPTOI; break;
6559
6560 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
6561 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
6562 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
6563 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
6564
6565 case OP_NOTSTARI: *tempcode = OP_NOTPOSSTARI; break;
6566 case OP_NOTPLUSI: *tempcode = OP_NOTPOSPLUSI; break;
6567 case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
6568 case OP_NOTUPTOI: *tempcode = OP_NOTPOSUPTOI; break;
6569
6570 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
6571 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
6572 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6573 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
6574
6575 case OP_CRSTAR: *tempcode = OP_CRPOSSTAR; break;
6576 case OP_CRPLUS: *tempcode = OP_CRPOSPLUS; break;
6577 case OP_CRQUERY: *tempcode = OP_CRPOSQUERY; break;
6578 case OP_CRRANGE: *tempcode = OP_CRPOSRANGE; break;
6579
6580 /* Because we are moving code along, we must ensure that any
6581 pending recursive references are updated. */
6582
6583 default:
6584 *code = OP_END;
6585 adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6586 memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6587 code += 1 + LINK_SIZE;
6588 len += 1 + LINK_SIZE;
6589 tempcode[0] = OP_ONCE;
6590 *code++ = OP_KET;
6591 PUTINC(code, 0, len);
6592 PUT(tempcode, 1, len);
6593 break;
6594 }
6595#endif
6596 }
6597
6598 /* In all cases we no longer have a previous item. We also set the
6599 "follows varying string" flag for subsequently encountered reqchars if
6600 it isn't already set and we have just passed a varying length item. */
6601
6602 END_REPEAT:
6603 previous = NULL;
6604 cd->req_varyopt |= reqvary;
6605 break;
6606
6607
6608 /* ===================================================================*/
6609 /* Start of nested parenthesized sub-expression, or comment or lookahead or
6610 lookbehind or option setting or condition or all the other extended
6611 parenthesis forms. */
6612
6613 case CHAR_LEFT_PARENTHESIS:
6614 ptr++;
6615
6616 /* Now deal with various "verbs" that can be introduced by '*'. */
6617
6618 if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
6619 || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
6620 {
6621 int i, namelen;
6622 int arglen = 0;
6623 const char *vn = verbnames;
6624 const pcre_uchar *name = ptr + 1;
6625 const pcre_uchar *arg = NULL;
6626 previous = NULL;
6627 ptr++;
6628 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
6629 namelen = (int)(ptr - name);
6630
6631 /* It appears that Perl allows any characters whatsoever, other than
6632 a closing parenthesis, to appear in arguments, so we no longer insist on
6633 letters, digits, and underscores. */
6634
6635 if (*ptr == CHAR_COLON)
6636 {
6637 arg = ++ptr;
6638 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6639 arglen = (int)(ptr - arg);
6640 if ((unsigned int)arglen > MAX_MARK)
6641 {
6642 *errorcodeptr = ERR75;
6643 goto FAILED;
6644 }
6645 }
6646
6647 if (*ptr != CHAR_RIGHT_PARENTHESIS)
6648 {
6649 *errorcodeptr = ERR60;
6650 goto FAILED;
6651 }
6652
6653 /* Scan the table of verb names */
6654
6655 for (i = 0; i < verbcount; i++)
6656 {
6657 if (namelen == verbs[i].len &&
6658 STRNCMP_UC_C8(name, vn, namelen) == 0)
6659 {
6660 int setverb;
6661
6662 /* Check for open captures before ACCEPT and convert it to
6663 ASSERT_ACCEPT if in an assertion. */
6664
6665 if (verbs[i].op == OP_ACCEPT)
6666 {
6667 open_capitem *oc;
6668 if (arglen != 0)
6669 {
6670 *errorcodeptr = ERR59;
6671 goto FAILED;
6672 }
6673 cd->had_accept = TRUE;
6674 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6675 {
6676 if (lengthptr != NULL)
6677 {
6678#ifdef COMPILE_PCRE8
6679 *lengthptr += 1 + IMM2_SIZE;
6680#elif defined COMPILE_PCRE16
6681 *lengthptr += 2 + IMM2_SIZE;
6682#elif defined COMPILE_PCRE32
6683 *lengthptr += 4 + IMM2_SIZE;
6684#endif
6685 }
6686 else
6687 {
6688 *code++ = OP_CLOSE;
6689 PUT2INC(code, 0, oc->number);
6690 }
6691 }
6692 setverb = *code++ =
6693 (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6694
6695 /* Do not set firstchar after *ACCEPT */
6696 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6697 }
6698
6699 /* Handle other cases with/without an argument */
6700
6701 else if (arglen == 0)
6702 {
6703 if (verbs[i].op < 0) /* Argument is mandatory */
6704 {
6705 *errorcodeptr = ERR66;
6706 goto FAILED;
6707 }
6708 setverb = *code++ = verbs[i].op;
6709 }
6710
6711 else
6712 {
6713 if (verbs[i].op_arg < 0) /* Argument is forbidden */
6714 {
6715 *errorcodeptr = ERR59;
6716 goto FAILED;
6717 }
6718 setverb = *code++ = verbs[i].op_arg;
6719 if (lengthptr != NULL) /* In pass 1 just add in the length */
6720 { /* to avoid potential workspace */
6721 *lengthptr += arglen; /* overflow. */
6722 *code++ = 0;
6723 }
6724 else
6725 {
6726 *code++ = arglen;
6727 memcpy(code, arg, IN_UCHARS(arglen));
6728 code += arglen;
6729 }
6730 *code++ = 0;
6731 }
6732
6733 switch (setverb)
6734 {
6735 case OP_THEN:
6736 case OP_THEN_ARG:
6737 cd->external_flags |= PCRE_HASTHEN;
6738 break;
6739
6740 case OP_PRUNE:
6741 case OP_PRUNE_ARG:
6742 case OP_SKIP:
6743 case OP_SKIP_ARG:
6744 cd->had_pruneorskip = TRUE;
6745 break;
6746 }
6747
6748 break; /* Found verb, exit loop */
6749 }
6750
6751 vn += verbs[i].len + 1;
6752 }
6753
6754 if (i < verbcount) continue; /* Successfully handled a verb */
6755 *errorcodeptr = ERR60; /* Verb not recognized */
6756 goto FAILED;
6757 }
6758
6759 /* Initialize for "real" parentheses */
6760
6761 newoptions = options;
6762 skipbytes = 0;
6763 bravalue = OP_CBRA;
6764 item_hwm_offset = cd->hwm - cd->start_workspace;
6765 reset_bracount = FALSE;
6766
6767 /* Deal with the extended parentheses; all are introduced by '?', and the
6768 appearance of any of them means that this is not a capturing group. */
6769
6770 if (*ptr == CHAR_QUESTION_MARK)
6771 {
6772 int i, set, unset, namelen;
6773 int *optset;
6774 const pcre_uchar *name;
6775 pcre_uchar *slot;
6776
6777 switch (*(++ptr))
6778 {
6779 /* ------------------------------------------------------------ */
6780 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
6781 reset_bracount = TRUE;
6782 cd->dupgroups = TRUE; /* Record (?| encountered */
6783 /* Fall through */
6784
6785 /* ------------------------------------------------------------ */
6786 case CHAR_COLON: /* Non-capturing bracket */
6787 bravalue = OP_BRA;
6788 ptr++;
6789 break;
6790
6791
6792 /* ------------------------------------------------------------ */
6793 case CHAR_LEFT_PARENTHESIS:
6794 bravalue = OP_COND; /* Conditional group */
6795 tempptr = ptr;
6796
6797 /* A condition can be an assertion, a number (referring to a numbered
6798 group's having been set), a name (referring to a named group), or 'R',
6799 referring to recursion. R<digits> and R&name are also permitted for
6800 recursion tests.
6801
6802 There are ways of testing a named group: (?(name)) is used by Python;
6803 Perl 5.10 onwards uses (?(<name>) or (?('name')).
6804
6805 There is one unfortunate ambiguity, caused by history. 'R' can be the
6806 recursive thing or the name 'R' (and similarly for 'R' followed by
6807 digits). We look for a name first; if not found, we try the other case.
6808
6809 For compatibility with auto-callouts, we allow a callout to be
6810 specified before a condition that is an assertion. First, check for the
6811 syntax of a callout; if found, adjust the temporary pointer that is
6812 used to check for an assertion condition. That's all that is needed! */
6813
6814 if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6815 {
6816 for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6817 if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6818 tempptr += i + 1;
6819
6820 /* tempptr should now be pointing to the opening parenthesis of the
6821 assertion condition. */
6822
6823 if (*tempptr != CHAR_LEFT_PARENTHESIS)
6824 {
6825 *errorcodeptr = ERR28;
6826 goto FAILED;
6827 }
6828 }
6829
6830 /* For conditions that are assertions, check the syntax, and then exit
6831 the switch. This will take control down to where bracketed groups,
6832 including assertions, are processed. */
6833
6834 if (tempptr[1] == CHAR_QUESTION_MARK &&
6835 (tempptr[2] == CHAR_EQUALS_SIGN ||
6836 tempptr[2] == CHAR_EXCLAMATION_MARK ||
6837 (tempptr[2] == CHAR_LESS_THAN_SIGN &&
6838 (tempptr[3] == CHAR_EQUALS_SIGN ||
6839 tempptr[3] == CHAR_EXCLAMATION_MARK))))
6840 {
6841 cd->iscondassert = TRUE;
6842 break;
6843 }
6844
6845 /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6846 need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6847
6848 code[1+LINK_SIZE] = OP_CREF;
6849 skipbytes = 1+IMM2_SIZE;
6850 refsign = -1; /* => not a number */
6851 namelen = -1; /* => not a name; must set to avoid warning */
6852 name = NULL; /* Always set to avoid warning */
6853 recno = 0; /* Always set to avoid warning */
6854
6855 /* Check for a test for recursion in a named group. */
6856
6857 ptr++;
6858 if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
6859 {
6860 terminator = -1;
6861 ptr += 2;
6862 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
6863 }
6864
6865 /* Check for a test for a named group's having been set, using the Perl
6866 syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6867 syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
6868
6869 else if (*ptr == CHAR_LESS_THAN_SIGN)
6870 {
6871 terminator = CHAR_GREATER_THAN_SIGN;
6872 ptr++;
6873 }
6874 else if (*ptr == CHAR_APOSTROPHE)
6875 {
6876 terminator = CHAR_APOSTROPHE;
6877 ptr++;
6878 }
6879 else
6880 {
6881 terminator = CHAR_NULL;
6882 if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
6883 else if (IS_DIGIT(*ptr)) refsign = 0;
6884 }
6885
6886 /* Handle a number */
6887
6888 if (refsign >= 0)
6889 {
6890 while (IS_DIGIT(*ptr))
6891 {
6892 if (recno > INT_MAX / 10 - 1) /* Integer overflow */
6893 {
6894 while (IS_DIGIT(*ptr)) ptr++;
6895 *errorcodeptr = ERR61;
6896 goto FAILED;
6897 }
6898 recno = recno * 10 + (int)(*ptr - CHAR_0);
6899 ptr++;
6900 }
6901 }
6902
6903 /* Otherwise we expect to read a name; anything else is an error. When
6904 a name is one of a number of duplicates, a different opcode is used and
6905 it needs more memory. Unfortunately we cannot tell whether a name is a
6906 duplicate in the first pass, so we have to allow for more memory. */
6907
6908 else
6909 {
6910 if (IS_DIGIT(*ptr))
6911 {
6912 *errorcodeptr = ERR84;
6913 goto FAILED;
6914 }
6915 if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_word) == 0)
6916 {
6917 *errorcodeptr = ERR28; /* Assertion expected */
6918 goto FAILED;
6919 }
6920 name = ptr++;
6921 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6922 {
6923 ptr++;
6924 }
6925 namelen = (int)(ptr - name);
6926 if (lengthptr != NULL) skipbytes += IMM2_SIZE;
6927 }
6928
6929 /* Check the terminator */
6930
6931 if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6932 *ptr++ != CHAR_RIGHT_PARENTHESIS)
6933 {
6934 ptr--; /* Error offset */
6935 *errorcodeptr = ERR26; /* Malformed number or name */
6936 goto FAILED;
6937 }
6938
6939 /* Do no further checking in the pre-compile phase. */
6940
6941 if (lengthptr != NULL) break;
6942
6943 /* In the real compile we do the work of looking for the actual
6944 reference. If refsign is not negative, it means we have a number in
6945 recno. */
6946
6947 if (refsign >= 0)
6948 {
6949 if (recno <= 0)
6950 {
6951 *errorcodeptr = ERR35;
6952 goto FAILED;
6953 }
6954 if (refsign != 0) recno = (refsign == CHAR_MINUS)?
6955 cd->bracount - recno + 1 : recno + cd->bracount;
6956 if (recno <= 0 || recno > cd->final_bracount)
6957 {
6958 *errorcodeptr = ERR15;
6959 goto FAILED;
6960 }
6961 PUT2(code, 2+LINK_SIZE, recno);
6962 if (recno > cd->top_backref) cd->top_backref = recno;
6963 break;
6964 }
6965
6966 /* Otherwise look for the name. */
6967
6968 slot = cd->name_table;
6969 for (i = 0; i < cd->names_found; i++)
6970 {
6971 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
6972 slot[IMM2_SIZE+namelen] == 0) break;
6973 slot += cd->name_entry_size;
6974 }
6975
6976 /* Found the named subpattern. If the name is duplicated, add one to
6977 the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6978 appropriate data values. Otherwise, just insert the unique subpattern
6979 number. */
6980
6981 if (i < cd->names_found)
6982 {
6983 int offset = i++;
6984 int count = 1;
6985 recno = GET2(slot, 0); /* Number from first found */
6986 if (recno > cd->top_backref) cd->top_backref = recno;
6987 for (; i < cd->names_found; i++)
6988 {
6989 slot += cd->name_entry_size;
6990 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0 ||
6991 (slot+IMM2_SIZE)[namelen] != 0) break;
6992 count++;
6993 }
6994
6995 if (count > 1)
6996 {
6997 PUT2(code, 2+LINK_SIZE, offset);
6998 PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6999 skipbytes += IMM2_SIZE;
7000 code[1+LINK_SIZE]++;
7001 }
7002 else /* Not a duplicated name */
7003 {
7004 PUT2(code, 2+LINK_SIZE, recno);
7005 }
7006 }
7007
7008 /* If terminator == CHAR_NULL it means that the name followed directly
7009 after the opening parenthesis [e.g. (?(abc)...] and in this case there
7010 are some further alternatives to try. For the cases where terminator !=
7011 CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
7012 we have now checked all the possibilities, so give an error. */
7013
7014 else if (terminator != CHAR_NULL)
7015 {
7016 *errorcodeptr = ERR15;
7017 goto FAILED;
7018 }
7019
7020 /* Check for (?(R) for recursion. Allow digits after R to specify a
7021 specific group number. */
7022
7023 else if (*name == CHAR_R)
7024 {
7025 recno = 0;
7026 for (i = 1; i < namelen; i++)
7027 {
7028 if (!IS_DIGIT(name[i]))
7029 {
7030 *errorcodeptr = ERR15;
7031 goto FAILED;
7032 }
7033 if (recno > INT_MAX / 10 - 1) /* Integer overflow */
7034 {
7035 *errorcodeptr = ERR61;
7036 goto FAILED;
7037 }
7038 recno = recno * 10 + name[i] - CHAR_0;
7039 }
7040 if (recno == 0) recno = RREF_ANY;
7041 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
7042 PUT2(code, 2+LINK_SIZE, recno);
7043 }
7044
7045 /* Similarly, check for the (?(DEFINE) "condition", which is always
7046 false. */
7047
7048 else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
7049 {
7050 code[1+LINK_SIZE] = OP_DEF;
7051 skipbytes = 1;
7052 }
7053
7054 /* Reference to an unidentified subpattern. */
7055
7056 else
7057 {
7058 *errorcodeptr = ERR15;
7059 goto FAILED;
7060 }
7061 break;
7062
7063
7064 /* ------------------------------------------------------------ */
7065 case CHAR_EQUALS_SIGN: /* Positive lookahead */
7066 bravalue = OP_ASSERT;
7067 cd->assert_depth += 1;
7068 ptr++;
7069 break;
7070
7071 /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
7072 thing to do, but Perl allows all assertions to be quantified, and when
7073 they contain capturing parentheses there may be a potential use for
7074 this feature. Not that that applies to a quantified (?!) but we allow
7075 it for uniformity. */
7076
7077 /* ------------------------------------------------------------ */
7078 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
7079 ptr++;
7080 if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
7081 ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
7082 (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
7083 {
7084 *code++ = OP_FAIL;
7085 previous = NULL;
7086 continue;
7087 }
7088 bravalue = OP_ASSERT_NOT;
7089 cd->assert_depth += 1;
7090 break;
7091
7092
7093 /* ------------------------------------------------------------ */
7094 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
7095 switch (ptr[1])
7096 {
7097 case CHAR_EQUALS_SIGN: /* Positive lookbehind */
7098 bravalue = OP_ASSERTBACK;
7099 cd->assert_depth += 1;
7100 ptr += 2;
7101 break;
7102
7103 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
7104 bravalue = OP_ASSERTBACK_NOT;
7105 cd->assert_depth += 1;
7106 ptr += 2;
7107 break;
7108
7109 default: /* Could be name define, else bad */
7110 if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
7111 goto DEFINE_NAME;
7112 ptr++; /* Correct offset for error */
7113 *errorcodeptr = ERR24;
7114 goto FAILED;
7115 }
7116 break;
7117
7118
7119 /* ------------------------------------------------------------ */
7120 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
7121 bravalue = OP_ONCE;
7122 ptr++;
7123 break;
7124
7125
7126 /* ------------------------------------------------------------ */
7127 case CHAR_C: /* Callout - may be followed by digits; */
7128 previous_callout = code; /* Save for later completion */
7129 after_manual_callout = 1; /* Skip one item before completing */
7130 *code++ = OP_CALLOUT;
7131 {
7132 int n = 0;
7133 ptr++;
7134 while(IS_DIGIT(*ptr))
7135 n = n * 10 + *ptr++ - CHAR_0;
7136 if (*ptr != CHAR_RIGHT_PARENTHESIS)
7137 {
7138 *errorcodeptr = ERR39;
7139 goto FAILED;
7140 }
7141 if (n > 255)
7142 {
7143 *errorcodeptr = ERR38;
7144 goto FAILED;
7145 }
7146 *code++ = n;
7147 PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
7148 PUT(code, LINK_SIZE, 0); /* Default length */
7149 code += 2 * LINK_SIZE;
7150 }
7151 previous = NULL;
7152 continue;
7153
7154
7155 /* ------------------------------------------------------------ */
7156 case CHAR_P: /* Python-style named subpattern handling */
7157 if (*(++ptr) == CHAR_EQUALS_SIGN ||
7158 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
7159 {
7160 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
7161 terminator = CHAR_RIGHT_PARENTHESIS;
7162 goto NAMED_REF_OR_RECURSE;
7163 }
7164 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
7165 {
7166 *errorcodeptr = ERR41;
7167 goto FAILED;
7168 }
7169 /* Fall through to handle (?P< as (?< is handled */
7170
7171
7172 /* ------------------------------------------------------------ */
7173 DEFINE_NAME: /* Come here from (?< handling */
7174 case CHAR_APOSTROPHE:
7175 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
7176 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7177 name = ++ptr;
7178 if (IS_DIGIT(*ptr))
7179 {
7180 *errorcodeptr = ERR84; /* Group name must start with non-digit */
7181 goto FAILED;
7182 }
7183 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7184 namelen = (int)(ptr - name);
7185
7186 /* In the pre-compile phase, do a syntax check, remember the longest
7187 name, and then remember the group in a vector, expanding it if
7188 necessary. Duplicates for the same number are skipped; other duplicates
7189 are checked for validity. In the actual compile, there is nothing to
7190 do. */
7191
7192 if (lengthptr != NULL)
7193 {
7194 named_group *ng;
7195 pcre_uint32 number = cd->bracount + 1;
7196
7197 if (*ptr != (pcre_uchar)terminator)
7198 {
7199 *errorcodeptr = ERR42;
7200 goto FAILED;
7201 }
7202
7203 if (cd->names_found >= MAX_NAME_COUNT)
7204 {
7205 *errorcodeptr = ERR49;
7206 goto FAILED;
7207 }
7208
7209 if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
7210 {
7211 cd->name_entry_size = namelen + IMM2_SIZE + 1;
7212 if (namelen > MAX_NAME_SIZE)
7213 {
7214 *errorcodeptr = ERR48;
7215 goto FAILED;
7216 }
7217 }
7218
7219 /* Scan the list to check for duplicates. For duplicate names, if the
7220 number is the same, break the loop, which causes the name to be
7221 discarded; otherwise, if DUPNAMES is not set, give an error.
7222 If it is set, allow the name with a different number, but continue
7223 scanning in case this is a duplicate with the same number. For
7224 non-duplicate names, give an error if the number is duplicated. */
7225
7226 ng = cd->named_groups;
7227 for (i = 0; i < cd->names_found; i++, ng++)
7228 {
7229 if (namelen == ng->length &&
7230 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7231 {
7232 if (ng->number == number) break;
7233 if ((options & PCRE_DUPNAMES) == 0)
7234 {
7235 *errorcodeptr = ERR43;
7236 goto FAILED;
7237 }
7238 cd->dupnames = TRUE; /* Duplicate names exist */
7239 }
7240 else if (ng->number == number)
7241 {
7242 *errorcodeptr = ERR65;
7243 goto FAILED;
7244 }
7245 }
7246
7247 if (i >= cd->names_found) /* Not a duplicate with same number */
7248 {
7249 /* Increase the list size if necessary */
7250
7251 if (cd->names_found >= cd->named_group_list_size)
7252 {
7253 int newsize = cd->named_group_list_size * 2;
7254 named_group *newspace = (PUBL(malloc))
7255 (newsize * sizeof(named_group));
7256
7257 if (newspace == NULL)
7258 {
7259 *errorcodeptr = ERR21;
7260 goto FAILED;
7261 }
7262
7263 memcpy(newspace, cd->named_groups,
7264 cd->named_group_list_size * sizeof(named_group));
7265 if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
7266 (PUBL(free))((void *)cd->named_groups);
7267 cd->named_groups = newspace;
7268 cd->named_group_list_size = newsize;
7269 }
7270
7271 cd->named_groups[cd->names_found].name = name;
7272 cd->named_groups[cd->names_found].length = namelen;
7273 cd->named_groups[cd->names_found].number = number;
7274 cd->names_found++;
7275 }
7276 }
7277
7278 ptr++; /* Move past > or ' in both passes. */
7279 goto NUMBERED_GROUP;
7280
7281
7282 /* ------------------------------------------------------------ */
7283 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
7284 terminator = CHAR_RIGHT_PARENTHESIS;
7285 is_recurse = TRUE;
7286 /* Fall through */
7287
7288 /* We come here from the Python syntax above that handles both
7289 references (?P=name) and recursion (?P>name), as well as falling
7290 through from the Perl recursion syntax (?&name). We also come here from
7291 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
7292 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
7293
7294 NAMED_REF_OR_RECURSE:
7295 name = ++ptr;
7296 if (IS_DIGIT(*ptr))
7297 {
7298 *errorcodeptr = ERR84; /* Group name must start with non-digit */
7299 goto FAILED;
7300 }
7301 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7302 namelen = (int)(ptr - name);
7303
7304 /* In the pre-compile phase, do a syntax check. We used to just set
7305 a dummy reference number, because it was not used in the first pass.
7306 However, with the change of recursive back references to be atomic,
7307 we have to look for the number so that this state can be identified, as
7308 otherwise the incorrect length is computed. If it's not a backwards
7309 reference, the dummy number will do. */
7310
7311 if (lengthptr != NULL)
7312 {
7313 named_group *ng;
7314 recno = 0;
7315
7316 if (namelen == 0)
7317 {
7318 *errorcodeptr = ERR62;
7319 goto FAILED;
7320 }
7321 if (*ptr != (pcre_uchar)terminator)
7322 {
7323 *errorcodeptr = ERR42;
7324 goto FAILED;
7325 }
7326 if (namelen > MAX_NAME_SIZE)
7327 {
7328 *errorcodeptr = ERR48;
7329 goto FAILED;
7330 }
7331
7332 /* Count named back references. */
7333
7334 if (!is_recurse) cd->namedrefcount++;
7335
7336 /* We have to allow for a named reference to a duplicated name (this
7337 cannot be determined until the second pass). This needs an extra
7338 16-bit data item. */
7339
7340 *lengthptr += IMM2_SIZE;
7341
7342 /* If this is a forward reference and we are within a (?|...) group,
7343 the reference may end up as the number of a group which we are
7344 currently inside, that is, it could be a recursive reference. In the
7345 real compile this will be picked up and the reference wrapped with
7346 OP_ONCE to make it atomic, so we must allow space in case this occurs. */
7347
7348 /* In fact, this can happen for a non-forward reference because
7349 another group with the same number might be created later. This
7350 issue is fixed "properly" in PCRE2. As PCRE1 is now in maintenance
7351 only mode, we finesse the bug by allowing more memory always. */
7352
7353 *lengthptr += 4 + 4*LINK_SIZE;
7354
7355 /* It is even worse than that. The current reference may be to an
7356 existing named group with a different number (so apparently not
7357 recursive) but which later on is also attached to a group with the
7358 current number. This can only happen if (?| has been previously
7359 encountered. In that case, we allow yet more memory, just in case.
7360 (Again, this is fixed "properly" in PCRE2.) */
7361
7362 if (cd->dupgroups) *lengthptr += 4 + 4*LINK_SIZE;
7363
7364 /* Otherwise, check for recursion here. The name table does not exist
7365 in the first pass; instead we must scan the list of names encountered
7366 so far in order to get the number. If the name is not found, leave
7367 the value of recno as 0 for a forward reference. */
7368
7369 /* This patch (removing "else") fixes a problem when a reference is
7370 to multiple identically named nested groups from within the nest.
7371 Once again, it is not the "proper" fix, and it results in an
7372 over-allocation of memory. */
7373
7374 /* else */
7375 {
7376 ng = cd->named_groups;
7377 for (i = 0; i < cd->names_found; i++, ng++)
7378 {
7379 if (namelen == ng->length &&
7380 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7381 {
7382 open_capitem *oc;
7383 recno = ng->number;
7384 if (is_recurse) break;
7385 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7386 {
7387 if (oc->number == recno)
7388 {
7389 oc->flag = TRUE;
7390 break;
7391 }
7392 }
7393 }
7394 }
7395 }
7396 }
7397
7398 /* In the real compile, search the name table. We check the name
7399 first, and then check that we have reached the end of the name in the
7400 table. That way, if the name is longer than any in the table, the
7401 comparison will fail without reading beyond the table entry. */
7402
7403 else
7404 {
7405 slot = cd->name_table;
7406 for (i = 0; i < cd->names_found; i++)
7407 {
7408 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
7409 slot[IMM2_SIZE+namelen] == 0)
7410 break;
7411 slot += cd->name_entry_size;
7412 }
7413
7414 if (i < cd->names_found)
7415 {
7416 recno = GET2(slot, 0);
7417 }
7418 else
7419 {
7420 *errorcodeptr = ERR15;
7421 goto FAILED;
7422 }
7423 }
7424
7425 /* In both phases, for recursions, we can now go to the code that
7426 handles numerical recursion. */
7427
7428 if (is_recurse) goto HANDLE_RECURSION;
7429
7430 /* In the second pass we must see if the name is duplicated. If so, we
7431 generate a different opcode. */
7432
7433 if (lengthptr == NULL && cd->dupnames)
7434 {
7435 int count = 1;
7436 unsigned int index = i;
7437 pcre_uchar *cslot = slot + cd->name_entry_size;
7438
7439 for (i++; i < cd->names_found; i++)
7440 {
7441 if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
7442 count++;
7443 cslot += cd->name_entry_size;
7444 }
7445
7446 if (count > 1)
7447 {
7448 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7449 previous = code;
7450 item_hwm_offset = cd->hwm - cd->start_workspace;
7451 *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7452 PUT2INC(code, 0, index);
7453 PUT2INC(code, 0, count);
7454
7455 /* Process each potentially referenced group. */
7456
7457 for (; slot < cslot; slot += cd->name_entry_size)
7458 {
7459 open_capitem *oc;
7460 recno = GET2(slot, 0);
7461 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
7462 if (recno > cd->top_backref) cd->top_backref = recno;
7463
7464 /* Check to see if this back reference is recursive, that is, it
7465 is inside the group that it references. A flag is set so that the
7466 group can be made atomic. */
7467
7468 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7469 {
7470 if (oc->number == recno)
7471 {
7472 oc->flag = TRUE;
7473 break;
7474 }
7475 }
7476 }
7477
7478 continue; /* End of back ref handling */
7479 }
7480 }
7481
7482 /* First pass, or a non-duplicated name. */
7483
7484 goto HANDLE_REFERENCE;
7485
7486
7487 /* ------------------------------------------------------------ */
7488 case CHAR_R: /* Recursion, same as (?0) */
7489 recno = 0;
7490 if (*(++ptr) != CHAR_RIGHT_PARENTHESIS)
7491 {
7492 *errorcodeptr = ERR29;
7493 goto FAILED;
7494 }
7495 goto HANDLE_RECURSION;
7496
7497
7498 /* ------------------------------------------------------------ */
7499 case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
7500 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
7501 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
7502 {
7503 const pcre_uchar *called;
7504 terminator = CHAR_RIGHT_PARENTHESIS;
7505
7506 /* Come here from the \g<...> and \g'...' code (Oniguruma
7507 compatibility). However, the syntax has been checked to ensure that
7508 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
7509 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
7510 ever be taken. */
7511
7512 HANDLE_NUMERICAL_RECURSION:
7513
7514 if ((refsign = *ptr) == CHAR_PLUS)
7515 {
7516 ptr++;
7517 if (!IS_DIGIT(*ptr))
7518 {
7519 *errorcodeptr = ERR63;
7520 goto FAILED;
7521 }
7522 }
7523 else if (refsign == CHAR_MINUS)
7524 {
7525 if (!IS_DIGIT(ptr[1]))
7526 goto OTHER_CHAR_AFTER_QUERY;
7527 ptr++;
7528 }
7529
7530 recno = 0;
7531 while(IS_DIGIT(*ptr))
7532 {
7533 if (recno > INT_MAX / 10 - 1) /* Integer overflow */
7534 {
7535 while (IS_DIGIT(*ptr)) ptr++;
7536 *errorcodeptr = ERR61;
7537 goto FAILED;
7538 }
7539 recno = recno * 10 + *ptr++ - CHAR_0;
7540 }
7541
7542 if (*ptr != (pcre_uchar)terminator)
7543 {
7544 *errorcodeptr = ERR29;
7545 goto FAILED;
7546 }
7547
7548 if (refsign == CHAR_MINUS)
7549 {
7550 if (recno == 0)
7551 {
7552 *errorcodeptr = ERR58;
7553 goto FAILED;
7554 }
7555 recno = cd->bracount - recno + 1;
7556 if (recno <= 0)
7557 {
7558 *errorcodeptr = ERR15;
7559 goto FAILED;
7560 }
7561 }
7562 else if (refsign == CHAR_PLUS)
7563 {
7564 if (recno == 0)
7565 {
7566 *errorcodeptr = ERR58;
7567 goto FAILED;
7568 }
7569 recno += cd->bracount;
7570 }
7571
7572 /* Come here from code above that handles a named recursion */
7573
7574 HANDLE_RECURSION:
7575
7576 previous = code;
7577 item_hwm_offset = cd->hwm - cd->start_workspace;
7578 called = cd->start_code;
7579
7580 /* When we are actually compiling, find the bracket that is being
7581 referenced. Temporarily end the regex in case it doesn't exist before
7582 this point. If we end up with a forward reference, first check that
7583 the bracket does occur later so we can give the error (and position)
7584 now. Then remember this forward reference in the workspace so it can
7585 be filled in at the end. */
7586
7587 if (lengthptr == NULL)
7588 {
7589 *code = OP_END;
7590 if (recno != 0)
7591 called = PRIV(find_bracket)(cd->start_code, utf, recno);
7592
7593 /* Forward reference */
7594
7595 if (called == NULL)
7596 {
7597 if (recno > cd->final_bracount)
7598 {
7599 *errorcodeptr = ERR15;
7600 goto FAILED;
7601 }
7602
7603 /* Fudge the value of "called" so that when it is inserted as an
7604 offset below, what is actually inserted is the reference number
7605 of the group. Then remember the forward reference. */
7606
7607 called = cd->start_code + recno;
7608 if (cd->hwm >= cd->start_workspace + cd->workspace_size -
7609 WORK_SIZE_SAFETY_MARGIN)
7610 {
7611 *errorcodeptr = expand_workspace(cd);
7612 if (*errorcodeptr != 0) goto FAILED;
7613 }
7614 PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
7615 }
7616
7617 /* If not a forward reference, and the subpattern is still open,
7618 this is a recursive call. We check to see if this is a left
7619 recursion that could loop for ever, and diagnose that case. We
7620 must not, however, do this check if we are in a conditional
7621 subpattern because the condition might be testing for recursion in
7622 a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
7623 Forever loops are also detected at runtime, so those that occur in
7624 conditional subpatterns will be picked up then. */
7625
7626 else if (GET(called, 1) == 0 && cond_depth <= 0 &&
7627 could_be_empty(called, code, bcptr, utf, cd))
7628 {
7629 *errorcodeptr = ERR40;
7630 goto FAILED;
7631 }
7632 }
7633
7634 /* Insert the recursion/subroutine item. It does not have a set first
7635 character (relevant if it is repeated, because it will then be
7636 wrapped with ONCE brackets). */
7637
7638 *code = OP_RECURSE;
7639 PUT(code, 1, (int)(called - cd->start_code));
7640 code += 1 + LINK_SIZE;
7641 groupsetfirstchar = FALSE;
7642 }
7643
7644 /* Can't determine a first byte now */
7645
7646 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7647 zerofirstchar = firstchar;
7648 zerofirstcharflags = firstcharflags;
7649 continue;
7650
7651
7652 /* ------------------------------------------------------------ */
7653 default: /* Other characters: check option setting */
7654 OTHER_CHAR_AFTER_QUERY:
7655 set = unset = 0;
7656 optset = &set;
7657
7658 while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
7659 {
7660 switch (*ptr++)
7661 {
7662 case CHAR_MINUS: optset = &unset; break;
7663
7664 case CHAR_J: /* Record that it changed in the external options */
7665 *optset |= PCRE_DUPNAMES;
7666 cd->external_flags |= PCRE_JCHANGED;
7667 break;
7668
7669 case CHAR_i: *optset |= PCRE_CASELESS; break;
7670 case CHAR_m: *optset |= PCRE_MULTILINE; break;
7671 case CHAR_s: *optset |= PCRE_DOTALL; break;
7672 case CHAR_x: *optset |= PCRE_EXTENDED; break;
7673 case CHAR_U: *optset |= PCRE_UNGREEDY; break;
7674 case CHAR_X: *optset |= PCRE_EXTRA; break;
7675
7676 default: *errorcodeptr = ERR12;
7677 ptr--; /* Correct the offset */
7678 goto FAILED;
7679 }
7680 }
7681
7682 /* Set up the changed option bits, but don't change anything yet. */
7683
7684 newoptions = (options | set) & (~unset);
7685
7686 /* If the options ended with ')' this is not the start of a nested
7687 group with option changes, so the options change at this level.
7688 If we are not at the pattern start, reset the greedy defaults and the
7689 case value for firstchar and reqchar. */
7690
7691 if (*ptr == CHAR_RIGHT_PARENTHESIS)
7692 {
7693 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
7694 greedy_non_default = greedy_default ^ 1;
7695 req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
7696
7697 /* Change options at this level, and pass them back for use
7698 in subsequent branches. */
7699
7700 *optionsptr = options = newoptions;
7701 previous = NULL; /* This item can't be repeated */
7702 continue; /* It is complete */
7703 }
7704
7705 /* If the options ended with ':' we are heading into a nested group
7706 with possible change of options. Such groups are non-capturing and are
7707 not assertions of any kind. All we need to do is skip over the ':';
7708 the newoptions value is handled below. */
7709
7710 bravalue = OP_BRA;
7711 ptr++;
7712 } /* End of switch for character following (? */
7713 } /* End of (? handling */
7714
7715 /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
7716 is set, all unadorned brackets become non-capturing and behave like (?:...)
7717 brackets. */
7718
7719 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
7720 {
7721 bravalue = OP_BRA;
7722 }
7723
7724 /* Else we have a capturing group. */
7725
7726 else
7727 {
7728 NUMBERED_GROUP:
7729 cd->bracount += 1;
7730 PUT2(code, 1+LINK_SIZE, cd->bracount);
7731 skipbytes = IMM2_SIZE;
7732 }
7733
7734 /* Process nested bracketed regex. First check for parentheses nested too
7735 deeply. */
7736
7737 if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT)
7738 {
7739 *errorcodeptr = ERR82;
7740 goto FAILED;
7741 }
7742
7743 /* All assertions used not to be repeatable, but this was changed for Perl
7744 compatibility. All kinds can now be repeated except for assertions that are
7745 conditions (Perl also forbids these to be repeated). We copy code into a
7746 non-register variable (tempcode) in order to be able to pass its address
7747 because some compilers complain otherwise. At the start of a conditional
7748 group whose condition is an assertion, cd->iscondassert is set. We unset it
7749 here so as to allow assertions later in the group to be quantified. */
7750
7751 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT &&
7752 cd->iscondassert)
7753 {
7754 previous = NULL;
7755 cd->iscondassert = FALSE;
7756 }
7757 else
7758 {
7759 previous = code;
7760 item_hwm_offset = cd->hwm - cd->start_workspace;
7761 }
7762
7763 *code = bravalue;
7764 tempcode = code;
7765 tempreqvary = cd->req_varyopt; /* Save value before bracket */
7766 tempbracount = cd->bracount; /* Save value before bracket */
7767 length_prevgroup = 0; /* Initialize for pre-compile phase */
7768
7769 if (!compile_regex(
7770 newoptions, /* The complete new option state */
7771 &tempcode, /* Where to put code (updated) */
7772 &ptr, /* Input pointer (updated) */
7773 errorcodeptr, /* Where to put an error message */
7774 (bravalue == OP_ASSERTBACK ||
7775 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
7776 reset_bracount, /* True if (?| group */
7777 skipbytes, /* Skip over bracket number */
7778 cond_depth +
7779 ((bravalue == OP_COND)?1:0), /* Depth of condition subpatterns */
7780 &subfirstchar, /* For possible first char */
7781 &subfirstcharflags,
7782 &subreqchar, /* For possible last char */
7783 &subreqcharflags,
7784 bcptr, /* Current branch chain */
7785 cd, /* Tables block */
7786 (lengthptr == NULL)? NULL : /* Actual compile phase */
7787 &length_prevgroup /* Pre-compile phase */
7788 ))
7789 goto FAILED;
7790
7791 cd->parens_depth -= 1;
7792
7793 /* If this was an atomic group and there are no capturing groups within it,
7794 generate OP_ONCE_NC instead of OP_ONCE. */
7795
7796 if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
7797 *code = OP_ONCE_NC;
7798
7799 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
7800 cd->assert_depth -= 1;
7801
7802 /* At the end of compiling, code is still pointing to the start of the
7803 group, while tempcode has been updated to point past the end of the group.
7804 The pattern pointer (ptr) is on the bracket.
7805
7806 If this is a conditional bracket, check that there are no more than
7807 two branches in the group, or just one if it's a DEFINE group. We do this
7808 in the real compile phase, not in the pre-pass, where the whole group may
7809 not be available. */
7810
7811 if (bravalue == OP_COND && lengthptr == NULL)
7812 {
7813 pcre_uchar *tc = code;
7814 int condcount = 0;
7815
7816 do {
7817 condcount++;
7818 tc += GET(tc,1);
7819 }
7820 while (*tc != OP_KET);
7821
7822 /* A DEFINE group is never obeyed inline (the "condition" is always
7823 false). It must have only one branch. */
7824
7825 if (code[LINK_SIZE+1] == OP_DEF)
7826 {
7827 if (condcount > 1)
7828 {
7829 *errorcodeptr = ERR54;
7830 goto FAILED;
7831 }
7832 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
7833 }
7834
7835 /* A "normal" conditional group. If there is just one branch, we must not
7836 make use of its firstchar or reqchar, because this is equivalent to an
7837 empty second branch. */
7838
7839 else
7840 {
7841 if (condcount > 2)
7842 {
7843 *errorcodeptr = ERR27;
7844 goto FAILED;
7845 }
7846 if (condcount == 1) subfirstcharflags = subreqcharflags = REQ_NONE;
7847 }
7848 }
7849
7850 /* Error if hit end of pattern */
7851
7852 if (*ptr != CHAR_RIGHT_PARENTHESIS)
7853 {
7854 *errorcodeptr = ERR14;
7855 goto FAILED;
7856 }
7857
7858 /* In the pre-compile phase, update the length by the length of the group,
7859 less the brackets at either end. Then reduce the compiled code to just a
7860 set of non-capturing brackets so that it doesn't use much memory if it is
7861 duplicated by a quantifier.*/
7862
7863 if (lengthptr != NULL)
7864 {
7865 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
7866 {
7867 *errorcodeptr = ERR20;
7868 goto FAILED;
7869 }
7870 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
7871 code++; /* This already contains bravalue */
7872 PUTINC(code, 0, 1 + LINK_SIZE);
7873 *code++ = OP_KET;
7874 PUTINC(code, 0, 1 + LINK_SIZE);
7875 break; /* No need to waste time with special character handling */
7876 }
7877
7878 /* Otherwise update the main code pointer to the end of the group. */
7879
7880 code = tempcode;
7881
7882 /* For a DEFINE group, required and first character settings are not
7883 relevant. */
7884
7885 if (bravalue == OP_DEF) break;
7886
7887 /* Handle updating of the required and first characters for other types of
7888 group. Update for normal brackets of all kinds, and conditions with two
7889 branches (see code above). If the bracket is followed by a quantifier with
7890 zero repeat, we have to back off. Hence the definition of zeroreqchar and
7891 zerofirstchar outside the main loop so that they can be accessed for the
7892 back off. */
7893
7894 zeroreqchar = reqchar;
7895 zeroreqcharflags = reqcharflags;
7896 zerofirstchar = firstchar;
7897 zerofirstcharflags = firstcharflags;
7898 groupsetfirstchar = FALSE;
7899
7900 if (bravalue >= OP_ONCE)
7901 {
7902 /* If we have not yet set a firstchar in this branch, take it from the
7903 subpattern, remembering that it was set here so that a repeat of more
7904 than one can replicate it as reqchar if necessary. If the subpattern has
7905 no firstchar, set "none" for the whole branch. In both cases, a zero
7906 repeat forces firstchar to "none". */
7907
7908 if (firstcharflags == REQ_UNSET)
7909 {
7910 if (subfirstcharflags >= 0)
7911 {
7912 firstchar = subfirstchar;
7913 firstcharflags = subfirstcharflags;
7914 groupsetfirstchar = TRUE;
7915 }
7916 else firstcharflags = REQ_NONE;
7917 zerofirstcharflags = REQ_NONE;
7918 }
7919
7920 /* If firstchar was previously set, convert the subpattern's firstchar
7921 into reqchar if there wasn't one, using the vary flag that was in
7922 existence beforehand. */
7923
7924 else if (subfirstcharflags >= 0 && subreqcharflags < 0)
7925 {
7926 subreqchar = subfirstchar;
7927 subreqcharflags = subfirstcharflags | tempreqvary;
7928 }
7929
7930 /* If the subpattern set a required byte (or set a first byte that isn't
7931 really the first byte - see above), set it. */
7932
7933 if (subreqcharflags >= 0)
7934 {
7935 reqchar = subreqchar;
7936 reqcharflags = subreqcharflags;
7937 }
7938 }
7939
7940 /* For a forward assertion, we take the reqchar, if set, provided that the
7941 group has also set a first char. This can be helpful if the pattern that
7942 follows the assertion doesn't set a different char. For example, it's
7943 useful for /(?=abcde).+/. We can't set firstchar for an assertion, however
7944 because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
7945 the "real" "a" would then become a reqchar instead of a firstchar. This is
7946 overcome by a scan at the end if there's no firstchar, looking for an
7947 asserted first char. */
7948
7949 else if (bravalue == OP_ASSERT && subreqcharflags >= 0 &&
7950 subfirstcharflags >= 0)
7951 {
7952 reqchar = subreqchar;
7953 reqcharflags = subreqcharflags;
7954 }
7955 break; /* End of processing '(' */
7956
7957
7958 /* ===================================================================*/
7959 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
7960 are arranged to be the negation of the corresponding OP_values in the
7961 default case when PCRE_UCP is not set. For the back references, the values
7962 are negative the reference number. Only back references and those types
7963 that consume a character may be repeated. We can test for values between
7964 ESC_b and ESC_Z for the latter; this may have to change if any new ones are
7965 ever created. */
7966
7967 case CHAR_BACKSLASH:
7968 tempptr = ptr;
7969 escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, FALSE);
7970 if (*errorcodeptr != 0) goto FAILED;
7971
7972 if (escape == 0) /* The escape coded a single character */
7973 c = ec;
7974 else
7975 {
7976 /* For metasequences that actually match a character, we disable the
7977 setting of a first character if it hasn't already been set. */
7978
7979 if (firstcharflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
7980 firstcharflags = REQ_NONE;
7981
7982 /* Set values to reset to if this is followed by a zero repeat. */
7983
7984 zerofirstchar = firstchar;
7985 zerofirstcharflags = firstcharflags;
7986 zeroreqchar = reqchar;
7987 zeroreqcharflags = reqcharflags;
7988
7989 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
7990 is a subroutine call by number (Oniguruma syntax). In fact, the value
7991 ESC_g is returned only for these cases. So we don't need to check for <
7992 or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
7993 -n, and for the Perl syntax \g{name} the result is ESC_k (as
7994 that is a synonym for a named back reference). */
7995
7996 if (escape == ESC_g)
7997 {
7998 const pcre_uchar *p;
7999 pcre_uint32 cf;
8000
8001 item_hwm_offset = cd->hwm - cd->start_workspace; /* Normally this is set when '(' is read */
8002 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
8003 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
8004
        /* These two statements stop the compiler from warning about possibly
8006 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
8007 fact, because we do the check for a number below, the paths that
8008 would actually be in error are never taken. */
8009
8010 skipbytes = 0;
8011 reset_bracount = FALSE;
8012
8013 /* If it's not a signed or unsigned number, treat it as a name. */
8014
8015 cf = ptr[1];
8016 if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
8017 {
8018 is_recurse = TRUE;
8019 goto NAMED_REF_OR_RECURSE;
8020 }
8021
8022 /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
8023 or a digit. */
8024
8025 p = ptr + 2;
8026 while (IS_DIGIT(*p)) p++;
8027 if (*p != (pcre_uchar)terminator)
8028 {
8029 *errorcodeptr = ERR57;
8030 goto FAILED;
8031 }
8032 ptr++;
8033 goto HANDLE_NUMERICAL_RECURSION;
8034 }
8035
8036 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
8037 We also support \k{name} (.NET syntax). */
8038
8039 if (escape == ESC_k)
8040 {
8041 if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
8042 ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
8043 {
8044 *errorcodeptr = ERR69;
8045 goto FAILED;
8046 }
8047 is_recurse = FALSE;
8048 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
8049 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
8050 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
8051 goto NAMED_REF_OR_RECURSE;
8052 }
8053
8054 /* Back references are handled specially; must disable firstchar if
8055 not set to cope with cases like (?=(\w+))\1: which would otherwise set
8056 ':' later. */
8057
8058 if (escape < 0)
8059 {
8060 open_capitem *oc;
8061 recno = -escape;
8062
8063 /* Come here from named backref handling when the reference is to a
        single group (i.e. not to a duplicated name). */
8065
8066 HANDLE_REFERENCE:
8067 if (firstcharflags == REQ_UNSET) zerofirstcharflags = firstcharflags = REQ_NONE;
8068 previous = code;
8069 item_hwm_offset = cd->hwm - cd->start_workspace;
8070 *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
8071 PUT2INC(code, 0, recno);
8072 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
8073 if (recno > cd->top_backref) cd->top_backref = recno;
8074
        /* Check to see if this back reference is recursive, that is, it
8076 is inside the group that it references. A flag is set so that the
8077 group can be made atomic. */
8078
8079 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
8080 {
8081 if (oc->number == recno)
8082 {
8083 oc->flag = TRUE;
8084 break;
8085 }
8086 }
8087 }
8088
8089 /* So are Unicode property matches, if supported. */
8090
8091#ifdef SUPPORT_UCP
8092 else if (escape == ESC_P || escape == ESC_p)
8093 {
8094 BOOL negated;
8095 unsigned int ptype = 0, pdata = 0;
8096 if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
8097 goto FAILED;
8098 previous = code;
8099 item_hwm_offset = cd->hwm - cd->start_workspace;
8100 *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
8101 *code++ = ptype;
8102 *code++ = pdata;
8103 }
8104#else
8105
8106 /* If Unicode properties are not supported, \X, \P, and \p are not
8107 allowed. */
8108
8109 else if (escape == ESC_X || escape == ESC_P || escape == ESC_p)
8110 {
8111 *errorcodeptr = ERR45;
8112 goto FAILED;
8113 }
8114#endif
8115
8116 /* For the rest (including \X when Unicode properties are supported), we
8117 can obtain the OP value by negating the escape value in the default
8118 situation when PCRE_UCP is not set. When it *is* set, we substitute
8119 Unicode property tests. Note that \b and \B do a one-character
8120 lookbehind, and \A also behaves as if it does. */
8121
8122 else
8123 {
8124 if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) &&
8125 cd->max_lookbehind == 0)
8126 cd->max_lookbehind = 1;
8127#ifdef SUPPORT_UCP
8128 if (escape >= ESC_DU && escape <= ESC_wu)
8129 {
8130 nestptr = ptr + 1; /* Where to resume */
8131 ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
8132 }
8133 else
8134#endif
8135 /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
8136 so that it works in DFA mode and in lookbehinds. */
8137
8138 {
8139 previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
8140 item_hwm_offset = cd->hwm - cd->start_workspace;
8141 *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
8142 }
8143 }
8144 continue;
8145 }
8146
8147 /* We have a data character whose value is in c. In UTF-8 mode it may have
8148 a value > 127. We set its representation in the length/buffer, and then
8149 handle it as a data character. */
8150
8151#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
8152 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
8153 mclength = PRIV(ord2utf)(c, mcbuffer);
8154 else
8155#endif
8156
8157 {
8158 mcbuffer[0] = c;
8159 mclength = 1;
8160 }
8161 goto ONE_CHAR;
8162
8163
8164 /* ===================================================================*/
8165 /* Handle a literal character. It is guaranteed not to be whitespace or #
8166 when the extended flag is set. If we are in a UTF mode, it may be a
8167 multi-unit literal character. */
8168
8169 default:
8170 NORMAL_CHAR:
8171 mclength = 1;
8172 mcbuffer[0] = c;
8173
8174#ifdef SUPPORT_UTF
8175 if (utf && HAS_EXTRALEN(c))
8176 ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
8177#endif
8178
8179 /* At this point we have the character's bytes in mcbuffer, and the length
8180 in mclength. When not in UTF-8 mode, the length is always 1. */
8181
8182 ONE_CHAR:
8183 previous = code;
8184 item_hwm_offset = cd->hwm - cd->start_workspace;
8185
8186 /* For caseless UTF-8 mode when UCP support is available, check whether
8187 this character has more than one other case. If so, generate a special
8188 OP_PROP item instead of OP_CHARI. */
8189
8190#ifdef SUPPORT_UCP
8191 if (utf && (options & PCRE_CASELESS) != 0)
8192 {
8193 GETCHAR(c, mcbuffer);
8194 if ((c = UCD_CASESET(c)) != 0)
8195 {
8196 *code++ = OP_PROP;
8197 *code++ = PT_CLIST;
8198 *code++ = c;
8199 if (firstcharflags == REQ_UNSET)
8200 firstcharflags = zerofirstcharflags = REQ_NONE;
8201 break;
8202 }
8203 }
8204#endif
8205
8206 /* Caseful matches, or not one of the multicase characters. */
8207
8208 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8209 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
8210
8211 /* Remember if \r or \n were seen */
8212
8213 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8214 cd->external_flags |= PCRE_HASCRORLF;
8215
8216 /* Set the first and required bytes appropriately. If no previous first
8217 byte, set it from this character, but revert to none on a zero repeat.
8218 Otherwise, leave the firstchar value alone, and don't change it on a zero
8219 repeat. */
8220
8221 if (firstcharflags == REQ_UNSET)
8222 {
8223 zerofirstcharflags = REQ_NONE;
8224 zeroreqchar = reqchar;
8225 zeroreqcharflags = reqcharflags;
8226
8227 /* If the character is more than one byte long, we can set firstchar
8228 only if it is not to be matched caselessly. */
8229
8230 if (mclength == 1 || req_caseopt == 0)
8231 {
8232 firstchar = mcbuffer[0];
8233 firstcharflags = req_caseopt;
8234
8235 if (mclength != 1)
8236 {
8237 reqchar = code[-1];
8238 reqcharflags = cd->req_varyopt;
8239 }
8240 }
8241 else firstcharflags = reqcharflags = REQ_NONE;
8242 }
8243
8244 /* firstchar was previously set; we can set reqchar only if the length is
8245 1 or the matching is caseful. */
8246
8247 else
8248 {
8249 zerofirstchar = firstchar;
8250 zerofirstcharflags = firstcharflags;
8251 zeroreqchar = reqchar;
8252 zeroreqcharflags = reqcharflags;
8253 if (mclength == 1 || req_caseopt == 0)
8254 {
8255 reqchar = code[-1];
8256 reqcharflags = req_caseopt | cd->req_varyopt;
8257 }
8258 }
8259
8260 break; /* End of literal character handling */
8261 }
8262 } /* end of big loop */
8263
8264
8265/* Control never reaches here by falling through, only by a goto for all the
8266error states. Pass back the position in the pattern so that it can be displayed
8267to the user for diagnosing the error. */
8268
8269FAILED:
8270*ptrptr = ptr;
8271return FALSE;
8272}
8273
8274
8275
/*************************************************
*     Compile sequence of alternatives           *
*************************************************/

/* Compile one bracketed group (or the whole pattern): a sequence of
alternative branches separated by '|'. Each branch is compiled by
compile_branch(); this function links the branches together, merges their
first/required character information, handles the bookkeeping for lookbehind
assertions, and terminates the group with OP_KET.

On entry, ptr is pointing past the bracket character, but on return it
points to the closing bracket, or vertical bar, or end of string. The code
variable is pointing at the byte into which the BRA operator has been stored.
This function is used during the pre-compile phase when we are trying to find
out the amount of memory needed, as well as during the real compile phase. The
value of lengthptr distinguishes the two phases.

Arguments:
  options           option bits, including any changes for this subpattern
  codeptr           -> the address of the current code pointer
  ptrptr            -> the address of the current pattern pointer
  errorcodeptr      -> pointer to error code variable
  lookbehind        TRUE if this is a lookbehind assertion
  reset_bracount    TRUE to reset the count for each branch
  skipbytes         skip this many bytes at start (for brackets and OP_COND)
  cond_depth        depth of nesting for conditional subpatterns
  firstcharptr      place to put the first required character
  firstcharflagsptr place to put the first character flags, or a negative number
  reqcharptr        place to put the last required character
  reqcharflagsptr   place to put the last required character flags, or a negative number
  bcptr             pointer to the chain of currently open branches
  cd                points to the data block with tables pointers etc.
  lengthptr         NULL during the real compile phase
                    points to length accumulator during pre-compile phase

Returns:            TRUE on success, with *codeptr, *ptrptr and the four
                    first/required character outputs updated;
                    FALSE on failure. This function itself stores ERR85
                    (stack guard refused), ERR20 (length overflow) or a
                    lookbehind error (ERR25/ERR36/ERR70); when compile_branch()
                    fails the error code is presumably already set by it.

NOTE(review): on the FALSE returns that occur after a capturing group has been
pushed onto cd->open_caps, the local capitem is not unlinked again, so
cd->open_caps is left pointing at this function's stack frame. Confirm that no
caller inspects cd->open_caps after a failed compile.
*/

static BOOL
compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
  int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
  int cond_depth,
  pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
  pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
  branch_chain *bcptr, compile_data *cd, int *lengthptr)
{
const pcre_uchar *ptr = *ptrptr;
pcre_uchar *code = *codeptr;
pcre_uchar *last_branch = code;      /* Start of the most recent branch (BRA or ALT) */
pcre_uchar *start_bracket = code;    /* The opening BRA item of this group */
pcre_uchar *reverse_count = NULL;    /* Where the OP_REVERSE length is stored */
open_capitem capitem;                /* Entry for cd->open_caps if capturing */
int capnumber = 0;                   /* Capture number; stays 0 if not OP_CBRA */
pcre_uint32 firstchar, reqchar;
pcre_int32 firstcharflags, reqcharflags;
pcre_uint32 branchfirstchar, branchreqchar;
pcre_int32 branchfirstcharflags, branchreqcharflags;
int length;                          /* Local length accumulator (pre-compile) */
unsigned int orig_bracount;          /* Bracket count on entry, for (?| resets */
unsigned int max_bracount;           /* Highest bracket count seen in any branch */
branch_chain bc;
size_t save_hwm_offset;

/* If set, call the external function that checks for stack availability. */

if (PUBL(stack_guard) != NULL && PUBL(stack_guard)())
  {
  *errorcodeptr= ERR85;
  return FALSE;
  }

/* Miscellaneous initialization */

bc.outer = bcptr;
bc.current_branch = code;

firstchar = reqchar = 0;
firstcharflags = reqcharflags = REQ_UNSET;

/* Remember the workspace high-water mark as an offset; it is passed to
adjust_recurse() below if the group has to be wrapped in OP_ONCE. */

save_hwm_offset = cd->hwm - cd->start_workspace;

/* Accumulate the length for use in the pre-compile phase. Start with the
length of the BRA and KET and any extra bytes that are required at the
beginning. We accumulate in a local variable to save frequent testing of
lengthptr for NULL. We cannot do this by looking at the value of code at the
start and end of each alternative, because compiled items are discarded during
the pre-compile phase so that the work space is not exceeded. */

length = 2 + 2*LINK_SIZE + skipbytes;

/* WARNING: If the above line is changed for any reason, you must also change
the code that abstracts option settings at the start of the pattern and makes
them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
pre-compile phase to find out whether anything has yet been compiled or not. */

/* If this is a capturing subpattern, add to the chain of open capturing items
so that we can detect them if (*ACCEPT) is encountered. This is also used to
detect groups that contain recursive back references to themselves. Note that
only OP_CBRA need be tested here; changing this opcode to one of its variants,
e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */

if (*code == OP_CBRA)
  {
  capnumber = GET2(code, 1 + LINK_SIZE);
  capitem.number = capnumber;
  capitem.next = cd->open_caps;
  capitem.flag = FALSE;
  cd->open_caps = &capitem;
  }

/* Offset is set zero to mark that this bracket is still open */

PUT(code, 1, 0);
code += 1 + LINK_SIZE + skipbytes;

/* Loop for each alternative branch */

orig_bracount = max_bracount = cd->bracount;
for (;;)
  {
  /* For a (?| group, reset the capturing bracket count so that each branch
  uses the same numbers. */

  if (reset_bracount) cd->bracount = orig_bracount;

  /* Set up dummy OP_REVERSE if lookbehind assertion. The real length is
  filled in after the branch has been compiled (real compile phase only). */

  if (lookbehind)
    {
    *code++ = OP_REVERSE;
    reverse_count = code;
    PUTINC(code, 0, 0);
    length += 1 + LINK_SIZE;
    }

  /* Now compile the branch; in the pre-compile phase its length gets added
  into the length. On failure, pass the pattern position back to the caller. */

  if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
        &branchfirstcharflags, &branchreqchar, &branchreqcharflags, &bc,
        cond_depth, cd, (lengthptr == NULL)? NULL : &length))
    {
    *ptrptr = ptr;
    return FALSE;
    }

  /* Keep the highest bracket count in case (?| was used and some branch
  has fewer than the rest. */

  if (cd->bracount > max_bracount) max_bracount = cd->bracount;

  /* In the real compile phase, there is some post-processing to be done. */

  if (lengthptr == NULL)
    {
    /* If this is the first branch, the firstchar and reqchar values for the
    branch become the values for the regex. (last_branch still points at the
    opening bracket rather than at an OP_ALT.) */

    if (*last_branch != OP_ALT)
      {
      firstchar = branchfirstchar;
      firstcharflags = branchfirstcharflags;
      reqchar = branchreqchar;
      reqcharflags = branchreqcharflags;
      }

    /* If this is not the first branch, the first char and reqchar have to
    match the values from all the previous branches, except that if the
    previous value for reqchar didn't have REQ_VARY set, it can still match,
    and we set REQ_VARY for the regex. */

    else
      {
      /* If we previously had a firstchar, but it doesn't match the new branch,
      we have to abandon the firstchar for the regex, but if there was
      previously no reqchar, it takes on the value of the old firstchar. */

      if (firstcharflags >= 0 &&
          (firstcharflags != branchfirstcharflags || firstchar != branchfirstchar))
        {
        if (reqcharflags < 0)
          {
          reqchar = firstchar;
          reqcharflags = firstcharflags;
          }
        firstcharflags = REQ_NONE;
        }

      /* If we (now or from before) have no firstchar, a firstchar from the
      branch becomes a reqchar if there isn't a branch reqchar. */

      if (firstcharflags < 0 && branchfirstcharflags >= 0 && branchreqcharflags < 0)
        {
        branchreqchar = branchfirstchar;
        branchreqcharflags = branchfirstcharflags;
        }

      /* Now ensure that the reqchars match. The comparison ignores REQ_VARY;
      a mismatch in either the character or the remaining flags discards the
      required character for the whole group. */

      if (((reqcharflags & ~REQ_VARY) != (branchreqcharflags & ~REQ_VARY)) ||
          reqchar != branchreqchar)
        reqcharflags = REQ_NONE;
      else
        {
        reqchar = branchreqchar;
        reqcharflags |= branchreqcharflags; /* To "or" REQ_VARY */
        }
      }

    /* If lookbehind, check that this branch matches a fixed-length string, and
    put the length into the OP_REVERSE item. Temporarily mark the end of the
    branch with OP_END. If the branch contains OP_RECURSE, the result is -3
    because there may be forward references that we can't check here. Set a
    flag to cause another lookbehind check at the end. Why not do it all at the
    end? Because common, erroneous checks are picked up here and the offset of
    the problem can be shown. */

    if (lookbehind)
      {
      int fixed_length;
      *code = OP_END;
      fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
        FALSE, cd, NULL);
      DPRINTF(("fixed length = %d\n", fixed_length));
      if (fixed_length == -3)
        {
        cd->check_lookbehind = TRUE;
        }
      else if (fixed_length < 0)
        {
        /* -2 maps to ERR36, -4 to ERR70, any other negative value to ERR25 */
        *errorcodeptr = (fixed_length == -2)? ERR36 :
                        (fixed_length == -4)? ERR70: ERR25;
        *ptrptr = ptr;
        return FALSE;
        }
      else
        {
        if (fixed_length > cd->max_lookbehind)
          cd->max_lookbehind = fixed_length;
        PUT(reverse_count, 0, fixed_length);
        }
      }
    }

  /* Reached end of expression, either ')' or end of pattern. In the real
  compile phase, go back through the alternative branches and reverse the chain
  of offsets, with the field in the BRA item now becoming an offset to the
  first alternative. If there are no alternatives, it points to the end of the
  group. The length in the terminating ket is always the length of the whole
  bracketed item. Return leaving the pointer at the terminating char. */

  if (*ptr != CHAR_VERTICAL_LINE)
    {
    if (lengthptr == NULL)
      {
      /* While the group was open each ALT's link held the distance back to
      the previous branch (and the BRA held zero). Walk backwards, replacing
      each link with the forward length of its branch; the zero stored in the
      opening bracket terminates the loop. */

      int branch_length = (int)(code - last_branch);
      do
        {
        int prev_length = GET(last_branch, 1);
        PUT(last_branch, 1, branch_length);
        branch_length = prev_length;
        last_branch -= branch_length;
        }
      while (branch_length > 0);
      }

    /* Fill in the ket */

    *code = OP_KET;
    PUT(code, 1, (int)(code - start_bracket));
    code += 1 + LINK_SIZE;

    /* If it was a capturing subpattern, check to see if it contained any
    recursive back references. If so, we must wrap it in atomic brackets.
    Because we are moving code along, we must ensure that any pending recursive
    references are updated. In any event, remove the block from the chain. */

    if (capnumber > 0)
      {
      if (cd->open_caps->flag)
        {
        /* Shift the whole group up by one bracket header, put OP_ONCE in the
        gap, and append a second OP_KET that closes the OP_ONCE. The extra
        BRA+KET pair is also added to the pre-compile length. */

        *code = OP_END;
        adjust_recurse(start_bracket, 1 + LINK_SIZE,
          (options & PCRE_UTF8) != 0, cd, save_hwm_offset);
        memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
          IN_UCHARS(code - start_bracket));
        *start_bracket = OP_ONCE;
        code += 1 + LINK_SIZE;
        PUT(start_bracket, 1, (int)(code - start_bracket));
        *code = OP_KET;
        PUT(code, 1, (int)(code - start_bracket));
        code += 1 + LINK_SIZE;
        length += 2 + 2*LINK_SIZE;
        }
      cd->open_caps = cd->open_caps->next;
      }

    /* Retain the highest bracket number, in case resetting was used. */

    cd->bracount = max_bracount;

    /* Set values to pass back */

    *codeptr = code;
    *ptrptr = ptr;
    *firstcharptr = firstchar;
    *firstcharflagsptr = firstcharflags;
    *reqcharptr = reqchar;
    *reqcharflagsptr = reqcharflags;
    if (lengthptr != NULL)
      {
      /* Guard the accumulated length against overflow before adding. */

      if (OFLOW_MAX - *lengthptr < length)
        {
        *errorcodeptr = ERR20;
        return FALSE;
        }
      *lengthptr += length;
      }
    return TRUE;
    }

  /* Another branch follows. In the pre-compile phase, we can move the code
  pointer back to where it was for the start of the first branch. (That is,
  pretend that each branch is the only one.)

  In the real compile phase, insert an ALT node. Its length field points back
  to the previous branch while the bracket remains open. At the end the chain
  is reversed. It's done like this so that the start of the bracket has a
  zero offset until it is closed, making it possible to detect recursion. */

  if (lengthptr != NULL)
    {
    code = *codeptr + 1 + LINK_SIZE + skipbytes;
    length += 1 + LINK_SIZE;
    }
  else
    {
    *code = OP_ALT;
    PUT(code, 1, (int)(code - last_branch));
    bc.current_branch = last_branch = code;
    code += 1 + LINK_SIZE;
    }

  /* Step over the '|' */

  ptr++;
  }
/* Control never reaches here */
}
8617
8618
8619
8620
8621/*************************************************
8622* Check for anchored expression *
8623*************************************************/
8624
8625/* Try to find out if this is an anchored regular expression. Consider each
8626alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8627all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8628it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8629be found, because ^ generates OP_CIRCM in that mode.
8630
8631We can also consider a regex to be anchored if OP_SOM starts all its branches.
8632This is the code for \G, which means "match at start of match position, taking
8633into account the match offset".
8634
8635A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8636because that will try the rest of the pattern at all possible matching points,
8637so there is no point trying again.... er ....
8638
8639.... except when the .* appears inside capturing parentheses, and there is a
8640subsequent back reference to those parentheses. We haven't enough information
8641to catch that case precisely.
8642
8643At first, the best we could do was to detect when .* was in capturing brackets
8644and the highest back reference was greater than or equal to that level.
8645However, by keeping a bitmap of the first 31 back references, we can catch some
8646of the more common cases more precisely.
8647
8648... A second exception is when the .* appears inside an atomic group, because
8649this prevents the number of characters it matches from being adjusted.
8650
8651Arguments:
8652 code points to start of expression (the bracket)
8653 bracket_map a bitmap of which brackets we are inside while testing; this
8654 handles up to substring 31; after that we just have to take
8655 the less precise approach
8656 cd points to the compile data block
8657 atomcount atomic group level
8658
8659Returns: TRUE or FALSE
8660*/
8661
8662static BOOL
8663is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
8664 compile_data *cd, int atomcount)
8665{
8666do {
8667 const pcre_uchar *scode = first_significant_code(
8668 code + PRIV(OP_lengths)[*code], FALSE);
8669 register int op = *scode;
8670
8671 /* Non-capturing brackets */
8672
8673 if (op == OP_BRA || op == OP_BRAPOS ||
8674 op == OP_SBRA || op == OP_SBRAPOS)
8675 {
8676 if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8677 }
8678
8679 /* Capturing brackets */
8680
8681 else if (op == OP_CBRA || op == OP_CBRAPOS ||
8682 op == OP_SCBRA || op == OP_SCBRAPOS)
8683 {
8684 int n = GET2(scode, 1+LINK_SIZE);
8685 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8686 if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE;
8687 }
8688
8689 /* Positive forward assertion */
8690
8691 else if (op == OP_ASSERT)
8692 {
8693 if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8694 }
8695
8696 /* Condition; not anchored if no second branch */
8697
8698 else if (op == OP_COND)
8699 {
8700 if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8701 if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8702 }
8703
8704 /* Atomic groups */
8705
8706 else if (op == OP_ONCE || op == OP_ONCE_NC)
8707 {
8708 if (!is_anchored(scode, bracket_map, cd, atomcount + 1))
8709 return FALSE;
8710 }
8711
8712 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8713 it isn't in brackets that are or may be referenced or inside an atomic
8714 group. */
8715
8716 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8717 op == OP_TYPEPOSSTAR))
8718 {
8719 if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 ||
8720 atomcount > 0 || cd->had_pruneorskip)
8721 return FALSE;
8722 }
8723
8724 /* Check for explicit anchoring */
8725
8726 else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8727
8728 code += GET(code, 1);
8729 }
8730while (*code == OP_ALT); /* Loop for each alternative */
8731return TRUE;
8732}
8733
8734
8735
8736/*************************************************
8737* Check for starting with ^ or .* *
8738*************************************************/
8739
8740/* This is called to find out if every branch starts with ^ or .* so that
8741"first char" processing can be done to speed things up in multiline
8742matching and for non-DOTALL patterns that start with .* (which must start at
8743the beginning or after \n). As in the case of is_anchored() (see above), we
8744have to take account of back references to capturing brackets that contain .*
8745because in that case we can't make the assumption. Also, the appearance of .*
8746inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8747or *SKIP does not count, because once again the assumption no longer holds.
8748
8749Arguments:
8750 code points to start of expression (the bracket)
8751 bracket_map a bitmap of which brackets we are inside while testing; this
8752 handles up to substring 31; after that we just have to take
8753 the less precise approach
8754 cd points to the compile data
8755 atomcount atomic group level
8756 inassert TRUE if in an assertion
8757
8758Returns: TRUE or FALSE
8759*/
8760
8761static BOOL
8762is_startline(const pcre_uchar *code, unsigned int bracket_map,
8763 compile_data *cd, int atomcount, BOOL inassert)
8764{
8765do {
8766 const pcre_uchar *scode = first_significant_code(
8767 code + PRIV(OP_lengths)[*code], FALSE);
8768 register int op = *scode;
8769
8770 /* If we are at the start of a conditional assertion group, *both* the
8771 conditional assertion *and* what follows the condition must satisfy the test
8772 for start of line. Other kinds of condition fail. Note that there may be an
8773 auto-callout at the start of a condition. */
8774
8775 if (op == OP_COND)
8776 {
8777 scode += 1 + LINK_SIZE;
8778 if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8779 switch (*scode)
8780 {
8781 case OP_CREF:
8782 case OP_DNCREF:
8783 case OP_RREF:
8784 case OP_DNRREF:
8785 case OP_DEF:
8786 case OP_FAIL:
8787 return FALSE;
8788
8789 default: /* Assertion */
8790 if (!is_startline(scode, bracket_map, cd, atomcount, TRUE)) return FALSE;
8791 do scode += GET(scode, 1); while (*scode == OP_ALT);
8792 scode += 1 + LINK_SIZE;
8793 break;
8794 }
8795 scode = first_significant_code(scode, FALSE);
8796 op = *scode;
8797 }
8798
8799 /* Non-capturing brackets */
8800
8801 if (op == OP_BRA || op == OP_BRAPOS ||
8802 op == OP_SBRA || op == OP_SBRAPOS)
8803 {
8804 if (!is_startline(scode, bracket_map, cd, atomcount, inassert)) return FALSE;
8805 }
8806
8807 /* Capturing brackets */
8808
8809 else if (op == OP_CBRA || op == OP_CBRAPOS ||
8810 op == OP_SCBRA || op == OP_SCBRAPOS)
8811 {
8812 int n = GET2(scode, 1+LINK_SIZE);
8813 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8814 if (!is_startline(scode, new_map, cd, atomcount, inassert)) return FALSE;
8815 }
8816
8817 /* Positive forward assertions */
8818
8819 else if (op == OP_ASSERT)
8820 {
8821 if (!is_startline(scode, bracket_map, cd, atomcount, TRUE)) return FALSE;
8822 }
8823
8824 /* Atomic brackets */
8825
8826 else if (op == OP_ONCE || op == OP_ONCE_NC)
8827 {
8828 if (!is_startline(scode, bracket_map, cd, atomcount + 1, inassert)) return FALSE;
8829 }
8830
8831 /* .* means "start at start or after \n" if it isn't in atomic brackets or
8832 brackets that may be referenced or an assertion, as long as the pattern does
8833 not contain *PRUNE or *SKIP, because these break the feature. Consider, for
8834 example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e.
8835 not at the start of a line. */
8836
8837 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8838 {
8839 if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
8840 atomcount > 0 || cd->had_pruneorskip || inassert)
8841 return FALSE;
8842 }
8843
8844 /* Check for explicit circumflex; anything else gives a FALSE result. Note
8845 in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC
8846 because the number of characters matched by .* cannot be adjusted inside
8847 them. */
8848
8849 else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8850
8851 /* Move on to the next alternative */
8852
8853 code += GET(code, 1);
8854 }
8855while (*code == OP_ALT); /* Loop for each alternative */
8856return TRUE;
8857}
8858
8859
8860
8861/*************************************************
8862* Check for asserted fixed first char *
8863*************************************************/
8864
8865/* During compilation, the "first char" settings from forward assertions are
8866discarded, because they can cause conflicts with actual literals that follow.
8867However, if we end up without a first char setting for an unanchored pattern,
8868it is worth scanning the regex to see if there is an initial asserted first
8869char. If all branches start with the same asserted char, or with a
8870non-conditional bracket all of whose alternatives start with the same asserted
8871char (recurse ad lib), then we return that char, with the flags set to zero or
8872REQ_CASELESS; otherwise return zero with REQ_NONE in the flags.
8873
8874Arguments:
8875 code points to start of expression (the bracket)
8876 flags points to the first char flags, or to REQ_NONE
8877 inassert TRUE if in an assertion
8878
8879Returns: the fixed first char, or 0 with REQ_NONE in flags
8880*/
8881
8882static pcre_uint32
8883find_firstassertedchar(const pcre_uchar *code, pcre_int32 *flags,
8884 BOOL inassert)
8885{
8886register pcre_uint32 c = 0;
8887int cflags = REQ_NONE;
8888
8889*flags = REQ_NONE;
8890do {
8891 pcre_uint32 d;
8892 int dflags;
8893 int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8894 *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8895 const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl,
8896 TRUE);
8897 register pcre_uchar op = *scode;
8898
8899 switch(op)
8900 {
8901 default:
8902 return 0;
8903
8904 case OP_BRA:
8905 case OP_BRAPOS:
8906 case OP_CBRA:
8907 case OP_SCBRA:
8908 case OP_CBRAPOS:
8909 case OP_SCBRAPOS:
8910 case OP_ASSERT:
8911 case OP_ONCE:
8912 case OP_ONCE_NC:
8913 d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);
8914 if (dflags < 0)
8915 return 0;
8916 if (cflags < 0) { c = d; cflags = dflags; } else if (c != d || cflags != dflags) return 0;
8917 break;
8918
8919 case OP_EXACT:
8920 scode += IMM2_SIZE;
8921 /* Fall through */
8922
8923 case OP_CHAR:
8924 case OP_PLUS:
8925 case OP_MINPLUS:
8926 case OP_POSPLUS:
8927 if (!inassert) return 0;
8928 if (cflags < 0) { c = scode[1]; cflags = 0; }
8929 else if (c != scode[1]) return 0;
8930 break;
8931
8932 case OP_EXACTI:
8933 scode += IMM2_SIZE;
8934 /* Fall through */
8935
8936 case OP_CHARI:
8937 case OP_PLUSI:
8938 case OP_MINPLUSI:
8939 case OP_POSPLUSI:
8940 if (!inassert) return 0;
8941 if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8942 else if (c != scode[1]) return 0;
8943 break;
8944 }
8945
8946 code += GET(code, 1);
8947 }
8948while (*code == OP_ALT);
8949
8950*flags = cflags;
8951return c;
8952}
8953
8954
8955
8956/*************************************************
8957* Add an entry to the name/number table *
8958*************************************************/
8959
8960/* This function is called between compiling passes to add an entry to the
8961name/number table, maintaining alphabetical order. Checking for permitted
8962and forbidden duplicates has already been done.
8963
8964Arguments:
8965 cd the compile data block
8966 name the name to add
8967 length the length of the name
8968 groupno the group number
8969
8970Returns: nothing
8971*/
8972
8973static void
8974add_name(compile_data *cd, const pcre_uchar *name, int length,
8975 unsigned int groupno)
8976{
8977int i;
8978pcre_uchar *slot = cd->name_table;
8979
8980for (i = 0; i < cd->names_found; i++)
8981 {
8982 int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(length));
8983 if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8984 crc = -1; /* Current name is a substring */
8985
8986 /* Make space in the table and break the loop for an earlier name. For a
8987 duplicate or later name, carry on. We do this for duplicates so that in the
8988 simple case (when ?(| is not used) they are in order of their numbers. In all
8989 cases they are in the order in which they appear in the pattern. */
8990
8991 if (crc < 0)
8992 {
8993 memmove(slot + cd->name_entry_size, slot,
8994 IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
8995 break;
8996 }
8997
8998 /* Continue the loop for a later or duplicate name */
8999
9000 slot += cd->name_entry_size;
9001 }
9002
9003PUT2(slot, 0, groupno);
9004memcpy(slot + IMM2_SIZE, name, IN_UCHARS(length));
9005slot[IMM2_SIZE + length] = 0;
9006cd->names_found++;
9007}
9008
9009
9010
9011/*************************************************
9012* Compile a Regular Expression *
9013*************************************************/
9014
/* Compile a pattern and return a pointer to a block of store holding the
compiled form. This is the original API, which has no way of returning a
numerical error code; it is retained for backwards compatibility and simply
calls the newer pcre[16|32]_compile2() function with a NULL error code pointer.

Arguments:
  pattern       the regular expression
  options       various option bits
  errorptr      pointer to pointer to error text
  erroroffset   ptr offset in pattern where error was detected
  tables        pointer to character tables or NULL

Returns:        pointer to compiled data block, or NULL on error,
                with errorptr and erroroffset set
*/

#if defined COMPILE_PCRE8

PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile(const char *pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#elif defined COMPILE_PCRE16

PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#elif defined COMPILE_PCRE32

PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
pcre32_compile(PCRE_SPTR32 pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre32_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#endif
9055
9056
9057#if defined COMPILE_PCRE8
9058PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
9059pcre_compile2(const char *pattern, int options, int *errorcodeptr,
9060 const char **errorptr, int *erroroffset, const unsigned char *tables)
9061#elif defined COMPILE_PCRE16
9062PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
9063pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
9064 const char **errorptr, int *erroroffset, const unsigned char *tables)
9065#elif defined COMPILE_PCRE32
9066PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
9067pcre32_compile2(PCRE_SPTR32 pattern, int options, int *errorcodeptr,
9068 const char **errorptr, int *erroroffset, const unsigned char *tables)
9069#endif
9070{
9071REAL_PCRE *re;
9072int length = 1; /* For final END opcode */
9073pcre_int32 firstcharflags, reqcharflags;
9074pcre_uint32 firstchar, reqchar;
9075pcre_uint32 limit_match = PCRE_UINT32_MAX;
9076pcre_uint32 limit_recursion = PCRE_UINT32_MAX;
9077int newline;
9078int errorcode = 0;
9079int skipatstart = 0;
9080BOOL utf;
9081BOOL never_utf = FALSE;
9082size_t size;
9083pcre_uchar *code;
9084const pcre_uchar *codestart;
9085const pcre_uchar *ptr;
9086compile_data compile_block;
9087compile_data *cd = &compile_block;
9088
9089/* This space is used for "compiling" into during the first phase, when we are
9090computing the amount of memory that is needed. Compiled items are thrown away
9091as soon as possible, so that a fairly large buffer should be sufficient for
9092this purpose. The same space is used in the second phase for remembering where
9093to fill in forward references to subpatterns. That may overflow, in which case
9094new memory is obtained from malloc(). */
9095
9096pcre_uchar cworkspace[COMPILE_WORK_SIZE];
9097
9098/* This vector is used for remembering name groups during the pre-compile. In a
9099similar way to cworkspace, it can be expanded using malloc() if necessary. */
9100
9101named_group named_groups[NAMED_GROUP_LIST_SIZE];
9102
9103/* Set this early so that early errors get offset 0. */
9104
9105ptr = (const pcre_uchar *)pattern;
9106
9107/* We can't pass back an error message if errorptr is NULL; I guess the best we
9108can do is just return NULL, but we can set a code value if there is a code
9109pointer. */
9110
9111if (errorptr == NULL)
9112 {
9113 if (errorcodeptr != NULL) *errorcodeptr = 99;
9114 return NULL;
9115 }
9116
9117*errorptr = NULL;
9118if (errorcodeptr != NULL) *errorcodeptr = ERR0;
9119
9120/* However, we can give a message for this error */
9121
9122if (erroroffset == NULL)
9123 {
9124 errorcode = ERR16;
9125 goto PCRE_EARLY_ERROR_RETURN2;
9126 }
9127
9128*erroroffset = 0;
9129
9130/* Set up pointers to the individual character tables */
9131
9132if (tables == NULL) tables = PRIV(default_tables);
9133cd->lcc = tables + lcc_offset;
9134cd->fcc = tables + fcc_offset;
9135cd->cbits = tables + cbits_offset;
9136cd->ctypes = tables + ctypes_offset;
9137
9138/* Check that all undefined public option bits are zero */
9139
9140if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
9141 {
9142 errorcode = ERR17;
9143 goto PCRE_EARLY_ERROR_RETURN;
9144 }
9145
9146/* If PCRE_NEVER_UTF is set, remember it. */
9147
9148if ((options & PCRE_NEVER_UTF) != 0) never_utf = TRUE;
9149
9150/* Check for global one-time settings at the start of the pattern, and remember
9151the offset for later. */
9152
9153cd->external_flags = 0; /* Initialize here for LIMIT_MATCH/RECURSION */
9154
9155while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9156 ptr[skipatstart+1] == CHAR_ASTERISK)
9157 {
9158 int newnl = 0;
9159 int newbsr = 0;
9160
9161/* For completeness and backward compatibility, (*UTFn) is supported in the
9162relevant libraries, but (*UTF) is generic and always supported. Note that
9163PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. */
9164
9165#ifdef COMPILE_PCRE8
9166 if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF8_RIGHTPAR, 5) == 0)
9167 { skipatstart += 7; options |= PCRE_UTF8; continue; }
9168#endif
9169#ifdef COMPILE_PCRE16
9170 if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF16_RIGHTPAR, 6) == 0)
9171 { skipatstart += 8; options |= PCRE_UTF16; continue; }
9172#endif
9173#ifdef COMPILE_PCRE32
9174 if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF32_RIGHTPAR, 6) == 0)
9175 { skipatstart += 8; options |= PCRE_UTF32; continue; }
9176#endif
9177
9178 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 4) == 0)
9179 { skipatstart += 6; options |= PCRE_UTF8; continue; }
9180 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
9181 { skipatstart += 6; options |= PCRE_UCP; continue; }
9182 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_AUTO_POSSESS_RIGHTPAR, 16) == 0)
9183 { skipatstart += 18; options |= PCRE_NO_AUTO_POSSESS; continue; }
9184 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
9185 { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
9186
9187 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_MATCH_EQ, 12) == 0)
9188 {
9189 pcre_uint32 c = 0;
9190 int p = skipatstart + 14;
9191 while (isdigit(ptr[p]))
9192 {
9193 if (c > PCRE_UINT32_MAX / 10 - 1) break; /* Integer overflow */
9194 c = c*10 + ptr[p++] - CHAR_0;
9195 }
9196 if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9197 if (c < limit_match)
9198 {
9199 limit_match = c;
9200 cd->external_flags |= PCRE_MLSET;
9201 }
9202 skipatstart = p;
9203 continue;
9204 }
9205
9206 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_RECURSION_EQ, 16) == 0)
9207 {
9208 pcre_uint32 c = 0;
9209 int p = skipatstart + 18;
9210 while (isdigit(ptr[p]))
9211 {
9212 if (c > PCRE_UINT32_MAX / 10 - 1) break; /* Integer overflow check */
9213 c = c*10 + ptr[p++] - CHAR_0;
9214 }
9215 if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9216 if (c < limit_recursion)
9217 {
9218 limit_recursion = c;
9219 cd->external_flags |= PCRE_RLSET;
9220 }
9221 skipatstart = p;
9222 continue;
9223 }
9224
9225 if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0)
9226 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
9227 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3) == 0)
9228 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
9229 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5) == 0)
9230 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
9231 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0)
9232 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
9233 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0)
9234 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
9235
9236 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
9237 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
9238 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
9239 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
9240
9241 if (newnl != 0)
9242 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
9243 else if (newbsr != 0)
9244 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
9245 else break;
9246 }
9247
9248/* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
9249utf = (options & PCRE_UTF8) != 0;
9250if (utf && never_utf)
9251 {
9252 errorcode = ERR78;
9253 goto PCRE_EARLY_ERROR_RETURN2;
9254 }
9255
9256/* Can't support UTF unless PCRE has been compiled to include the code. The
9257return of an error code from PRIV(valid_utf)() is a new feature, introduced in
9258release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
9259not used here. */
9260
9261#ifdef SUPPORT_UTF
9262if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
9263 (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
9264 {
9265#if defined COMPILE_PCRE8
9266 errorcode = ERR44;
9267#elif defined COMPILE_PCRE16
9268 errorcode = ERR74;
9269#elif defined COMPILE_PCRE32
9270 errorcode = ERR77;
9271#endif
9272 goto PCRE_EARLY_ERROR_RETURN2;
9273 }
9274#else
9275if (utf)
9276 {
9277 errorcode = ERR32;
9278 goto PCRE_EARLY_ERROR_RETURN;
9279 }
9280#endif
9281
9282/* Can't support UCP unless PCRE has been compiled to include the code. */
9283
9284#ifndef SUPPORT_UCP
9285if ((options & PCRE_UCP) != 0)
9286 {
9287 errorcode = ERR67;
9288 goto PCRE_EARLY_ERROR_RETURN;
9289 }
9290#endif
9291
9292/* Check validity of \R options. */
9293
9294if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
9295 (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
9296 {
9297 errorcode = ERR56;
9298 goto PCRE_EARLY_ERROR_RETURN;
9299 }
9300
9301/* Handle different types of newline. The three bits give seven cases. The
9302current code allows for fixed one- or two-byte sequences, plus "any" and
9303"anycrlf". */
9304
9305switch (options & PCRE_NEWLINE_BITS)
9306 {
9307 case 0: newline = NEWLINE; break; /* Build-time default */
9308 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
9309 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
9310 case PCRE_NEWLINE_CR+
9311 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
9312 case PCRE_NEWLINE_ANY: newline = -1; break;
9313 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
9314 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
9315 }
9316
9317if (newline == -2)
9318 {
9319 cd->nltype = NLTYPE_ANYCRLF;
9320 }
9321else if (newline < 0)
9322 {
9323 cd->nltype = NLTYPE_ANY;
9324 }
9325else
9326 {
9327 cd->nltype = NLTYPE_FIXED;
9328 if (newline > 255)
9329 {
9330 cd->nllen = 2;
9331 cd->nl[0] = (newline >> 8) & 255;
9332 cd->nl[1] = newline & 255;
9333 }
9334 else
9335 {
9336 cd->nllen = 1;
9337 cd->nl[0] = newline;
9338 }
9339 }
9340
9341/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9342references to help in deciding whether (.*) can be treated as anchored or not.
9343*/
9344
9345cd->top_backref = 0;
9346cd->backref_map = 0;
9347
9348/* Reflect pattern for debugging output */
9349
9350DPRINTF(("------------------------------------------------------------------\n"));
9351#ifdef PCRE_DEBUG
9352print_puchar(stdout, (PCRE_PUCHAR)pattern);
9353#endif
9354DPRINTF(("\n"));
9355
9356/* Pretend to compile the pattern while actually just accumulating the length
9357of memory required. This behaviour is triggered by passing a non-NULL final
9358argument to compile_regex(). We pass a block of workspace (cworkspace) for it
9359to compile parts of the pattern into; the compiled code is discarded when it is
9360no longer needed, so hopefully this workspace will never overflow, though there
9361is a test for its doing so. */
9362
9363cd->bracount = cd->final_bracount = 0;
9364cd->names_found = 0;
9365cd->name_entry_size = 0;
9366cd->name_table = NULL;
9367cd->dupnames = FALSE;
9368cd->dupgroups = FALSE;
9369cd->namedrefcount = 0;
9370cd->start_code = cworkspace;
9371cd->hwm = cworkspace;
9372cd->iscondassert = FALSE;
9373cd->start_workspace = cworkspace;
9374cd->workspace_size = COMPILE_WORK_SIZE;
9375cd->named_groups = named_groups;
9376cd->named_group_list_size = NAMED_GROUP_LIST_SIZE;
9377cd->start_pattern = (const pcre_uchar *)pattern;
9378cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
9379cd->req_varyopt = 0;
9380cd->parens_depth = 0;
9381cd->assert_depth = 0;
9382cd->max_lookbehind = 0;
9383cd->external_options = options;
9384cd->open_caps = NULL;
9385
9386/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
9387don't need to look at the result of the function here. The initial options have
9388been put into the cd block so that they can be changed if an option setting is
9389found within the regex right at the beginning. Bringing initial option settings
9390outside can help speed up starting point checks. */
9391
9392ptr += skipatstart;
9393code = cworkspace;
9394*code = OP_BRA;
9395
9396(void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
9397 FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,
9398 cd, &length);
9399if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
9400
9401DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
9402 (int)(cd->hwm - cworkspace)));
9403
9404if (length > MAX_PATTERN_SIZE)
9405 {
9406 errorcode = ERR20;
9407 goto PCRE_EARLY_ERROR_RETURN;
9408 }
9409
9410/* Compute the size of the data block for storing the compiled pattern. Integer
9411overflow should no longer be possible because nowadays we limit the maximum
9412value of cd->names_found and cd->name_entry_size. */
9413
9414size = sizeof(REAL_PCRE) +
9415 (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
9416
9417/* Get the memory. */
9418
9419re = (REAL_PCRE *)(PUBL(malloc))(size);
9420if (re == NULL)
9421 {
9422 errorcode = ERR21;
9423 goto PCRE_EARLY_ERROR_RETURN;
9424 }
9425
9426/* Put in the magic number, and save the sizes, initial options, internal
9427flags, and character table pointer. NULL is used for the default character
9428tables. The nullpad field is at the end; it's there to help in the case when a
9429regex compiled on a system with 4-byte pointers is run on another with 8-byte
9430pointers. */
9431
9432re->magic_number = MAGIC_NUMBER;
9433re->size = (int)size;
9434re->options = cd->external_options;
9435re->flags = cd->external_flags;
9436re->limit_match = limit_match;
9437re->limit_recursion = limit_recursion;
9438re->first_char = 0;
9439re->req_char = 0;
9440re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
9441re->name_entry_size = cd->name_entry_size;
9442re->name_count = cd->names_found;
9443re->ref_count = 0;
9444re->tables = (tables == PRIV(default_tables))? NULL : tables;
9445re->nullpad = NULL;
9446#ifdef COMPILE_PCRE32
9447re->dummy = 0;
9448#else
9449re->dummy1 = re->dummy2 = re->dummy3 = 0;
9450#endif
9451
9452/* The starting points of the name/number translation table and of the code are
9453passed around in the compile data block. The start/end pattern and initial
9454options are already set from the pre-compile phase, as is the name_entry_size
9455field. Reset the bracket count and the names_found field. Also reset the hwm
9456field; this time it's used for remembering forward references to subpatterns.
9457*/
9458
9459cd->final_bracount = cd->bracount; /* Save for checking forward references */
9460cd->parens_depth = 0;
9461cd->assert_depth = 0;
9462cd->bracount = 0;
9463cd->max_lookbehind = 0;
9464cd->name_table = (pcre_uchar *)re + re->name_table_offset;
9465codestart = cd->name_table + re->name_entry_size * re->name_count;
9466cd->start_code = codestart;
9467cd->hwm = (pcre_uchar *)(cd->start_workspace);
9468cd->iscondassert = FALSE;
9469cd->req_varyopt = 0;
9470cd->had_accept = FALSE;
9471cd->had_pruneorskip = FALSE;
9472cd->check_lookbehind = FALSE;
9473cd->open_caps = NULL;
9474
9475/* If any named groups were found, create the name/number table from the list
9476created in the first pass. */
9477
9478if (cd->names_found > 0)
9479 {
9480 int i = cd->names_found;
9481 named_group *ng = cd->named_groups;
9482 cd->names_found = 0;
9483 for (; i > 0; i--, ng++)
9484 add_name(cd, ng->name, ng->length, ng->number);
9485 if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
9486 (PUBL(free))((void *)cd->named_groups);
9487 }
9488
9489/* Set up a starting, non-extracting bracket, then compile the expression. On
9490error, errorcode will be set non-zero, so we don't need to look at the result
9491of the function here. */
9492
9493ptr = (const pcre_uchar *)pattern + skipatstart;
9494code = (pcre_uchar *)codestart;
9495*code = OP_BRA;
9496(void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
9497 &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL, cd, NULL);
9498re->top_bracket = cd->bracount;
9499re->top_backref = cd->top_backref;
9500re->max_lookbehind = cd->max_lookbehind;
9501re->flags = cd->external_flags | PCRE_MODE;
9502
9503if (cd->had_accept)
9504 {
9505 reqchar = 0; /* Must disable after (*ACCEPT) */
9506 reqcharflags = REQ_NONE;
9507 }
9508
9509/* If not reached end of pattern on success, there's an excess bracket. */
9510
9511if (errorcode == 0 && *ptr != CHAR_NULL) errorcode = ERR22;
9512
9513/* Fill in the terminating state and check for disastrous overflow, but
9514if debugging, leave the test till after things are printed out. */
9515
9516*code++ = OP_END;
9517
9518#ifndef PCRE_DEBUG
9519if (code - codestart > length) errorcode = ERR23;
9520#endif
9521
9522#ifdef SUPPORT_VALGRIND
9523/* If the estimated length exceeds the really used length, mark the extra
9524allocated memory as unaddressable, so that any out-of-bound reads can be
9525detected. */
9526VALGRIND_MAKE_MEM_NOACCESS(code, (length - (code - codestart)) * sizeof(pcre_uchar));
9527#endif
9528
9529/* Fill in any forward references that are required. There may be repeated
9530references; optimize for them, as searching a large regex takes time. */
9531
9532if (cd->hwm > cd->start_workspace)
9533 {
9534 int prev_recno = -1;
9535 const pcre_uchar *groupptr = NULL;
9536 while (errorcode == 0 && cd->hwm > cd->start_workspace)
9537 {
9538 int offset, recno;
9539 cd->hwm -= LINK_SIZE;
9540 offset = GET(cd->hwm, 0);
9541
9542 /* Check that the hwm handling hasn't gone wrong. This whole area is
9543 rewritten in PCRE2 because there are some obscure cases. */
9544
9545 if (offset == 0 || codestart[offset-1] != OP_RECURSE)
9546 {
9547 errorcode = ERR10;
9548 break;
9549 }
9550
9551 recno = GET(codestart, offset);
9552 if (recno != prev_recno)
9553 {
9554 groupptr = PRIV(find_bracket)(codestart, utf, recno);
9555 prev_recno = recno;
9556 }
9557 if (groupptr == NULL) errorcode = ERR53;
9558 else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
9559 }
9560 }
9561
9562/* If the workspace had to be expanded, free the new memory. Set the pointer to
9563NULL to indicate that forward references have been filled in. */
9564
9565if (cd->workspace_size > COMPILE_WORK_SIZE)
9566 (PUBL(free))((void *)cd->start_workspace);
9567cd->start_workspace = NULL;
9568
9569/* Give an error if there's back reference to a non-existent capturing
9570subpattern. */
9571
9572if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
9573
9574/* Unless disabled, check whether any single character iterators can be
9575auto-possessified. The function overwrites the appropriate opcode values, so
9576the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
9577used in this code because at least one compiler gives a warning about loss of
9578"const" attribute if the cast (pcre_uchar *)codestart is used directly in the
9579function call. */
9580
9581if (errorcode == 0 && (options & PCRE_NO_AUTO_POSSESS) == 0)
9582 {
9583 pcre_uchar *temp = (pcre_uchar *)codestart;
9584 auto_possessify(temp, utf, cd);
9585 }
9586
9587/* If there were any lookbehind assertions that contained OP_RECURSE
9588(recursions or subroutine calls), a flag is set for them to be checked here,
9589because they may contain forward references. Actual recursions cannot be fixed
9590length, but subroutine calls can. It is done like this so that those without
9591OP_RECURSE that are not fixed length get a diagnosic with a useful offset. The
9592exceptional ones forgo this. We scan the pattern to check that they are fixed
9593length, and set their lengths. */
9594
9595if (errorcode == 0 && cd->check_lookbehind)
9596 {
9597 pcre_uchar *cc = (pcre_uchar *)codestart;
9598
9599 /* Loop, searching for OP_REVERSE items, and process those that do not have
9600 their length set. (Actually, it will also re-process any that have a length
9601 of zero, but that is a pathological case, and it does no harm.) When we find
9602 one, we temporarily terminate the branch it is in while we scan it. */
9603
9604 for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
9605 cc != NULL;
9606 cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
9607 {
9608 if (GET(cc, 1) == 0)
9609 {
9610 int fixed_length;
9611 pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
9612 int end_op = *be;
9613 *be = OP_END;
9614 fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
9615 cd, NULL);
9616 *be = end_op;
9617 DPRINTF(("fixed length = %d\n", fixed_length));
9618 if (fixed_length < 0)
9619 {
9620 errorcode = (fixed_length == -2)? ERR36 :
9621 (fixed_length == -4)? ERR70 : ERR25;
9622 break;
9623 }
9624 if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
9625 PUT(cc, 1, fixed_length);
9626 }
9627 cc += 1 + LINK_SIZE;
9628 }
9629 }
9630
9631/* Failed to compile, or error while post-processing */
9632
9633if (errorcode != 0)
9634 {
9635 (PUBL(free))(re);
9636 PCRE_EARLY_ERROR_RETURN:
9637 *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
9638 PCRE_EARLY_ERROR_RETURN2:
9639 *errorptr = find_error_text(errorcode);
9640 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
9641 return NULL;
9642 }
9643
9644/* If the anchored option was not passed, set the flag if we can determine that
9645the pattern is anchored by virtue of ^ characters or \A or anything else, such
9646as starting with non-atomic .* when DOTALL is set and there are no occurrences
9647of *PRUNE or *SKIP.
9648
9649Otherwise, if we know what the first byte has to be, save it, because that
9650speeds up unanchored matches no end. If not, see if we can set the
9651PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
9652start with ^. and also when all branches start with non-atomic .* for
9653non-DOTALL matches when *PRUNE and SKIP are not present. */
9654
9655if ((re->options & PCRE_ANCHORED) == 0)
9656 {
9657 if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED;
9658 else
9659 {
9660 if (firstcharflags < 0)
9661 firstchar = find_firstassertedchar(codestart, &firstcharflags, FALSE);
9662 if (firstcharflags >= 0) /* Remove caseless flag for non-caseable chars */
9663 {
9664#if defined COMPILE_PCRE8
9665 re->first_char = firstchar & 0xff;
9666#elif defined COMPILE_PCRE16
9667 re->first_char = firstchar & 0xffff;
9668#elif defined COMPILE_PCRE32
9669 re->first_char = firstchar;
9670#endif
9671 if ((firstcharflags & REQ_CASELESS) != 0)
9672 {
9673#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9674 /* We ignore non-ASCII first chars in 8 bit mode. */
9675 if (utf)
9676 {
9677 if (re->first_char < 128)
9678 {
9679 if (cd->fcc[re->first_char] != re->first_char)
9680 re->flags |= PCRE_FCH_CASELESS;
9681 }
9682 else if (UCD_OTHERCASE(re->first_char) != re->first_char)
9683 re->flags |= PCRE_FCH_CASELESS;
9684 }
9685 else
9686#endif
9687 if (MAX_255(re->first_char)
9688 && cd->fcc[re->first_char] != re->first_char)
9689 re->flags |= PCRE_FCH_CASELESS;
9690 }
9691
9692 re->flags |= PCRE_FIRSTSET;
9693 }
9694
9695 else if (is_startline(codestart, 0, cd, 0, FALSE)) re->flags |= PCRE_STARTLINE;
9696 }
9697 }
9698
9699/* For an anchored pattern, we use the "required byte" only if it follows a
9700variable length item in the regex. Remove the caseless flag for non-caseable
9701bytes. */
9702
9703if (reqcharflags >= 0 &&
9704 ((re->options & PCRE_ANCHORED) == 0 || (reqcharflags & REQ_VARY) != 0))
9705 {
9706#if defined COMPILE_PCRE8
9707 re->req_char = reqchar & 0xff;
9708#elif defined COMPILE_PCRE16
9709 re->req_char = reqchar & 0xffff;
9710#elif defined COMPILE_PCRE32
9711 re->req_char = reqchar;
9712#endif
9713 if ((reqcharflags & REQ_CASELESS) != 0)
9714 {
9715#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9716 /* We ignore non-ASCII first chars in 8 bit mode. */
9717 if (utf)
9718 {
9719 if (re->req_char < 128)
9720 {
9721 if (cd->fcc[re->req_char] != re->req_char)
9722 re->flags |= PCRE_RCH_CASELESS;
9723 }
9724 else if (UCD_OTHERCASE(re->req_char) != re->req_char)
9725 re->flags |= PCRE_RCH_CASELESS;
9726 }
9727 else
9728#endif
9729 if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
9730 re->flags |= PCRE_RCH_CASELESS;
9731 }
9732
9733 re->flags |= PCRE_REQCHSET;
9734 }
9735
9736/* Print out the compiled data if debugging is enabled. This is never the
9737case when building a production library. */
9738
9739#ifdef PCRE_DEBUG
9740printf("Length = %d top_bracket = %d top_backref = %d\n",
9741 length, re->top_bracket, re->top_backref);
9742
9743printf("Options=%08x\n", re->options);
9744
9745if ((re->flags & PCRE_FIRSTSET) != 0)
9746 {
9747 pcre_uchar ch = re->first_char;
9748 const char *caseless =
9749 ((re->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)";
9750 if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless);
9751 else printf("First char = \\x%02x%s\n", ch, caseless);
9752 }
9753
9754if ((re->flags & PCRE_REQCHSET) != 0)
9755 {
9756 pcre_uchar ch = re->req_char;
9757 const char *caseless =
9758 ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)";
9759 if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless);
9760 else printf("Req char = \\x%02x%s\n", ch, caseless);
9761 }
9762
9763#if defined COMPILE_PCRE8
9764pcre_printint((pcre *)re, stdout, TRUE);
9765#elif defined COMPILE_PCRE16
9766pcre16_printint((pcre *)re, stdout, TRUE);
9767#elif defined COMPILE_PCRE32
9768pcre32_printint((pcre *)re, stdout, TRUE);
9769#endif
9770
9771/* This check is done here in the debugging case so that the code that
9772was compiled can be seen. */
9773
9774if (code - codestart > length)
9775 {
9776 (PUBL(free))(re);
9777 *errorptr = find_error_text(ERR23);
9778 *erroroffset = ptr - (pcre_uchar *)pattern;
9779 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
9780 return NULL;
9781 }
9782#endif /* PCRE_DEBUG */
9783
9784/* Check for a pattern than can match an empty string, so that this information
9785can be provided to applications. */
9786
9787do
9788 {
9789 if (could_be_empty_branch(codestart, code, utf, cd, NULL))
9790 {
9791 re->flags |= PCRE_MATCH_EMPTY;
9792 break;
9793 }
9794 codestart += GET(codestart, 1);
9795 }
9796while (*codestart == OP_ALT);
9797
9798#if defined COMPILE_PCRE8
9799return (pcre *)re;
9800#elif defined COMPILE_PCRE16
9801return (pcre16 *)re;
9802#elif defined COMPILE_PCRE32
9803return (pcre32 *)re;
9804#endif
9805}
9806
9807/* End of pcre_compile.c */
9808