1/*************************************************
2* Perl-Compatible Regular Expressions *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2016 University of Cambridge
10
11-----------------------------------------------------------------------------
12Redistribution and use in source and binary forms, with or without
13modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36POSSIBILITY OF SUCH DAMAGE.
37-----------------------------------------------------------------------------
38*/
39
40#pragma warning( disable : 4018) // '<' : signed/unsigned mismatch
41#pragma warning( disable : 4127) // conditional expression is constant
42#pragma warning( disable : 4244) // conversion from 'int' to 'unsigned short', possible loss of data
43#pragma warning( disable : 4701) // local variable 'othercase' may be used without having been initialized
44#pragma warning( disable : 4702) // unreachable code
45
46/* This module contains the external function pcre_compile(), along with
47supporting internal functions that are not used by other modules. */
48
49#include "pcre_config.h"
50
51#define NLBLOCK cd /* Block containing newline information */
52#define PSSTART start_pattern /* Field containing pattern start */
53#define PSEND end_pattern /* Field containing pattern end */
54
55#include "pcre_internal.h"
56
57
58/* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
59is also used by pcretest. PCRE_DEBUG is not defined when building a production
60library. We do not need to select pcre16_printint.c specially, because the
61COMPILE_PCREx macro will already be appropriately set. */
62
63#ifdef PCRE_DEBUG
64/* pcre_printint.c should not include any headers */
65#define PCRE_INCLUDED
66#include "pcre_printint.c"
67#undef PCRE_INCLUDED
68#endif
69
70
/* Macro for setting individual bits in class bitmaps. It sets bit number b in
the byte-array bitmap a: byte (b)/8, bit (b)&7 within that byte. The map
argument and the whole expansion are parenthesized so that the macro works
with an expression argument such as "map + offset" and can be used inside a
larger expression. Note that b is evaluated twice, so it must not have side
effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)&7)))
74
/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. We make it a bit less than
INT_MAX to allow for adding in group terminating bytes, so that we don't have
to check them every time. The headroom left below INT_MAX is 20. */

#define OFLOW_MAX (INT_MAX - 20)
81
82/* Definitions to allow mutual recursion */
83
84static int
85 add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
86 const pcre_uint32 *, unsigned int);
87
88static BOOL
89 compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
90 pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
91 compile_data *, int *);
92
93
94
95/*************************************************
96* Code parameters and static tables *
97*************************************************/
98
99/* This value specifies the size of stack workspace that is used during the
100first pre-compile phase that determines how much memory is required. The regex
101is partly compiled into this space, but the compiled parts are discarded as
102soon as they can be, so that hopefully there will never be an overrun. The code
103does, however, check for an overrun. The largest amount I've seen used is 218,
104so this number is very generous.
105
106The same workspace is used during the second, actual compile phase for
107remembering forward references to groups so that they can be filled in at the
108end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
109is 4 there is plenty of room for most patterns. However, the memory can get
110filled up by repetitions of forward references, for example patterns like
111/(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
112that the workspace is expanded using malloc() in this situation. The value
113below is therefore a minimum, and we put a maximum on it for safety. The
114minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
115kicks in at the same number of forward references in all cases. */
116
/* Minimum workspace size, expressed in terms of LINK_SIZE, and the upper
limit (100 times the minimum) beyond which the workspace is not expanded. */

#define COMPILE_WORK_SIZE     (2048 * LINK_SIZE)
#define COMPILE_WORK_SIZE_MAX (100 * COMPILE_WORK_SIZE)

/* Number of slots in the initial, stack-allocated vector that remembers named
groups during the pre-compile. When this is not enough the vector is expanded
using malloc(), in the same way as the workspace. */

#define NAMED_GROUP_LIST_SIZE 20

/* Overrun tests compare against a size reduced by this margin, so that an
overrun is detected before the code actually runs off the end of the block. */

#define WORK_SIZE_SAFETY_MARGIN (100)
131
/* Private flags added to firstchar and reqchar. */

#define REQ_CASELESS (1 << 0) /* Indicates caselessness */
#define REQ_VARY (1 << 1) /* Reqchar followed non-literal item */
/* Negative values for the firstchar and reqchar flags */
#define REQ_UNSET (-2)
#define REQ_NONE (-1)

/* Repeated character flags. The constant has type long; the suffix is written
as an upper case L because a lower case l is easily misread as the digit 1. */

#define UTF_LENGTH 0x10000000L /* The char contains its length. */
143
144/* Table for handling escaped characters in the range '0'-'z'. Positive returns
145are simple data values; negative values are for special things like \d and so
146on. Zero means further processing is needed (for things like \x), or the escape
147is invalid. */
148
149#ifndef EBCDIC
150
151/* This is the "normal" table for ASCII systems or for EBCDIC systems running
152in UTF-8 mode. */
153
/* Escape table for the non-EBCDIC build. The entries run in ASCII order from
'0' to 'z', two per row; the comment at the end of each row names the two
characters that the row covers. */
static const short int escapes[] = {
     0,                       0,                          /* '0' '1' */
     0,                       0,                          /* '2' '3' */
     0,                       0,                          /* '4' '5' */
     0,                       0,                          /* '6' '7' */
     0,                       0,                          /* '8' '9' */
     CHAR_COLON,              CHAR_SEMICOLON,             /* ':' ';' */
     CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,           /* '<' '=' */
     CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,         /* '>' '?' */
     CHAR_COMMERCIAL_AT,      -ESC_A,                     /* '@' 'A' */
     -ESC_B,                  -ESC_C,                     /* 'B' 'C' */
     -ESC_D,                  -ESC_E,                     /* 'D' 'E' */
     0,                       -ESC_G,                     /* 'F' 'G' */
     -ESC_H,                  0,                          /* 'H' 'I' */
     0,                       -ESC_K,                     /* 'J' 'K' */
     0,                       0,                          /* 'L' 'M' */
     -ESC_N,                  0,                          /* 'N' 'O' */
     -ESC_P,                  -ESC_Q,                     /* 'P' 'Q' */
     -ESC_R,                  -ESC_S,                     /* 'R' 'S' */
     0,                       0,                          /* 'T' 'U' */
     -ESC_V,                  -ESC_W,                     /* 'V' 'W' */
     -ESC_X,                  0,                          /* 'X' 'Y' */
     -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,   /* 'Z' '[' */
     CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,  /* backslash ']' */
     CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,            /* '^' '_' */
     CHAR_GRAVE_ACCENT,       ESC_a,                      /* '`' 'a' */
     -ESC_b,                  0,                          /* 'b' 'c' */
     -ESC_d,                  ESC_e,                      /* 'd' 'e' */
     ESC_f,                   0,                          /* 'f' 'g' */
     -ESC_h,                  0,                          /* 'h' 'i' */
     0,                       -ESC_k,                     /* 'j' 'k' */
     0,                       0,                          /* 'l' 'm' */
     ESC_n,                   0,                          /* 'n' 'o' */
     -ESC_p,                  0,                          /* 'p' 'q' */
     ESC_r,                   -ESC_s,                     /* 'r' 's' */
     ESC_tee,                 0,                          /* 't' 'u' */
     -ESC_v,                  -ESC_w,                     /* 'v' 'w' */
     0,                       0,                          /* 'x' 'y' */
     -ESC_z                                               /* 'z' */
};
194
195#else
196
197/* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
198
/* Escape table for the EBCDIC build, eight entries per row. As in the ASCII
table, positive entries are plain data values, negative entries are special
escapes, and zero means further processing or an invalid escape.
NOTE(review): the row labels look like hexadecimal EBCDIC code points, with
the first row starting at 0x48 - confirm against the code that indexes this
table. */
static const short int escapes[] = {
/*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
/*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
/*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
/*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
/*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
/*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
/*  80 */     0, ESC_a, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
/*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
/*  90 */     0,     0, -ESC_k,       0,      0, ESC_n,      0, -ESC_p,
/*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
/*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
/*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
/*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
/*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
/*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
/*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
/*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
/*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
/*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
/*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
};
224
/* We also need a table of characters that may follow \c in an EBCDIC
environment for characters 0-31. The string holds 32 characters ('@', 'A' to
'Z', then '[', backslash, ']', '^' and '_') followed by the terminating NUL.
It is a read-only lookup table, so it is declared const. */

static const unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
229
230#endif
231
232
233/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
234searched linearly. Put all the names into a single string, in order to reduce
235the number of relocations when a shared library is dynamically linked. The
236string is built from string macros so that it works in UTF-8 mode on EBCDIC
237platforms. */
238
/* One entry of the verb table. */
struct verbitem {
  int len;       /* Number of characters in the verb name */
  int op;        /* Opcode used without an argument; -1 when one is required */
  int op_arg;    /* Opcode used with an argument; -1 when none is permitted */
};

typedef struct verbitem verbitem;
244
/* All verb names in one string. The names appear in the same order as the
entries of verbs[], whose len fields (0, 4, 6, 6, 1, 4, 5, 4, 4) match the
names listed here. NOTE(review): the STRING_xxx0 macros presumably expand to
the name followed by a NUL, and the last name relies on the string literal's
own terminator - confirm in pcre_internal.h. */
static const char verbnames[] =
  "\0"                       /* Empty name is a shorthand for MARK */
  STRING_MARK0
  STRING_ACCEPT0
  STRING_COMMIT0
  STRING_F0
  STRING_FAIL0
  STRING_PRUNE0
  STRING_SKIP0
  STRING_THEN;
255
/* One entry per name in verbnames, in the same order. The fields are the name
length, the opcode used when there is no argument (-1 if an argument is
mandatory), and the opcode used when an argument is present (-1 if an argument
is not allowed). */
static const verbitem verbs[] = {
  { 0, -1,        OP_MARK },        /* empty name, shorthand for MARK */
  { 4, -1,        OP_MARK },        /* MARK */
  { 6, OP_ACCEPT, -1 },             /* ACCEPT */
  { 6, OP_COMMIT, -1 },             /* COMMIT */
  { 1, OP_FAIL,   -1 },             /* F */
  { 4, OP_FAIL,   -1 },             /* FAIL */
  { 5, OP_PRUNE,  OP_PRUNE_ARG },   /* PRUNE */
  { 4, OP_SKIP,   OP_SKIP_ARG  },   /* SKIP */
  { 4, OP_THEN,   OP_THEN_ARG  }    /* THEN */
};
267
268static const int verbcount = sizeof(verbs)/sizeof(verbitem);
269
270
271/* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
272another regex library. */
273
/* Spells the pattern text \b(?=\w) followed by a terminating zero. */
static const pcre_uchar sub_start_of_word[] = {
  CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
  CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };

/* Spells the pattern text \b(?<=\w) followed by a terminating zero. */
static const pcre_uchar sub_end_of_word[] = {
  CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
  CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
  CHAR_RIGHT_PARENTHESIS, '\0' };
282
283
284/* Tables of names of POSIX character classes and their lengths. The names are
285now all in a single string, to reduce the number of relocations when a shared
286library is dynamically loaded. The list of lengths is terminated by a zero
287length entry. The first three must be alpha, lower, upper, as this is assumed
288for handling case independence. The indices for graph, print, and punct are
289needed, so identify them. */
290
/* The fourteen POSIX class names in one string, in this order: alpha, lower,
upper, alnum, ascii, blank, cntrl, digit, graph, print, punct, space, word,
xdigit. */
static const char posix_names[] =
  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
  STRING_word0  STRING_xdigit;

/* Length of each name above, in the same order: twelve names of five
characters, then "word" (4) and "xdigit" (6). The final 0 terminates the
list. */
static const pcre_uint8 posix_name_lengths[] = {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };

/* Zero-based positions of graph, print and punct in the name list. */
#define PC_GRAPH  8
#define PC_PRINT  9
#define PC_PUNCT 10
303
304
305/* Table of class bit maps for each POSIX class. Each class is formed from a
306base map, with an optional addition or removal of another map. Then, for some
307classes, there is some additional tweaking: for [:blank:] the vertical space
308characters are removed, and for [:alpha:] and [:alnum:] the underscore
309character is removed. The triples in the table consist of the base map offset,
310second map offset or -1 if no second map, and a non-negative value for map
311addition or a negative value for map subtraction (if there are two maps). The
312absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
313remove vertical space characters, 2 => remove underscore. */
314
/* Three values per POSIX class, in the order of the POSIX class names: base
map offset, second map offset (or -1 for none), and the combine/tweak value.
When there are two maps, a negative third value means the second map is
subtracted and a non-negative one means it is added; its absolute value
selects the tweak (0 none, 1 remove vertical space, 2 remove underscore). */
static const int posix_class_maps[] = {
  cbit_word,  cbit_digit, -2,             /* alpha - word minus digit, no underscore */
  cbit_lower, -1,          0,             /* lower */
  cbit_upper, -1,          0,             /* upper */
  cbit_word,  -1,          2,             /* alnum - word without underscore */
  cbit_print, cbit_cntrl,  0,             /* ascii - print plus cntrl */
  cbit_space, -1,          1,             /* blank - a GNU extension */
  cbit_cntrl, -1,          0,             /* cntrl */
  cbit_digit, -1,          0,             /* digit */
  cbit_graph, -1,          0,             /* graph */
  cbit_print, -1,          0,             /* print */
  cbit_punct, -1,          0,             /* punct */
  cbit_space, -1,          0,             /* space */
  cbit_word,  -1,          0,             /* word - a Perl extension */
  cbit_xdigit,-1,          0              /* xdigit */
};
331
332/* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
333Unicode property escapes. */
334
335#ifdef SUPPORT_UCP
/* Each array below spells one Unicode property escape as pattern text,
followed by a terminating zero. The text spelled is shown beside each name. */
static const pcre_uchar string_PNd[]  = {       /* \P{Nd} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pNd[]  = {       /* \p{Nd} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PXsp[] = {       /* \P{Xsp} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pXsp[] = {       /* \p{Xsp} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PXwd[] = {       /* \P{Xwd} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pXwd[] = {       /* \p{Xwd} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
354
355static const pcre_uchar *substitutes[] = {
356 string_PNd, /* \D */
357 string_pNd, /* \d */
358 string_PXsp, /* \S */ /* Xsp is Perl space, but from 8.34, Perl */
359 string_pXsp, /* \s */ /* space and POSIX space are the same. */
360 string_PXwd, /* \W */
361 string_pXwd /* \w */
362};
363
364/* The POSIX class substitutes must be in the order of the POSIX class names,
365defined above, and there are both positive and negative cases. NULL means no
366general substitute of a Unicode property escape (\p or \P). However, for some
367POSIX classes (e.g. graph, print, punct) a special property code is compiled
368directly. */
369
/* Each array below spells one escape as pattern text, followed by a
terminating zero. The text spelled is shown beside each name. */
static const pcre_uchar string_pL[] =   {       /* \p{L} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pLl[] =  {       /* \p{Ll} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pLu[] =  {       /* \p{Lu} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pXan[] = {       /* \p{Xan} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_h[] =    {       /* \h */
  CHAR_BACKSLASH, CHAR_h, '\0' };
static const pcre_uchar string_pXps[] = {       /* \p{Xps} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PL[] =   {       /* \P{L} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PLl[] =  {       /* \P{Ll} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PLu[] =  {       /* \P{Lu} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PXan[] = {       /* \P{Xan} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_H[] =    {       /* \H */
  CHAR_BACKSLASH, CHAR_H, '\0' };
static const pcre_uchar string_PXps[] = {       /* \P{Xps} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
404
405static const pcre_uchar *posix_substitutes[] = {
406 string_pL, /* alpha */
407 string_pLl, /* lower */
408 string_pLu, /* upper */
409 string_pXan, /* alnum */
410 NULL, /* ascii */
411 string_h, /* blank */
412 NULL, /* cntrl */
413 string_pNd, /* digit */
414 NULL, /* graph */
415 NULL, /* print */
416 NULL, /* punct */
417 string_pXps, /* space */ /* Xps is POSIX space, but from 8.34 */
418 string_pXwd, /* word */ /* Perl and POSIX space are the same */
419 NULL, /* xdigit */
420 /* Negated cases */
421 string_PL, /* ^alpha */
422 string_PLl, /* ^lower */
423 string_PLu, /* ^upper */
424 string_PXan, /* ^alnum */
425 NULL, /* ^ascii */
426 string_H, /* ^blank */
427 NULL, /* ^cntrl */
428 string_PNd, /* ^digit */
429 NULL, /* ^graph */
430 NULL, /* ^print */
431 NULL, /* ^punct */
432 string_PXps, /* ^space */ /* Xps is POSIX space, but from 8.34 */
433 string_PXwd, /* ^word */ /* Perl and POSIX space are the same */
434 NULL /* ^xdigit */
435};
436#define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
437#endif
438
/* STRING() turns its argument into a string literal exactly as written.
XSTRING() macro-expands its argument first and then stringifies the result. */
#define STRING(text)   #text
#define XSTRING(macro) STRING(macro)
441
442/* The texts of compile-time error messages. These are "char *" because they
443are passed to the outside world. Do not ever re-use any error number, because
444they are documented. Always add a new error instead. Messages marked DEAD below
445are no longer used. This used to be a table of strings, but in order to reduce
446the number of relocations needed when a shared library is loaded dynamically,
447it is now one long string. We cannot use a table of offsets, because the
448lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
449simply count through to the one we want - this isn't a performance issue
450because these strings are used only when there is a compilation error.
451
452Each substring ends with \0 to insert a null character. This includes the final
453substring, so that the whole string ends with \0\0, which can be detected when
454counting through. */
455
/* All compile-time error messages in one string. Every message ends with an
explicit \0, so the string as a whole ends with two NULs. A message's position
in the sequence is its error number; the numeric comments mark the number of
the first message in each group of five. */
static const char error_texts[] =
  "no error\0"
  "\\ at end of pattern\0"
  "\\c at end of pattern\0"
  "unrecognized character follows \\\0"
  "numbers out of order in {} quantifier\0"
  /* 5 */
  "number too big in {} quantifier\0"
  "missing terminating ] for character class\0"
  "invalid escape sequence in character class\0"
  "range out of order in character class\0"
  "nothing to repeat\0"
  /* 10 */
  "internal error: invalid forward reference offset\0"
  "internal error: unexpected repeat\0"
  "unrecognized character after (? or (?-\0"
  "POSIX named classes are supported only within a class\0"
  "missing )\0"
  /* 15 */
  "reference to non-existent subpattern\0"
  "erroffset passed as NULL\0"
  "unknown option bit(s) set\0"
  "missing ) after comment\0"
  "parentheses nested too deeply\0"  /** DEAD **/
  /* 20 */
  "regular expression is too large\0"
  "failed to get memory\0"
  "unmatched parentheses\0"
  "internal error: code overflow\0"
  "unrecognized character after (?<\0"
  /* 25 */
  "lookbehind assertion is not fixed length\0"
  "malformed number or name after (?(\0"
  "conditional group contains more than two branches\0"
  "assertion expected after (?( or (?(?C)\0"
  "(?R or (?[+-]digits must be followed by )\0"
  /* 30 */
  "unknown POSIX class name\0"
  "POSIX collating elements are not supported\0"
  "this version of PCRE is compiled without UTF support\0"
  "spare error\0"  /** DEAD **/
  "character value in \\x{} or \\o{} is too large\0"
  /* 35 */
  "invalid condition (?(0)\0"
  "\\C not allowed in lookbehind assertion\0"
  "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
  "number after (?C is > 255\0"
  "closing ) for (?C expected\0"
  /* 40 */
  "recursive call could loop indefinitely\0"
  "unrecognized character after (?P\0"
  "syntax error in subpattern name (missing terminator)\0"
  "two named subpatterns have the same name\0"
  "invalid UTF-8 string\0"
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled\0"
  "malformed \\P or \\p sequence\0"
  "unknown property name after \\P or \\p\0"
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  /* 50 */
  "repeated subpattern is too long\0"    /** DEAD **/
  "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
  "internal error: overran compiling workspace\0"
  "internal error: previously-checked referenced subpattern not found\0"
  "DEFINE group contains more than one branch\0"
  /* 55 */
  "repeating a DEFINE group is not allowed\0"  /** DEAD **/
  "inconsistent NEWLINE options\0"
  "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
  "a numbered reference must not be zero\0"
  "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
  /* 60 */
  "(*VERB) not recognized or malformed\0"
  "number is too big\0"
  "subpattern name expected\0"
  "digit expected after (?+\0"
  "] is an invalid data character in JavaScript compatibility mode\0"
  /* 65 */
  "different names for subpatterns of the same number are not allowed\0"
  "(*MARK) must have an argument\0"
  "this version of PCRE is not compiled with Unicode property support\0"
  /* Message 68 has a different wording in the EBCDIC build. */
#ifndef EBCDIC
  "\\c must be followed by an ASCII character\0"
#else
  "\\c must be followed by a letter or one of [\\]^_?\0"
#endif
  "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
  /* 70 */
  "internal error: unknown opcode in find_fixedlength()\0"
  "\\N is not supported in a class\0"
  "too many forward references\0"
  "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
  "invalid UTF-16 string\0"
  /* 75 */
  "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
  "character value in \\u.... sequence is too large\0"
  "invalid UTF-32 string\0"
  "setting UTF is disabled by the application\0"
  "non-hex character in \\x{} (closing brace missing?)\0"
  /* 80 */
  "non-octal character in \\o{} (closing brace missing?)\0"
  "missing opening brace after \\o\0"
  "parentheses are too deeply nested\0"
  "invalid range in character class\0"
  "group name must start with a non-digit\0"
  /* 85 */
  "parentheses are too deeply nested (stack check)\0"
  "digits missing in \\x{} or \\o{}\0"
  "regular expression is too complicated\0"
  ;
567
568/* Table to identify digits and hex digits. This is used when compiling
569patterns. Note that the tables in chartables are dependent on the locale, and
570may mark arbitrary characters as digits - but the PCRE compiling code expects
571to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
572a private table here. It costs 256 bytes, but it is a lot faster than doing
573character value tests (at least in some simple cases I timed), and in some
574applications one wants PCRE to compile efficiently as well as match
575efficiently.
576
577For convenience, we use the same bit definitions as in chartables:
578
579 0x04 decimal digit
580 0x08 hexadecimal digit
581
582Then we can use ctype_digit and ctype_xdigit in the code. */
583
/* Decimal digits are recognized with a plain range comparison instead of a
table lookup: this is much faster than a memory read, and the compiler reduces
it to a subtraction followed by a single unsigned comparison. The argument may
be evaluated twice, so it must be free of side effects. */

#define IS_DIGIT(x) (CHAR_0 <= (x) && (x) <= CHAR_9)
589
590#ifndef EBCDIC
591
592/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
593UTF-8 mode. */
594
/* Digit table for the non-EBCDIC build: 256 entries, eight per row. Entries
for '0' to '9' are 0x0c (decimal digit 0x04 plus hexadecimal digit 0x08);
entries for 'A' to 'F' and 'a' to 'f' are 0x08 (hexadecimal digit only); all
other entries are zero. */
static const pcre_uint8 digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* space - ' */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
629
630#else
631
632/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
633
/* Digit table for the EBCDIC build: 256 entries, eight per row. The bit
values are the same as in the ASCII table: 0x0c marks a decimal digit (which
is also a hexadecimal digit), 0x08 marks a hexadecimal-only digit. The last
column of each row comment gives the hexadecimal offset of alternate rows. */
static const pcre_uint8 digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
668
/* Character-type table for the EBCDIC build: 256 entries, eight per row.
NOTE(review): the inline comment calls this a partial duplicate of the
character tables; the meaning of the individual bits (0x01, 0x02, 0x04, 0x08,
0x10, 0x80) is not defined in this part of the file - presumably they are the
ctype_xxx bits declared in pcre_internal.h; confirm before relying on them. */
static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
702#endif
703
704
705/* This table is used to check whether auto-possessification is possible
706between adjacent character-type opcodes. The left-hand (repeated) opcode is
used to select the row, and the right-hand opcode is used to select the column.
708A value of 1 means that auto-possessification is OK. For example, the second
709value in the first row means that \D+\d can be turned into \D++\d.
710
711The Unicode property types (\P and \p) have to be present to fill out the table
712because of what their opcode values are, but the table values should always be
713zero because property types are handled separately in the code. The last four
714columns apply to items that cannot be repeated, so there is no need to have
715rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
716*not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
717
/* Table dimensions, derived from the opcode range: one row for each opcode
that can appear on the left (repeated) side and one column for each opcode
that can appear on the right. */
#define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
#define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)

/* The initializer below has 17 rows (\D to \X) and 21 columns (\D to $M). The
row is selected by the left-hand (repeated) item and the column by the item
that follows it; 1 means auto-possessification is OK. The \P and \p rows and
columns are all zero. */
static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
/*  \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
  { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
  { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
  { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
  { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
  { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
  { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
};
741
742
/* This table is used to check whether auto-possessification is possible
between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
left-hand (repeated) opcode is used to select the row, and the right-hand
opcode is used to select the column. The table is square (PT_TABSIZE by
PT_TABSIZE); the initializer supplies 11 rows of 11 values, one per PT_ type
named in the row and column labels. The values are as follows:

  0   Always return FALSE (never auto-possessify)
  1   Character groups are distinct (possessify if both are OP_PROP)
  2   Check character categories in the same group (general or particular)
  3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)

  4   Check left general category vs right particular category
  5   Check right general category vs left particular category

  6   Left alphanum vs right general category
  7   Left space vs right general category
  8   Left word vs right general category

  9   Right alphanum vs left general category
 10   Right space vs left general category
 11   Right word vs left general category

 12   Left alphanum vs right particular category
 13   Left space vs right particular category
 14   Left word vs right particular category

 15   Right alphanum vs left particular category
 16   Right space vs left particular category
 17   Right word vs left particular category
*/

static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
/* ANY LAMP GC  PC  SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
  { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_ANY */
  { 0,  3,  0,  0,  0,    3,    1,      1,   0,    0,   0 },  /* PT_LAMP */
  { 0,  0,  2,  4,  0,    9,   10,     10,  11,    0,   0 },  /* PT_GC */
  { 0,  0,  5,  2,  0,   15,   16,     16,  17,    0,   0 },  /* PT_PC */
  { 0,  0,  0,  0,  2,    0,    0,      0,   0,    0,   0 },  /* PT_SC */
  { 0,  3,  6, 12,  0,    3,    1,      1,   0,    0,   0 },  /* PT_ALNUM */
  { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_SPACE */
  { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_PXSPACE */
  { 0,  0,  8, 14,  0,    0,    1,      1,   3,    0,   0 },  /* PT_WORD */
  { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_CLIST */
  { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   3 }   /* PT_UCNC */
};
787
/* This table is used to check whether auto-possessification is possible
between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
specifies a general category and the other specifies a particular category. The
row is selected by the general category and the column by the particular
category. The value is 1 if the particular category is not part of the general
category.

There are 7 rows (general categories C, L, M, N, P, S, Z) and 30 columns (the
particular categories, grouped under their general category in the same
order), so each row holds a single run of zeros covering its own members. */

static const pcre_uint8 catposstab[7][30] = {
/* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
  { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
  { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
};
805
/* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
a general or particular category. The properties in each row are those
that apply to the character set in question. Duplication means that a little
unnecessary work is done when checking, but this keeps things much simpler
because they can all use the same code. For more details see the comment where
this table is used.

There are three rows of four ucp_ category values each: row 0 is for ALNUM,
row 1 is shared by SPACE and PXSPACE, and row 2 is for WORD.

Note: SPACE and PXSPACE used to be different because Perl excluded VT from
"space", but from Perl 5.18 it's included, so both categories are treated the
same here. */

static const pcre_uint8 posspropstab[3][4] = {
  { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
  { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
  { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
};
822
/* This table is used when converting repeating opcodes into possessified
versions as a result of an explicit possessive quantifier such as ++. A zero
value means there is no possessified version - in those cases the item in
question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
because all relevant opcodes are less than that.

The table is laid out in opcode order, one entry per opcode: 32 leading zeros
for opcodes 0 - 31, then the repeat families. Within each family only the
greedy form (STAR, PLUS, QUERY, UPTO/RANGE) has a non-zero entry; the
minimizing forms, the EXACT forms, and the forms that are already possessive
are all zero. */

static const pcre_uint8 opcode_possessify[] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */

  0,                       /* NOTI */
  OP_POSSTAR, 0,           /* STAR, MINSTAR */
  OP_POSPLUS, 0,           /* PLUS, MINPLUS */
  OP_POSQUERY, 0,          /* QUERY, MINQUERY */
  OP_POSUPTO, 0,           /* UPTO, MINUPTO */
  0,                       /* EXACT */
  0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */

  OP_POSSTARI, 0,          /* STARI, MINSTARI */
  OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
  OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
  OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
  0,                       /* EXACTI */
  0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
  OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
  OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
  OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
  0,                       /* NOTEXACT */
  0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */

  OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
  OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
  OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
  OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
  0,                       /* NOTEXACTI */
  0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
  OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
  OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
  OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
  0,                       /* TYPEEXACT */
  0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */

  OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
  OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
  OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
  OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
  0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */

  0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
  0, 0,                    /* REF, REFI */
  0, 0,                    /* DNREF, DNREFI */
  0, 0                     /* RECURSE, CALLOUT */
};
880
881
882
883/*************************************************
884* Find an error text *
885*************************************************/
886
887/* The error texts are now all in one long string, to save on relocations. As
888some of the text is of unknown length, we can't use a table of offsets.
889Instead, just count through the strings. This is not a performance issue
890because it happens only when there has been a compilation error.
891
892Argument: the error number
893Returns: pointer to the error string
894*/
895
896static const char *
897find_error_text(int n)
898{
899const char *s = error_texts;
900for (; n > 0; n--)
901 {
902 while (*s++ != CHAR_NULL) {};
903 if (*s == CHAR_NULL) return "Error text not found (please report)";
904 }
905return s;
906}
907
908
909
910/*************************************************
911* Expand the workspace *
912*************************************************/
913
914/* This function is called during the second compiling phase, if the number of
915forward references fills the existing workspace, which is originally a block on
916the stack. A larger block is obtained from malloc() unless the ultimate limit
917has been reached or the increase will be rather small.
918
919Argument: pointer to the compile data block
920Returns: 0 if all went well, else an error number
921*/
922
923static int
924expand_workspace(compile_data *cd)
925{
926pcre_uchar *newspace;
927int newsize = cd->workspace_size * 2;
928
929if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
930if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
931 newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
932 return ERR72;
933
934newspace = (PUBL(malloc))(IN_UCHARS(newsize));
935if (newspace == NULL) return ERR21;
936memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
937cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
938if (cd->workspace_size > COMPILE_WORK_SIZE)
939 (PUBL(free))((void *)cd->start_workspace);
940cd->start_workspace = newspace;
941cd->workspace_size = newsize;
942return 0;
943}
944
945
946
947/*************************************************
948* Check for counted repeat *
949*************************************************/
950
951/* This function is called when a '{' is encountered in a place where it might
952start a quantifier. It looks ahead to see if it really is a quantifier or not.
953It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
954where the ddds are digits.
955
956Arguments:
957 p pointer to the first char after '{'
958
959Returns: TRUE or FALSE
960*/
961
962static BOOL
963is_counted_repeat(const pcre_uchar *p)
964{
965if (!IS_DIGIT(*p)) return FALSE;
966p++;
967while (IS_DIGIT(*p)) p++;
968if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
969
970if (*p++ != CHAR_COMMA) return FALSE;
971if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
972
973if (!IS_DIGIT(*p)) return FALSE;
974p++;
975while (IS_DIGIT(*p)) p++;
976
977return (*p == CHAR_RIGHT_CURLY_BRACKET);
978}
979
980
981
982/*************************************************
983* Handle escapes *
984*************************************************/
985
/* This function is called when a \ has been encountered. It returns 0 when the
escape stands for a data character, whose value is placed in *chptr; a positive
value (one of the ESC_xxx codes) for a special escape sequence; or negative n
for a backreference to group n. The data character may be greater than 255
(the limit checks below allow up to 0x10ffff in UTF mode, and larger values
than 0xff in the 16-bit and 32-bit libraries). On entry, *ptrptr is pointing at
the \. On exit, it is on the final character of the escape sequence.

Errors are reported only by setting *errorcodeptr; there is no distinct error
return value, and *ptrptr and *chptr are written on every path, so the caller
has to inspect *errorcodeptr after the call.

Arguments:
  ptrptr         points to the pattern position pointer
  chptr          points to a returned data character
  errorcodeptr   points to the errorcode variable
  bracount       number of previous extracting brackets
  options        the options bits
  isclass        TRUE if inside a character class

Returns:         zero => a data character
                 positive => a special escape sequence
                 negative => a back reference
                 on error, errorcodeptr is set
*/

static int
check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
  int bracount, int options, BOOL isclass)
{
/* PCRE_UTF16 has the same value as PCRE_UTF8. */
BOOL utf = (options & PCRE_UTF8) != 0;
const pcre_uchar *ptr = *ptrptr + 1;
pcre_uint32 c;
int escape = 0;
int i;

GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
ptr--;                            /* Set pointer back to the last byte */

/* If backslash is at the end of the pattern, it's an error. */

if (c == CHAR_NULL) *errorcodeptr = ERR1;

/* Non-alphanumerics are literals. For digits or letters, do an initial lookup
in a table. A non-zero result is something that can be returned immediately.
Otherwise further processing may be required.

A positive table entry is the value of a data character; a negative entry is
the negated escape code. Note that when the lookup yields zero, i is left set
to zero, and the octal and \xhh loops further down rely on that. */

#ifndef EBCDIC  /* ASCII/UTF-8 coding */
/* Not alphanumeric */
else if (c < CHAR_0 || c > CHAR_z) {}
else if ((i = escapes[c - CHAR_0]) != 0)
  { if (i > 0) c = (pcre_uint32)i; else escape = -i; }

#else           /* EBCDIC coding */
/* Not alphanumeric */
else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
else if ((i = escapes[c - 0x48]) != 0)  { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
#endif

/* Escapes that need further processing, or are illegal. */

else
  {
  const pcre_uchar *oldptr;
  BOOL braced, negated, overflow;
  int s;

  switch (c)
    {
    /* A number of Perl escapes are not handled by PCRE. We give an explicit
    error. */

    case CHAR_l:
    case CHAR_L:
    *errorcodeptr = ERR37;
    break;

    case CHAR_u:
    if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
      {
      /* In JavaScript, \u must be followed by four hexadecimal numbers.
      Otherwise it is a lowercase u letter. */
      if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
        && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
        && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
        && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
        {
        c = 0;
        for (i = 0; i < 4; ++i)
          {
          register pcre_uint32 cc = *(++ptr);
#ifndef EBCDIC  /* ASCII/UTF-8 coding */
          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
#else           /* EBCDIC coding */
          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
#endif
          }

        /* Reject a value that is too large for the current library width and
        mode, and a UTF surrogate. */

#if defined COMPILE_PCRE8
        if (c > (utf ? 0x10ffffU : 0xffU))
#elif defined COMPILE_PCRE16
        if (c > (utf ? 0x10ffffU : 0xffffU))
#elif defined COMPILE_PCRE32
        if (utf && c > 0x10ffffU)
#endif
          {
          *errorcodeptr = ERR76;
          }
        else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
        }
      }
    else
      *errorcodeptr = ERR37;
    break;

    case CHAR_U:
    /* In JavaScript, \U is an uppercase U letter. */
    if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
    break;

    /* In a character class, \g is just a literal "g". Outside a character
    class, \g must be followed by one of a number of specific things:

    (1) A number, either plain or braced. If positive, it is an absolute
    backreference. If negative, it is a relative backreference. This is a Perl
    5.10 feature.

    (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
    is part of Perl's movement towards a unified syntax for back references. As
    this is synonymous with \k{name}, we fudge it up by pretending it really
    was \k.

    (3) For Oniguruma compatibility we also support \g followed by a name or a
    number either in angle brackets or in single quotes. However, these are
    (possibly recursive) subroutine calls, _not_ backreferences. Just return
    the ESC_g code (cf \k). */

    case CHAR_g:
    if (isclass) break;
    if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
      {
      escape = ESC_g;
      break;
      }

    /* Handle the Perl-compatible cases */

    if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
      {
      /* Look ahead through the braced text. If anything other than a digit or
      a minus sign turns up before the closing brace, it is a name, so treat
      the item as \k. The main pointer is not moved in that case. */

      const pcre_uchar *p;
      for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
        if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
      if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
        {
        escape = ESC_k;
        break;
        }
      braced = TRUE;
      ptr++;
      }
    else braced = FALSE;

    if (ptr[1] == CHAR_MINUS)
      {
      negated = TRUE;
      ptr++;
      }
    else negated = FALSE;

    /* The integer range is limited by the machine's int representation. The
    test is made before each multiplication so that s never overflows. */
    s = 0;
    overflow = FALSE;
    while (IS_DIGIT(ptr[1]))
      {
      if (s > INT_MAX / 10 - 1) /* Integer overflow */
        {
        overflow = TRUE;
        break;
        }
      s = s * 10 + (int)(*(++ptr) - CHAR_0);
      }
    if (overflow) /* Integer overflow */
      {
      /* Skip the remaining digits so that ptr ends on the last one. */
      while (IS_DIGIT(ptr[1]))
        ptr++;
      *errorcodeptr = ERR61;
      break;
      }

    if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
      {
      *errorcodeptr = ERR57;
      break;
      }

    if (s == 0)
      {
      *errorcodeptr = ERR58;
      break;
      }

    /* Turn a relative reference into an absolute group number: -1 means the
    most recent of the bracount previous groups. */

    if (negated)
      {
      if (s > bracount)
        {
        *errorcodeptr = ERR15;
        break;
        }
      s = bracount - (s - 1);
      }

    escape = -s;
    break;

    /* The handling of escape sequences consisting of a string of digits
    starting with one that is not zero is not straightforward. Perl has changed
    over the years. Nowadays \g{} for backreferences and \o{} for octal are
    recommended to avoid the ambiguities in the old syntax.

    Outside a character class, the digits are read as a decimal number. If the
    number is less than 8 (used to be 10), or if there are that many previous
    extracting left brackets, then it is a back reference. Otherwise, up to
    three octal digits are read to form an escaped byte. Thus \123 is likely to
    be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
    the octal value is greater than 377, the least significant 8 bits are
    taken. \8 and \9 are treated as the literal characters 8 and 9.

    Inside a character class, \ followed by a digit is always either a literal
    8 or 9 or an octal number. */

    case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
    case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:

    if (!isclass)
      {
      oldptr = ptr;
      /* The integer range is limited by the machine's int representation. */
      s = (int)(c -CHAR_0);
      overflow = FALSE;
      while (IS_DIGIT(ptr[1]))
        {
        if (s > INT_MAX / 10 - 1) /* Integer overflow */
          {
          overflow = TRUE;
          break;
          }
        s = s * 10 + (int)(*(++ptr) - CHAR_0);
        }
      if (overflow) /* Integer overflow */
        {
        while (IS_DIGIT(ptr[1]))
          ptr++;
        *errorcodeptr = ERR61;
        break;
        }
      if (s < 8 || s <= bracount)  /* Check for back reference */
        {
        escape = -s;
        break;
        }
      ptr = oldptr;      /* Put the pointer back and fall through */
      }

    /* Handle a digit following \ when the number is not a back reference. If
    the first digit is 8 or 9, Perl used to generate a binary zero byte and
    then treat the digit as a following literal. At least by Perl 5.18 this
    changed so as not to insert the binary zero. */

    if ((c = *ptr) >= CHAR_8) break;

    /* Fall through with a digit less than 8 */

    /* \0 always starts an octal number, but we may drop through to here with a
    larger first octal digit. The original code used just to take the least
    significant 8 bits of octal numbers (I think this is what early Perls used
    to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
    but no more than 3 octal digits.

    The loop counter i is not set here: it is zero on arrival because the
    switch is reached only when the escapes[] lookup above gave zero, so at
    most two more digits are read. */

    case CHAR_0:
    c -= CHAR_0;
    while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
        c = c * 8 + *(++ptr) - CHAR_0;
#ifdef COMPILE_PCRE8
    if (!utf && c > 0xff) *errorcodeptr = ERR51;
#endif
    break;

    /* \o is a relatively new Perl feature, supporting a more general way of
    specifying character codes in octal. The only supported form is \o{ddd}. */

    case CHAR_o:
    if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
    if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
      {
      ptr += 2;
      c = 0;
      overflow = FALSE;
      while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
        {
        register pcre_uint32 cc = *ptr++;
        if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
#ifdef COMPILE_PCRE32
        /* Check before shifting so that the 32-bit value cannot wrap. */
        if (c >= 0x20000000l) { overflow = TRUE; break; }
#endif
        c = (c << 3) + cc - CHAR_0 ;
#if defined COMPILE_PCRE8
        if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
#elif defined COMPILE_PCRE16
        if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
#elif defined COMPILE_PCRE32
        if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
#endif
        }
      if (overflow)
        {
        while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
        *errorcodeptr = ERR34;
        }
      else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
        {
        if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
        }
      else *errorcodeptr = ERR80;
      }
    break;

    /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
    numbers. Otherwise it is a lowercase x letter. */

    case CHAR_x:
    if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
      {
      if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
        && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
        {
        c = 0;
        for (i = 0; i < 2; ++i)
          {
          register pcre_uint32 cc = *(++ptr);
#ifndef EBCDIC  /* ASCII/UTF-8 coding */
          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
#else           /* EBCDIC coding */
          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
#endif
          }
        }
      }    /* End JavaScript handling */

    /* Handle \x in Perl's style. \x{ddd} is a character number which can be
    greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
    digits. If not, { used to be treated as a data character. However, Perl
    seems to read hex digits up to the first non-such, and ignore the rest, so
    that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
    now gives an error. */

    else
      {
      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
        {
        ptr += 2;
        if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
          {
          *errorcodeptr = ERR86;
          break;
          }
        c = 0;
        overflow = FALSE;
        while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
          {
          register pcre_uint32 cc = *ptr++;
          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */

#ifdef COMPILE_PCRE32
          /* Check before shifting so that the 32-bit value cannot wrap. */
          if (c >= 0x10000000l) { overflow = TRUE; break; }
#endif

#ifndef EBCDIC  /* ASCII/UTF-8 coding */
          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
#else           /* EBCDIC coding */
          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
#endif

#if defined COMPILE_PCRE8
          if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
#elif defined COMPILE_PCRE16
          if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
#elif defined COMPILE_PCRE32
          if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
#endif
          }

        if (overflow)
          {
          while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
          *errorcodeptr = ERR34;
          }

        else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
          {
          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
          }

        /* If the sequence of hex digits does not end with '}', give an error.
        We used just to recognize this construct and fall through to the normal
        \x handling, but nowadays Perl gives an error, which seems much more
        sensible, so we do too. */

        else *errorcodeptr = ERR79;
        }   /* End of \x{} processing */

      /* Read a single-byte hex-defined char (up to two hex digits after \x).
      As in the octal case, i is zero on arrival here (from the escapes[]
      lookup), which is what limits the loop to two digits. */

      else
        {
        c = 0;
        while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
          {
          pcre_uint32 cc;                          /* Some compilers don't like */
          cc = *(++ptr);                           /* ++ in initializers */
#ifndef EBCDIC  /* ASCII/UTF-8 coding */
          if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
          c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
#else           /* EBCDIC coding */
          if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
          c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
#endif
          }
        }     /* End of \xdd handling */
      }       /* End of Perl-style \x handling */
    break;

    /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
    An error is given if the byte following \c is not an ASCII character. This
    coding is ASCII-specific, but then the whole concept of \cx is
    ASCII-specific. (However, an EBCDIC equivalent has now been added.) */

    case CHAR_c:
    c = *(++ptr);
    if (c == CHAR_NULL)
      {
      *errorcodeptr = ERR2;
      break;
      }
#ifndef EBCDIC    /* ASCII/UTF-8 coding */
    if (c > 127)  /* Excludes all non-ASCII in either mode */
      {
      *errorcodeptr = ERR68;
      break;
      }
    if (c >= CHAR_a && c <= CHAR_z) c -= 32;
    c ^= 0x40;
#else             /* EBCDIC coding */
    if (c >= CHAR_a && c <= CHAR_z) c += 64;
    if (c == CHAR_QUESTION_MARK)
      c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
    else
      {
      /* The result is the position of the character in ebcdic_escape_c[];
      a character that is not in the table is an error. */
      for (i = 0; i < 32; i++)
        {
        if (c == ebcdic_escape_c[i]) break;
        }
      if (i < 32) c = i; else *errorcodeptr = ERR68;
      }
#endif
    break;

    /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
    other alphanumeric following \ is an error if PCRE_EXTRA was set;
    otherwise, for Perl compatibility, it is a literal. This code looks a bit
    odd, but there used to be some cases other than the default, and there may
    be again in future, so I haven't "optimized" it. */

    default:
    if ((options & PCRE_EXTRA) != 0) switch(c)
      {
      default:
      *errorcodeptr = ERR3;
      break;
      }
    break;
    }
  }

/* Perl supports \N{name} for character names, as well as plain \N for "not
newline". PCRE does not support \N{name}. However, it does support
quantification such as \N{2,3}. */

if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
     !is_counted_repeat(ptr+2))
  *errorcodeptr = ERR37;

/* If PCRE_UCP is set, we change the values for \d etc. */

if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
  escape += (ESC_DU - ESC_D);

/* Set the pointer to the final character before returning. */

*ptrptr = ptr;
*chptr = c;
return escape;
}
1490
1491
1492
1493#ifdef SUPPORT_UCP
1494/*************************************************
1495* Handle \P and \p *
1496*************************************************/
1497
1498/* This function is called after \P or \p has been encountered, provided that
1499PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1500pointing at the P or p. On exit, it is pointing at the final character of the
1501escape sequence.
1502
1503Argument:
1504 ptrptr points to the pattern position pointer
1505 negptr points to a boolean that is set TRUE for negation else FALSE
1506 ptypeptr points to an unsigned int that is set to the type value
1507 pdataptr points to an unsigned int that is set to the detailed property value
1508 errorcodeptr points to the error code variable
1509
1510Returns: TRUE if the type value was found, or FALSE for an invalid type
1511*/
1512
1513static BOOL
1514get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
1515 unsigned int *pdataptr, int *errorcodeptr)
1516{
1517pcre_uchar c;
1518int i, bot, top;
1519const pcre_uchar *ptr = *ptrptr;
1520pcre_uchar name[32];
1521
1522c = *(++ptr);
1523if (c == CHAR_NULL) goto ERROR_RETURN;
1524
1525*negptr = FALSE;
1526
1527/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1528negation. */
1529
1530if (c == CHAR_LEFT_CURLY_BRACKET)
1531 {
1532 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1533 {
1534 *negptr = TRUE;
1535 ptr++;
1536 }
1537 for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1538 {
1539 c = *(++ptr);
1540 if (c == CHAR_NULL) goto ERROR_RETURN;
1541 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1542 name[i] = c;
1543 }
1544 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1545 name[i] = 0;
1546 }
1547
1548/* Otherwise there is just one following character */
1549
1550else
1551 {
1552 name[0] = c;
1553 name[1] = 0;
1554 }
1555
1556*ptrptr = ptr;
1557
1558/* Search for a recognized property name using binary chop */
1559
1560bot = 0;
1561top = PRIV(utt_size);
1562
1563while (bot < top)
1564 {
1565 int r;
1566 i = (bot + top) >> 1;
1567 r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1568 if (r == 0)
1569 {
1570 *ptypeptr = PRIV(utt)[i].type;
1571 *pdataptr = PRIV(utt)[i].value;
1572 return TRUE;
1573 }
1574 if (r > 0) bot = i + 1; else top = i;
1575 }
1576
1577*errorcodeptr = ERR47;
1578*ptrptr = ptr;
1579return FALSE;
1580
1581ERROR_RETURN:
1582*errorcodeptr = ERR46;
1583*ptrptr = ptr;
1584return FALSE;
1585}
1586#endif
1587
1588
1589
1590/*************************************************
1591* Read repeat counts *
1592*************************************************/
1593
1594/* Read an item of the form {n,m} and return the values. This is called only
1595after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1596so the syntax is guaranteed to be correct, but we need to check the values.
1597
1598Arguments:
1599 p pointer to first char after '{'
1600 minp pointer to int for min
1601 maxp pointer to int for max
1602 returned as -1 if no max
1603 errorcodeptr points to error code variable
1604
1605Returns: pointer to '}' on success;
1606 current ptr on error, with errorcodeptr set non-zero
1607*/
1608
1609static const pcre_uchar *
1610read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1611{
1612int min = 0;
1613int max = -1;
1614
1615while (IS_DIGIT(*p))
1616 {
1617 min = min * 10 + (int)(*p++ - CHAR_0);
1618 if (min > 65535)
1619 {
1620 *errorcodeptr = ERR5;
1621 return p;
1622 }
1623 }
1624
1625if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1626 {
1627 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1628 {
1629 max = 0;
1630 while(IS_DIGIT(*p))
1631 {
1632 max = max * 10 + (int)(*p++ - CHAR_0);
1633 if (max > 65535)
1634 {
1635 *errorcodeptr = ERR5;
1636 return p;
1637 }
1638 }
1639 if (max < min)
1640 {
1641 *errorcodeptr = ERR4;
1642 return p;
1643 }
1644 }
1645 }
1646
1647*minp = min;
1648*maxp = max;
1649return p;
1650}
1651
1652
1653
1654/*************************************************
1655* Find first significant op code *
1656*************************************************/
1657
1658/* This is called by several functions that scan a compiled expression looking
1659for a fixed first character, or an anchoring op code etc. It skips over things
1660that do not influence this. For some calls, it makes sense to skip negative
1661forward and all backward assertions, and also the \b assertion; for others it
1662does not.
1663
1664Arguments:
1665 code pointer to the start of the group
1666 skipassert TRUE if certain assertions are to be skipped
1667
1668Returns: pointer to the first significant opcode
1669*/
1670
1671static const pcre_uchar*
1672first_significant_code(const pcre_uchar *code, BOOL skipassert)
1673{
1674for (;;)
1675 {
1676 switch ((int)*code)
1677 {
1678 case OP_ASSERT_NOT:
1679 case OP_ASSERTBACK:
1680 case OP_ASSERTBACK_NOT:
1681 if (!skipassert) return code;
1682 do code += GET(code, 1); while (*code == OP_ALT);
1683 code += PRIV(OP_lengths)[*code];
1684 break;
1685
1686 case OP_WORD_BOUNDARY:
1687 case OP_NOT_WORD_BOUNDARY:
1688 if (!skipassert) return code;
1689 /* Fall through */
1690
1691 case OP_CALLOUT:
1692 case OP_CREF:
1693 case OP_DNCREF:
1694 case OP_RREF:
1695 case OP_DNRREF:
1696 case OP_DEF:
1697 code += PRIV(OP_lengths)[*code];
1698 break;
1699
1700 default:
1701 return code;
1702 }
1703 }
1704/* Control never reaches here */
1705}
1706
1707
1708
/*************************************************
*    Find the fixed length of a branch           *
*************************************************/

/* Scan a group and compute the fixed length of subject that will match it,
if the length is fixed. This is needed for dealing with backward assertions.
In UTF mode, the result is in characters rather than code units. The branch is
temporarily terminated with OP_END when this function is called.

Scanning starts just after the opening bracket's link field. Every alternative
of the group must yield the same length; as soon as two alternatives differ the
result is -1. Nested groups and subroutine calls are measured by recursive
calls, and any negative result from a nested call is passed straight back to
the caller.

This function is called when a backward assertion is encountered, so that if it
fails, the error message can point to the correct place in the pattern.
However, we cannot do this when the assertion contains subroutine calls,
because they can be forward references. We solve this by remembering this case
and doing the check at the end; a flag specifies which mode we are running in.

Arguments:
  code     points to the start of the pattern (the bracket)
  utf      TRUE in UTF-8 / UTF-16 / UTF-32 mode
  atend    TRUE if called when the pattern is complete
  cd       the "compile data" structure
  recurses chain of recurse_check to catch mutual recursion

Returns:   the fixed length,
             or -1 if there is no fixed length,
             or -2 if \C was encountered (in UTF-8 mode only)
             or -3 if an OP_RECURSE item was encountered and atend is FALSE
             or -4 if an unknown opcode was encountered (internal error)
*/

static int
find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd,
  recurse_check *recurses)
{
int length = -1;                  /* Agreed length so far; -1 = none yet */
recurse_check this_recurse;       /* Chain entry used for subroutine calls */
register int branchlength = 0;    /* Length of the alternative being scanned */
register pcre_uchar *cc = code + 1 + LINK_SIZE;

/* Scan along the opcodes for this branch. If we get to the end of the
branch, check the length against that of the other branches. */

for (;;)
  {
  int d;
  pcre_uchar *ce, *cs;
  register pcre_uchar op = *cc;

  switch (op)
    {
    /* Nested groups that are measured by a recursive call: OP_CBRA (normal
    capturing bracket), OP_BRA (normal non-capturing bracket), OP_ONCE,
    OP_ONCE_NC and OP_COND. The other variants of the bracket opcodes are all
    concerned with unlimited repeated groups, which of course are not of fixed
    length; they appear in the "variable length" list further down. For
    OP_CBRA the group number that follows the link is skipped before the
    recursive scan. */

    case OP_CBRA:
    case OP_BRA:
    case OP_ONCE:
    case OP_ONCE_NC:
    case OP_COND:
    d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd,
      recurses);
    if (d < 0) return d;
    branchlength += d;
    do cc += GET(cc, 1); while (*cc == OP_ALT);   /* Move to the group's end */
    cc += 1 + LINK_SIZE;
    break;

    /* Reached end of a branch; if it's a ket it is the end of a nested call.
    If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
    an ALT. If it is END it's the end of the outer call. All can be handled by
    the same code. Note that we must not include the OP_KETRxxx opcodes here,
    because they all imply an unlimited repeat. */

    case OP_ALT:
    case OP_KET:
    case OP_END:
    case OP_ACCEPT:
    case OP_ASSERT_ACCEPT:
    if (length < 0) length = branchlength;
    else if (length != branchlength) return -1;
    if (*cc != OP_ALT) return length;
    cc += 1 + LINK_SIZE;          /* Start scanning the next alternative */
    branchlength = 0;
    break;

    /* A true recursion implies not fixed length, but a subroutine call may
    be OK. If the subroutine is a forward reference, we can't deal with
    it until the end of the pattern, so return -3. */

    case OP_RECURSE:
    if (!atend) return -3;
    cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */
    do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
    if (cc > cs && cc < ce) return -1; /* Recursion */
    else /* Check for mutual recursion */
      {
      recurse_check *r = recurses;
      for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
      if (r != NULL) return -1; /* Mutual recursion */
      }

    /* Record the called group on the chain before scanning it, so that a
    call back into it from a nested level is detected above. */

    this_recurse.prev = recurses;
    this_recurse.group = cs;
    d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd, &this_recurse);
    if (d < 0) return d;
    branchlength += d;
    cc += 1 + LINK_SIZE;
    break;

    /* Skip over assertive subpatterns; they add nothing to the length. */

    case OP_ASSERT:
    case OP_ASSERT_NOT:
    case OP_ASSERTBACK:
    case OP_ASSERTBACK_NOT:
    do cc += GET(cc, 1); while (*cc == OP_ALT);
    cc += 1 + LINK_SIZE;
    break;

    /* Skip over things that don't match chars. The verbs that carry an
    argument need the value in cc[1] added to their table length. */

    case OP_MARK:
    case OP_PRUNE_ARG:
    case OP_SKIP_ARG:
    case OP_THEN_ARG:
    cc += cc[1] + PRIV(OP_lengths)[*cc];
    break;

    /* These are stepped over using the table length alone. */

    case OP_CALLOUT:
    case OP_CIRC:
    case OP_CIRCM:
    case OP_CLOSE:
    case OP_COMMIT:
    case OP_CREF:
    case OP_DEF:
    case OP_DNCREF:
    case OP_DNRREF:
    case OP_DOLL:
    case OP_DOLLM:
    case OP_EOD:
    case OP_EODN:
    case OP_FAIL:
    case OP_NOT_WORD_BOUNDARY:
    case OP_PRUNE:
    case OP_REVERSE:
    case OP_RREF:
    case OP_SET_SOM:
    case OP_SKIP:
    case OP_SOD:
    case OP_SOM:
    case OP_THEN:
    case OP_WORD_BOUNDARY:
    cc += PRIV(OP_lengths)[*cc];
    break;

    /* Handle literal characters. One character is counted; in UTF mode any
    extra code units of a multi-unit character are skipped as well. */

    case OP_CHAR:
    case OP_CHARI:
    case OP_NOT:
    case OP_NOTI:
    branchlength++;
    cc += 2;
#ifdef SUPPORT_UTF
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
    break;

    /* Handle exact repetitions. The count is already in characters, but we
    need to skip over a multibyte character in UTF8 mode. */

    case OP_EXACT:
    case OP_EXACTI:
    case OP_NOTEXACT:
    case OP_NOTEXACTI:
    branchlength += (int)GET2(cc,1);
    cc += 2 + IMM2_SIZE;
#ifdef SUPPORT_UTF
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
    break;

    /* An exact repeat of a character type. When the type is \p or \P, two
    further units are skipped in addition to the type opcode itself. */

    case OP_TYPEEXACT:
    branchlength += GET2(cc,1);
    if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
      cc += 2;
    cc += 1 + IMM2_SIZE + 1;
    break;

    /* Handle single-char matchers. \p and \P skip two extra units and then
    share the code for the other single-character types. */

    case OP_PROP:
    case OP_NOTPROP:
    cc += 2;
    /* Fall through */

    case OP_HSPACE:
    case OP_VSPACE:
    case OP_NOT_HSPACE:
    case OP_NOT_VSPACE:
    case OP_NOT_DIGIT:
    case OP_DIGIT:
    case OP_NOT_WHITESPACE:
    case OP_WHITESPACE:
    case OP_NOT_WORDCHAR:
    case OP_WORDCHAR:
    case OP_ANY:
    case OP_ALLANY:
    branchlength++;
    cc++;
    break;

    /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
    otherwise \C is coded as OP_ALLANY. */

    case OP_ANYBYTE:
    return -2;

    /* Check a class for variable quantification. First step over the class
    itself, then inspect whatever follows it. */

    case OP_CLASS:
    case OP_NCLASS:
#if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
    case OP_XCLASS:
    /* The original code caused an unsigned overflow in 64 bit systems,
    so now we use a conditional statement. */
    if (op == OP_XCLASS)
      cc += GET(cc, 1);
    else
      cc += PRIV(OP_lengths)[OP_CLASS];
#else
    cc += PRIV(OP_lengths)[OP_CLASS];
#endif

    switch (*cc)
      {
      /* Any of these repeats makes the class variable length. */

      case OP_CRSTAR:
      case OP_CRMINSTAR:
      case OP_CRPLUS:
      case OP_CRMINPLUS:
      case OP_CRQUERY:
      case OP_CRMINQUERY:
      case OP_CRPOSSTAR:
      case OP_CRPOSPLUS:
      case OP_CRPOSQUERY:
      return -1;

      /* A range repeat is fixed only when its two counts are equal. */

      case OP_CRRANGE:
      case OP_CRMINRANGE:
      case OP_CRPOSRANGE:
      if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
      branchlength += (int)GET2(cc,1);
      cc += 1 + 2 * IMM2_SIZE;
      break;

      /* No repeat follows: the class matches exactly one character. The
      following opcode is left for the next iteration of the main loop. */

      default:
      branchlength++;
      }
    break;

    /* Anything else is variable length */

    case OP_ANYNL:
    case OP_BRAMINZERO:
    case OP_BRAPOS:
    case OP_BRAPOSZERO:
    case OP_BRAZERO:
    case OP_CBRAPOS:
    case OP_EXTUNI:
    case OP_KETRMAX:
    case OP_KETRMIN:
    case OP_KETRPOS:
    case OP_MINPLUS:
    case OP_MINPLUSI:
    case OP_MINQUERY:
    case OP_MINQUERYI:
    case OP_MINSTAR:
    case OP_MINSTARI:
    case OP_MINUPTO:
    case OP_MINUPTOI:
    case OP_NOTMINPLUS:
    case OP_NOTMINPLUSI:
    case OP_NOTMINQUERY:
    case OP_NOTMINQUERYI:
    case OP_NOTMINSTAR:
    case OP_NOTMINSTARI:
    case OP_NOTMINUPTO:
    case OP_NOTMINUPTOI:
    case OP_NOTPLUS:
    case OP_NOTPLUSI:
    case OP_NOTPOSPLUS:
    case OP_NOTPOSPLUSI:
    case OP_NOTPOSQUERY:
    case OP_NOTPOSQUERYI:
    case OP_NOTPOSSTAR:
    case OP_NOTPOSSTARI:
    case OP_NOTPOSUPTO:
    case OP_NOTPOSUPTOI:
    case OP_NOTQUERY:
    case OP_NOTQUERYI:
    case OP_NOTSTAR:
    case OP_NOTSTARI:
    case OP_NOTUPTO:
    case OP_NOTUPTOI:
    case OP_PLUS:
    case OP_PLUSI:
    case OP_POSPLUS:
    case OP_POSPLUSI:
    case OP_POSQUERY:
    case OP_POSQUERYI:
    case OP_POSSTAR:
    case OP_POSSTARI:
    case OP_POSUPTO:
    case OP_POSUPTOI:
    case OP_QUERY:
    case OP_QUERYI:
    case OP_REF:
    case OP_REFI:
    case OP_DNREF:
    case OP_DNREFI:
    case OP_SBRA:
    case OP_SBRAPOS:
    case OP_SCBRA:
    case OP_SCBRAPOS:
    case OP_SCOND:
    case OP_SKIPZERO:
    case OP_STAR:
    case OP_STARI:
    case OP_TYPEMINPLUS:
    case OP_TYPEMINQUERY:
    case OP_TYPEMINSTAR:
    case OP_TYPEMINUPTO:
    case OP_TYPEPLUS:
    case OP_TYPEPOSPLUS:
    case OP_TYPEPOSQUERY:
    case OP_TYPEPOSSTAR:
    case OP_TYPEPOSUPTO:
    case OP_TYPEQUERY:
    case OP_TYPESTAR:
    case OP_TYPEUPTO:
    case OP_UPTO:
    case OP_UPTOI:
    return -1;

    /* Catch unrecognized opcodes so that when new ones are added they
    are not forgotten, as has happened in the past. */

    default:
    return -4;
    }
  }
/* Control never gets here */
}
2061
2062
2063
2064/*************************************************
2065* Scan compiled regex for specific bracket *
2066*************************************************/
2067
2068/* This little function scans through a compiled pattern until it finds a
2069capturing bracket with the given number, or, if the number is negative, an
2070instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2071so that it can be called from pcre_study() when finding the minimum matching
2072length.
2073
2074Arguments:
2075 code points to start of expression
2076 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2077 number the required bracket number or negative to find a lookbehind
2078
2079Returns: pointer to the opcode for the bracket, or NULL if not found
2080*/
2081
2082const pcre_uchar *
2083PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2084{
2085for (;;)
2086 {
2087 register pcre_uchar c = *code;
2088
2089 if (c == OP_END) return NULL;
2090
2091 /* XCLASS is used for classes that cannot be represented just by a bit
2092 map. This includes negated single high-valued characters. The length in
2093 the table is zero; the actual length is stored in the compiled code. */
2094
2095 if (c == OP_XCLASS) code += GET(code, 1);
2096
2097 /* Handle recursion */
2098
2099 else if (c == OP_REVERSE)
2100 {
2101 if (number < 0) return (pcre_uchar *)code;
2102 code += PRIV(OP_lengths)[c];
2103 }
2104
2105 /* Handle capturing bracket */
2106
2107 else if (c == OP_CBRA || c == OP_SCBRA ||
2108 c == OP_CBRAPOS || c == OP_SCBRAPOS)
2109 {
2110 int n = (int)GET2(code, 1+LINK_SIZE);
2111 if (n == number) return (pcre_uchar *)code;
2112 code += PRIV(OP_lengths)[c];
2113 }
2114
2115 /* Otherwise, we can get the item's length from the table, except that for
2116 repeated character types, we have to test for \p and \P, which have an extra
2117 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2118 must add in its length. */
2119
2120 else
2121 {
2122 switch(c)
2123 {
2124 case OP_TYPESTAR:
2125 case OP_TYPEMINSTAR:
2126 case OP_TYPEPLUS:
2127 case OP_TYPEMINPLUS:
2128 case OP_TYPEQUERY:
2129 case OP_TYPEMINQUERY:
2130 case OP_TYPEPOSSTAR:
2131 case OP_TYPEPOSPLUS:
2132 case OP_TYPEPOSQUERY:
2133 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2134 break;
2135
2136 case OP_TYPEUPTO:
2137 case OP_TYPEMINUPTO:
2138 case OP_TYPEEXACT:
2139 case OP_TYPEPOSUPTO:
2140 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2141 code += 2;
2142 break;
2143
2144 case OP_MARK:
2145 case OP_PRUNE_ARG:
2146 case OP_SKIP_ARG:
2147 case OP_THEN_ARG:
2148 code += code[1];
2149 break;
2150 }
2151
2152 /* Add in the fixed length from the table */
2153
2154 code += PRIV(OP_lengths)[c];
2155
2156 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2157 a multi-byte character. The length in the table is a minimum, so we have to
2158 arrange to skip the extra bytes. */
2159
2160#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2161 if (utf) switch(c)
2162 {
2163 case OP_CHAR:
2164 case OP_CHARI:
2165 case OP_NOT:
2166 case OP_NOTI:
2167 case OP_EXACT:
2168 case OP_EXACTI:
2169 case OP_NOTEXACT:
2170 case OP_NOTEXACTI:
2171 case OP_UPTO:
2172 case OP_UPTOI:
2173 case OP_NOTUPTO:
2174 case OP_NOTUPTOI:
2175 case OP_MINUPTO:
2176 case OP_MINUPTOI:
2177 case OP_NOTMINUPTO:
2178 case OP_NOTMINUPTOI:
2179 case OP_POSUPTO:
2180 case OP_POSUPTOI:
2181 case OP_NOTPOSUPTO:
2182 case OP_NOTPOSUPTOI:
2183 case OP_STAR:
2184 case OP_STARI:
2185 case OP_NOTSTAR:
2186 case OP_NOTSTARI:
2187 case OP_MINSTAR:
2188 case OP_MINSTARI:
2189 case OP_NOTMINSTAR:
2190 case OP_NOTMINSTARI:
2191 case OP_POSSTAR:
2192 case OP_POSSTARI:
2193 case OP_NOTPOSSTAR:
2194 case OP_NOTPOSSTARI:
2195 case OP_PLUS:
2196 case OP_PLUSI:
2197 case OP_NOTPLUS:
2198 case OP_NOTPLUSI:
2199 case OP_MINPLUS:
2200 case OP_MINPLUSI:
2201 case OP_NOTMINPLUS:
2202 case OP_NOTMINPLUSI:
2203 case OP_POSPLUS:
2204 case OP_POSPLUSI:
2205 case OP_NOTPOSPLUS:
2206 case OP_NOTPOSPLUSI:
2207 case OP_QUERY:
2208 case OP_QUERYI:
2209 case OP_NOTQUERY:
2210 case OP_NOTQUERYI:
2211 case OP_MINQUERY:
2212 case OP_MINQUERYI:
2213 case OP_NOTMINQUERY:
2214 case OP_NOTMINQUERYI:
2215 case OP_POSQUERY:
2216 case OP_POSQUERYI:
2217 case OP_NOTPOSQUERY:
2218 case OP_NOTPOSQUERYI:
2219 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2220 break;
2221 }
2222#else
2223 (void)(utf); /* Keep compiler happy by referencing function argument */
2224#endif
2225 }
2226 }
2227}
2228
2229
2230
2231/*************************************************
2232* Scan compiled regex for recursion reference *
2233*************************************************/
2234
2235/* This little function scans through a compiled pattern until it finds an
2236instance of OP_RECURSE.
2237
2238Arguments:
2239 code points to start of expression
2240 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2241
2242Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
2243*/
2244
2245static const pcre_uchar *
2246find_recurse(const pcre_uchar *code, BOOL utf)
2247{
2248for (;;)
2249 {
2250 register pcre_uchar c = *code;
2251 if (c == OP_END) return NULL;
2252 if (c == OP_RECURSE) return code;
2253
2254 /* XCLASS is used for classes that cannot be represented just by a bit
2255 map. This includes negated single high-valued characters. The length in
2256 the table is zero; the actual length is stored in the compiled code. */
2257
2258 if (c == OP_XCLASS) code += GET(code, 1);
2259
2260 /* Otherwise, we can get the item's length from the table, except that for
2261 repeated character types, we have to test for \p and \P, which have an extra
2262 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2263 must add in its length. */
2264
2265 else
2266 {
2267 switch(c)
2268 {
2269 case OP_TYPESTAR:
2270 case OP_TYPEMINSTAR:
2271 case OP_TYPEPLUS:
2272 case OP_TYPEMINPLUS:
2273 case OP_TYPEQUERY:
2274 case OP_TYPEMINQUERY:
2275 case OP_TYPEPOSSTAR:
2276 case OP_TYPEPOSPLUS:
2277 case OP_TYPEPOSQUERY:
2278 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2279 break;
2280
2281 case OP_TYPEPOSUPTO:
2282 case OP_TYPEUPTO:
2283 case OP_TYPEMINUPTO:
2284 case OP_TYPEEXACT:
2285 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2286 code += 2;
2287 break;
2288
2289 case OP_MARK:
2290 case OP_PRUNE_ARG:
2291 case OP_SKIP_ARG:
2292 case OP_THEN_ARG:
2293 code += code[1];
2294 break;
2295 }
2296
2297 /* Add in the fixed length from the table */
2298
2299 code += PRIV(OP_lengths)[c];
2300
2301 /* In UTF-8 mode, opcodes that are followed by a character may be followed
2302 by a multi-byte character. The length in the table is a minimum, so we have
2303 to arrange to skip the extra bytes. */
2304
2305#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2306 if (utf) switch(c)
2307 {
2308 case OP_CHAR:
2309 case OP_CHARI:
2310 case OP_NOT:
2311 case OP_NOTI:
2312 case OP_EXACT:
2313 case OP_EXACTI:
2314 case OP_NOTEXACT:
2315 case OP_NOTEXACTI:
2316 case OP_UPTO:
2317 case OP_UPTOI:
2318 case OP_NOTUPTO:
2319 case OP_NOTUPTOI:
2320 case OP_MINUPTO:
2321 case OP_MINUPTOI:
2322 case OP_NOTMINUPTO:
2323 case OP_NOTMINUPTOI:
2324 case OP_POSUPTO:
2325 case OP_POSUPTOI:
2326 case OP_NOTPOSUPTO:
2327 case OP_NOTPOSUPTOI:
2328 case OP_STAR:
2329 case OP_STARI:
2330 case OP_NOTSTAR:
2331 case OP_NOTSTARI:
2332 case OP_MINSTAR:
2333 case OP_MINSTARI:
2334 case OP_NOTMINSTAR:
2335 case OP_NOTMINSTARI:
2336 case OP_POSSTAR:
2337 case OP_POSSTARI:
2338 case OP_NOTPOSSTAR:
2339 case OP_NOTPOSSTARI:
2340 case OP_PLUS:
2341 case OP_PLUSI:
2342 case OP_NOTPLUS:
2343 case OP_NOTPLUSI:
2344 case OP_MINPLUS:
2345 case OP_MINPLUSI:
2346 case OP_NOTMINPLUS:
2347 case OP_NOTMINPLUSI:
2348 case OP_POSPLUS:
2349 case OP_POSPLUSI:
2350 case OP_NOTPOSPLUS:
2351 case OP_NOTPOSPLUSI:
2352 case OP_QUERY:
2353 case OP_QUERYI:
2354 case OP_NOTQUERY:
2355 case OP_NOTQUERYI:
2356 case OP_MINQUERY:
2357 case OP_MINQUERYI:
2358 case OP_NOTMINQUERY:
2359 case OP_NOTMINQUERYI:
2360 case OP_POSQUERY:
2361 case OP_POSQUERYI:
2362 case OP_NOTPOSQUERY:
2363 case OP_NOTPOSQUERYI:
2364 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2365 break;
2366 }
2367#else
2368 (void)(utf); /* Keep compiler happy by referencing function argument */
2369#endif
2370 }
2371 }
2372}
2373
2374
2375
/*************************************************
*    Scan compiled branch for non-emptiness      *
*************************************************/

/* This function scans through a branch of a compiled pattern to see whether it
can match the empty string or not. It is called from could_be_empty()
below and from compile_branch() when checking for an unlimited repeat of a
group that can match nothing. Note that first_significant_code() skips over
backward and negative forward assertions when its final argument is TRUE. If we
hit an unclosed bracket, we return "empty" - this means we've struck an inner
bracket whose current branch will already have been scanned.

The main loop steps from item to item by adding the table length of the opcode
held in "c" to "code". Code paths that move "code" to the end of a group
therefore reload "c" before using "continue", so that the step is taken over
the item that "code" then points at.

Arguments:
  code        points to start of search
  endcode     points to where to stop
  utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
  cd          contains pointers to tables etc.
  recurses    chain of recurse_check to catch mutual recursion

Returns:      TRUE if what is matched could be empty
*/

static BOOL
could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
  BOOL utf, compile_data *cd, recurse_check *recurses)
{
register pcre_uchar c;            /* Opcode of the item being examined */
recurse_check this_recurse;       /* Chain entry used for subroutine calls */

/* The first item (the opening bracket or OP_ALT) is stepped over before the
scan starts. */

for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
     code < endcode;
     code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
  {
  const pcre_uchar *ccode;

  c = *code;

  /* Skip over forward assertions; the other assertions are skipped by
  first_significant_code() with a TRUE final argument. */

  if (c == OP_ASSERT)
    {
    do code += GET(code, 1); while (*code == OP_ALT);
    c = *code;
    continue;
    }

  /* For a recursion/subroutine call, if its end has been reached, which
  implies a backward reference subroutine call, we can scan it. If it's a
  forward reference subroutine call, we can't. To detect forward reference
  we have to scan up the list that is kept in the workspace. This function is
  called only when doing the real compile, not during the pre-compile that
  measures the size of the compiled pattern. */

  if (c == OP_RECURSE)
    {
    const pcre_uchar *scode = cd->start_code + GET(code, 1);
    const pcre_uchar *endgroup = scode;
    BOOL empty_branch;

    /* Test for forward reference or uncompleted reference. This is disabled
    when called to scan a completed pattern by setting cd->start_workspace to
    NULL. */

    if (cd->start_workspace != NULL)
      {
      const pcre_uchar *tcode;
      for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
        if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
      if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
      }

    /* If the reference is to a completed group, we need to detect whether this
    is a recursive call, as otherwise there will be an infinite loop. If it is
    a recursion, just skip over it. Simple recursions are easily detected. For
    mutual recursions we keep a chain on the stack. */

    do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
    if (code >= scode && code <= endgroup) continue; /* Simple recursion */
    else
      {
      recurse_check *r = recurses;
      for (r = recurses; r != NULL; r = r->prev)
        if (r->group == scode) break;
      if (r != NULL) continue; /* Mutual recursion */
      }

    /* Completed reference; scan the referenced group, remembering it on the
    stack chain to detect mutual recursions. The scan stops at the first
    alternative that could be empty. */

    empty_branch = FALSE;
    this_recurse.prev = recurses;
    this_recurse.group = scode;

    do
      {
      if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
        {
        empty_branch = TRUE;
        break;
        }
      scode += GET(scode, 1);
      }
    while (*scode == OP_ALT);

    if (!empty_branch) return FALSE; /* All branches are non-empty */
    continue;
    }

  /* Groups with zero repeats can of course be empty; skip them. The prefix
  opcode is stepped over first, then the whole group that follows it. */

  if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
      c == OP_BRAPOSZERO)
    {
    code += PRIV(OP_lengths)[c];
    do code += GET(code, 1); while (*code == OP_ALT);
    c = *code;
    continue;
    }

  /* A nested group that is already marked as "could be empty" can just be
  skipped. */

  if (c == OP_SBRA || c == OP_SBRAPOS ||
      c == OP_SCBRA || c == OP_SCBRAPOS)
    {
    do code += GET(code, 1); while (*code == OP_ALT);
    c = *code;
    continue;
    }

  /* For other groups, scan the branches. */

  if (c == OP_BRA || c == OP_BRAPOS ||
      c == OP_CBRA || c == OP_CBRAPOS ||
      c == OP_ONCE || c == OP_ONCE_NC ||
      c == OP_COND || c == OP_SCOND)
    {
    BOOL empty_branch;
    if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */

    /* If a conditional group has only one branch, there is a second, implied,
    empty branch, so just skip over the conditional, because it could be empty.
    Otherwise, scan the individual branches of the group. */

    if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
      code += GET(code, 1);
    else
      {
      /* Every alternative is walked so that "code" ends up at the end of the
      group, but the recursive check is no longer made once one alternative
      has been found that could be empty. */

      empty_branch = FALSE;
      do
        {
        if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd,
          recurses)) empty_branch = TRUE;
        code += GET(code, 1);
        }
      while (*code == OP_ALT);
      if (!empty_branch) return FALSE; /* All branches are non-empty */
      }

    c = *code;
    continue;
    }

  /* Handle the other opcodes */

  switch (c)
    {
    /* Check for quantifiers after a class. XCLASS is used for classes that
    cannot be represented just by a bit map. This includes negated single
    high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
    actual length is stored in the compiled code, so we must update "code"
    here. */

#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
    case OP_XCLASS:
    ccode = code += GET(code, 1);
    goto CHECK_CLASS_REPEAT;
#endif

    /* For the bit-map classes only "ccode" is moved past the class; "code"
    is advanced by the loop's own step. */

    case OP_CLASS:
    case OP_NCLASS:
    ccode = code + PRIV(OP_lengths)[OP_CLASS];

#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
    CHECK_CLASS_REPEAT:
#endif

    switch (*ccode)
      {
      case OP_CRSTAR: /* These could be empty; continue */
      case OP_CRMINSTAR:
      case OP_CRQUERY:
      case OP_CRMINQUERY:
      case OP_CRPOSSTAR:
      case OP_CRPOSQUERY:
      break;

      default: /* Non-repeat => class must match */
      case OP_CRPLUS: /* These repeats aren't empty */
      case OP_CRMINPLUS:
      case OP_CRPOSPLUS:
      return FALSE;

      case OP_CRRANGE:
      case OP_CRMINRANGE:
      case OP_CRPOSRANGE:
      if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
      break;
      }
    break;

    /* Opcodes that must match a character */

    case OP_ANY:
    case OP_ALLANY:
    case OP_ANYBYTE:

    case OP_PROP:
    case OP_NOTPROP:
    case OP_ANYNL:

    case OP_NOT_HSPACE:
    case OP_HSPACE:
    case OP_NOT_VSPACE:
    case OP_VSPACE:
    case OP_EXTUNI:

    case OP_NOT_DIGIT:
    case OP_DIGIT:
    case OP_NOT_WHITESPACE:
    case OP_WHITESPACE:
    case OP_NOT_WORDCHAR:
    case OP_WORDCHAR:

    case OP_CHAR:
    case OP_CHARI:
    case OP_NOT:
    case OP_NOTI:

    case OP_PLUS:
    case OP_PLUSI:
    case OP_MINPLUS:
    case OP_MINPLUSI:

    case OP_NOTPLUS:
    case OP_NOTPLUSI:
    case OP_NOTMINPLUS:
    case OP_NOTMINPLUSI:

    case OP_POSPLUS:
    case OP_POSPLUSI:
    case OP_NOTPOSPLUS:
    case OP_NOTPOSPLUSI:

    case OP_EXACT:
    case OP_EXACTI:
    case OP_NOTEXACT:
    case OP_NOTEXACTI:

    case OP_TYPEPLUS:
    case OP_TYPEMINPLUS:
    case OP_TYPEPOSPLUS:
    case OP_TYPEEXACT:

    return FALSE;

    /* These are going to continue, as they may be empty, but we have to
    fudge the length for the \p and \P cases. */

    case OP_TYPESTAR:
    case OP_TYPEMINSTAR:
    case OP_TYPEPOSSTAR:
    case OP_TYPEQUERY:
    case OP_TYPEMINQUERY:
    case OP_TYPEPOSQUERY:
    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
    break;

    /* Same for these */

    case OP_TYPEUPTO:
    case OP_TYPEMINUPTO:
    case OP_TYPEPOSUPTO:
    if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
      code += 2;
    break;

    /* End of branch */

    case OP_KET:
    case OP_KETRMAX:
    case OP_KETRMIN:
    case OP_KETRPOS:
    case OP_ALT:
    return TRUE;

    /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
    MINUPTO, and POSUPTO and their caseless and negative versions may be
    followed by a multibyte character. */

#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
    case OP_STAR:
    case OP_STARI:
    case OP_NOTSTAR:
    case OP_NOTSTARI:

    case OP_MINSTAR:
    case OP_MINSTARI:
    case OP_NOTMINSTAR:
    case OP_NOTMINSTARI:

    case OP_POSSTAR:
    case OP_POSSTARI:
    case OP_NOTPOSSTAR:
    case OP_NOTPOSSTARI:

    case OP_QUERY:
    case OP_QUERYI:
    case OP_NOTQUERY:
    case OP_NOTQUERYI:

    case OP_MINQUERY:
    case OP_MINQUERYI:
    case OP_NOTMINQUERY:
    case OP_NOTMINQUERYI:

    case OP_POSQUERY:
    case OP_POSQUERYI:
    case OP_NOTPOSQUERY:
    case OP_NOTPOSQUERYI:

    if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
    break;

    /* For the UPTO forms the character follows the repeat count. */

    case OP_UPTO:
    case OP_UPTOI:
    case OP_NOTUPTO:
    case OP_NOTUPTOI:

    case OP_MINUPTO:
    case OP_MINUPTOI:
    case OP_NOTMINUPTO:
    case OP_NOTMINUPTOI:

    case OP_POSUPTO:
    case OP_POSUPTOI:
    case OP_NOTPOSUPTO:
    case OP_NOTPOSUPTOI:

    if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
    break;
#endif

    /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
    string. */

    case OP_MARK:
    case OP_PRUNE_ARG:
    case OP_SKIP_ARG:
    case OP_THEN_ARG:
    code += code[1];
    break;

    /* None of the remaining opcodes are required to match a character. */

    default:
    break;
    }
  }

/* Reached endcode without meeting anything that must match a character. */

return TRUE;
}
2749
2750
2751
2752/*************************************************
2753* Scan compiled regex for non-emptiness *
2754*************************************************/
2755
2756/* This function is called to check for left recursive calls. We want to check
2757the current branch of the current pattern to see if it could match the empty
2758string. If it could, we must look outwards for branches at other levels,
2759stopping when we pass beyond the bracket which is the subject of the recursion.
2760This function is called only during the real compile, not during the
2761pre-compile.
2762
2763Arguments:
2764 code points to start of the recursion
2765 endcode points to where to stop (current RECURSE item)
2766 bcptr points to the chain of current (unclosed) branch starts
2767 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2768 cd pointers to tables etc
2769
2770Returns: TRUE if what is matched could be empty
2771*/
2772
2773static BOOL
2774could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2775 branch_chain *bcptr, BOOL utf, compile_data *cd)
2776{
2777while (bcptr != NULL && bcptr->current_branch >= code)
2778 {
2779 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2780 return FALSE;
2781 bcptr = bcptr->outer;
2782 }
2783return TRUE;
2784}
2785
2786
2787
2788/*************************************************
2789* Base opcode of repeated opcodes *
2790*************************************************/
2791
2792/* Returns the base opcode for repeated single character type opcodes. If the
2793opcode is not a repeated character type, it returns with the original value.
2794
2795Arguments: c opcode
2796Returns: base opcode for the type
2797*/
2798
2799static pcre_uchar
2800get_repeat_base(pcre_uchar c)
2801{
2802return (c > OP_TYPEPOSUPTO)? c :
2803 (c >= OP_TYPESTAR)? OP_TYPESTAR :
2804 (c >= OP_NOTSTARI)? OP_NOTSTARI :
2805 (c >= OP_NOTSTAR)? OP_NOTSTAR :
2806 (c >= OP_STARI)? OP_STARI :
2807 OP_STAR;
2808}
2809
2810
2811
2812#ifdef SUPPORT_UCP
2813/*************************************************
2814* Check a character and a property *
2815*************************************************/
2816
2817/* This function is called by check_auto_possessive() when a property item
2818is adjacent to a fixed character.
2819
2820Arguments:
2821 c the character
2822 ptype the property type
2823 pdata the data for the type
2824 negated TRUE if it's a negated property (\P or \p{^)
2825
2826Returns: TRUE if auto-possessifying is OK
2827*/
2828
2829static BOOL
2830check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
2831 BOOL negated)
2832{
2833const pcre_uint32 *p;
2834const ucd_record *prop = GET_UCD(c);
2835
2836switch(ptype)
2837 {
2838 case PT_LAMP:
2839 return (prop->chartype == ucp_Lu ||
2840 prop->chartype == ucp_Ll ||
2841 prop->chartype == ucp_Lt) == negated;
2842
2843 case PT_GC:
2844 return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2845
2846 case PT_PC:
2847 return (pdata == prop->chartype) == negated;
2848
2849 case PT_SC:
2850 return (pdata == prop->script) == negated;
2851
2852 /* These are specials */
2853
2854 case PT_ALNUM:
2855 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2856 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2857
2858 /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2859 means that Perl space and POSIX space are now identical. PCRE was changed
2860 at release 8.34. */
2861
2862 case PT_SPACE: /* Perl space */
2863 case PT_PXSPACE: /* POSIX space */
2864 switch(c)
2865 {
2866 HSPACE_CASES:
2867 VSPACE_CASES:
2868 return negated;
2869
2870 default:
2871 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
2872 }
2873 break; /* Control never reaches here */
2874
2875 case PT_WORD:
2876 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2877 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2878 c == CHAR_UNDERSCORE) == negated;
2879
2880 case PT_CLIST:
2881 p = PRIV(ucd_caseless_sets) + prop->caseset;
2882 for (;;)
2883 {
2884 if (c < *p) return !negated;
2885 if (c == *p++) return negated;
2886 }
2887 break; /* Control never reaches here */
2888 }
2889
2890return FALSE;
2891}
2892#endif /* SUPPORT_UCP */
2893
2894
2895
2896/*************************************************
2897* Fill the character property list *
2898*************************************************/
2899
2900/* Checks whether the code points to an opcode that can take part in auto-
2901possessification, and if so, fills a list with its properties.
2902
2903Arguments:
2904 code points to start of expression
2905 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2906 fcc points to case-flipping table
2907 list points to output list
2908 list[0] will be filled with the opcode
2909 list[1] will be non-zero if this opcode
2910 can match an empty character string
2911 list[2..7] depends on the opcode
2912
2913Returns: points to the start of the next opcode if *code is accepted
2914 NULL if *code is not accepted
2915*/
2916
2917static const pcre_uchar *
2918get_chr_property_list(const pcre_uchar *code, BOOL utf,
2919 const pcre_uint8 *fcc, pcre_uint32 *list)
2920{
2921pcre_uchar c = *code;
2922pcre_uchar base;
2923const pcre_uchar *end;
2924pcre_uint32 chr;
2925
2926#ifdef SUPPORT_UCP
2927pcre_uint32 *clist_dest;
2928const pcre_uint32 *clist_src;
2929#else
2930utf = utf; /* Suppress "unused parameter" compiler warning */
2931#endif
2932
2933list[0] = c;
2934list[1] = FALSE;
2935code++;
2936
2937if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2938 {
2939 base = get_repeat_base(c);
2940 c -= (base - OP_STAR);
2941
2942 if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2943 code += IMM2_SIZE;
2944
2945 list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2946
2947 switch(base)
2948 {
2949 case OP_STAR:
2950 list[0] = OP_CHAR;
2951 break;
2952
2953 case OP_STARI:
2954 list[0] = OP_CHARI;
2955 break;
2956
2957 case OP_NOTSTAR:
2958 list[0] = OP_NOT;
2959 break;
2960
2961 case OP_NOTSTARI:
2962 list[0] = OP_NOTI;
2963 break;
2964
2965 case OP_TYPESTAR:
2966 list[0] = *code;
2967 code++;
2968 break;
2969 }
2970 c = list[0];
2971 }
2972
2973switch(c)
2974 {
2975 case OP_NOT_DIGIT:
2976 case OP_DIGIT:
2977 case OP_NOT_WHITESPACE:
2978 case OP_WHITESPACE:
2979 case OP_NOT_WORDCHAR:
2980 case OP_WORDCHAR:
2981 case OP_ANY:
2982 case OP_ALLANY:
2983 case OP_ANYNL:
2984 case OP_NOT_HSPACE:
2985 case OP_HSPACE:
2986 case OP_NOT_VSPACE:
2987 case OP_VSPACE:
2988 case OP_EXTUNI:
2989 case OP_EODN:
2990 case OP_EOD:
2991 case OP_DOLL:
2992 case OP_DOLLM:
2993 return code;
2994
2995 case OP_CHAR:
2996 case OP_NOT:
2997 GETCHARINCTEST(chr, code);
2998 list[2] = chr;
2999 list[3] = NOTACHAR;
3000 return code;
3001
3002 case OP_CHARI:
3003 case OP_NOTI:
3004 list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
3005 GETCHARINCTEST(chr, code);
3006 list[2] = chr;
3007
3008#ifdef SUPPORT_UCP
3009 if (chr < 128 || (chr < 256 && !utf))
3010 list[3] = fcc[chr];
3011 else
3012 list[3] = UCD_OTHERCASE(chr);
3013#elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
3014 list[3] = (chr < 256) ? fcc[chr] : chr;
3015#else
3016 list[3] = fcc[chr];
3017#endif
3018
3019 /* The othercase might be the same value. */
3020
3021 if (chr == list[3])
3022 list[3] = NOTACHAR;
3023 else
3024 list[4] = NOTACHAR;
3025 return code;
3026
3027#ifdef SUPPORT_UCP
3028 case OP_PROP:
3029 case OP_NOTPROP:
3030 if (code[0] != PT_CLIST)
3031 {
3032 list[2] = code[0];
3033 list[3] = code[1];
3034 return code + 2;
3035 }
3036
3037 /* Convert only if we have enough space. */
3038
3039 clist_src = PRIV(ucd_caseless_sets) + code[1];
3040 clist_dest = list + 2;
3041 code += 2;
3042
3043 do {
3044 if (clist_dest >= list + 8)
3045 {
3046 /* Early return if there is not enough space. This should never
3047 happen, since all clists are shorter than 5 character now. */
3048 list[2] = code[0];
3049 list[3] = code[1];
3050 return code;
3051 }
3052 *clist_dest++ = *clist_src;
3053 }
3054 while(*clist_src++ != NOTACHAR);
3055
3056 /* All characters are stored. The terminating NOTACHAR
3057 is copied form the clist itself. */
3058
3059 list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
3060 return code;
3061#endif
3062
3063 case OP_NCLASS:
3064 case OP_CLASS:
3065#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3066 case OP_XCLASS:
3067 if (c == OP_XCLASS)
3068 end = code + GET(code, 0) - 1;
3069 else
3070#endif
3071 end = code + 32 / sizeof(pcre_uchar);
3072
3073 switch(*end)
3074 {
3075 case OP_CRSTAR:
3076 case OP_CRMINSTAR:
3077 case OP_CRQUERY:
3078 case OP_CRMINQUERY:
3079 case OP_CRPOSSTAR:
3080 case OP_CRPOSQUERY:
3081 list[1] = TRUE;
3082 end++;
3083 break;
3084
3085 case OP_CRPLUS:
3086 case OP_CRMINPLUS:
3087 case OP_CRPOSPLUS:
3088 end++;
3089 break;
3090
3091 case OP_CRRANGE:
3092 case OP_CRMINRANGE:
3093 case OP_CRPOSRANGE:
3094 list[1] = (GET2(end, 1) == 0);
3095 end += 1 + 2 * IMM2_SIZE;
3096 break;
3097 }
3098 list[2] = (pcre_uint32)(end - code);
3099 return end;
3100 }
3101return NULL; /* Opcode not accepted */
3102}
3103
3104
3105
3106/*************************************************
3107* Scan further character sets for match *
3108*************************************************/
3109
3110/* Checks whether the base and the current opcode have a common character, in
3111which case the base cannot be possessified.
3112
3113Arguments:
3114 code points to the byte code
3115 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3116 cd static compile data
3117 base_list the data list of the base opcode
3118
3119Returns: TRUE if the auto-possessification is possible
3120*/
3121
3122static BOOL
3123compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3124 const pcre_uint32 *base_list, const pcre_uchar *base_end, int *rec_limit)
3125{
3126pcre_uchar c;
3127pcre_uint32 list[8];
3128const pcre_uint32 *chr_ptr;
3129const pcre_uint32 *ochr_ptr;
3130const pcre_uint32 *list_ptr;
3131const pcre_uchar *next_code;
3132#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3133const pcre_uchar *xclass_flags;
3134#endif
3135const pcre_uint8 *class_bitset;
3136const pcre_uint8 *set1, *set2, *set_end;
3137pcre_uint32 chr;
3138BOOL accepted, invert_bits;
3139BOOL entered_a_group = FALSE;
3140
3141if (*rec_limit == 0) return FALSE;
3142--(*rec_limit);
3143
3144/* Note: the base_list[1] contains whether the current opcode has greedy
3145(represented by a non-zero value) quantifier. This is a different from
3146other character type lists, which stores here that the character iterator
3147matches to an empty string (also represented by a non-zero value). */
3148
3149for(;;)
3150 {
3151 /* All operations move the code pointer forward.
3152 Therefore infinite recursions are not possible. */
3153
3154 c = *code;
3155
3156 /* Skip over callouts */
3157
3158 if (c == OP_CALLOUT)
3159 {
3160 code += PRIV(OP_lengths)[c];
3161 continue;
3162 }
3163
3164 if (c == OP_ALT)
3165 {
3166 do code += GET(code, 1); while (*code == OP_ALT);
3167 c = *code;
3168 }
3169
3170 switch(c)
3171 {
3172 case OP_END:
3173 case OP_KETRPOS:
3174 /* TRUE only in greedy case. The non-greedy case could be replaced by
3175 an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3176 uses more memory, which we cannot get at this stage.) */
3177
3178 return base_list[1] != 0;
3179
3180 case OP_KET:
3181 /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3182 it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3183 cannot be converted to a possessive form. */
3184
3185 if (base_list[1] == 0) return FALSE;
3186
3187 switch(*(code - GET(code, 1)))
3188 {
3189 case OP_ASSERT:
3190 case OP_ASSERT_NOT:
3191 case OP_ASSERTBACK:
3192 case OP_ASSERTBACK_NOT:
3193 case OP_ONCE:
3194 case OP_ONCE_NC:
3195 /* Atomic sub-patterns and assertions can always auto-possessify their
3196 last iterator. However, if the group was entered as a result of checking
3197 a previous iterator, this is not possible. */
3198
3199 return !entered_a_group;
3200 }
3201
3202 code += PRIV(OP_lengths)[c];
3203 continue;
3204
3205 case OP_ONCE:
3206 case OP_ONCE_NC:
3207 case OP_BRA:
3208 case OP_CBRA:
3209 next_code = code + GET(code, 1);
3210 code += PRIV(OP_lengths)[c];
3211
3212 while (*next_code == OP_ALT)
3213 {
3214 if (!compare_opcodes(code, utf, cd, base_list, base_end, rec_limit))
3215 return FALSE;
3216 code = next_code + 1 + LINK_SIZE;
3217 next_code += GET(next_code, 1);
3218 }
3219
3220 entered_a_group = TRUE;
3221 continue;
3222
3223 case OP_BRAZERO:
3224 case OP_BRAMINZERO:
3225
3226 next_code = code + 1;
3227 if (*next_code != OP_BRA && *next_code != OP_CBRA
3228 && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3229
3230 do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3231
3232 /* The bracket content will be checked by the
3233 OP_BRA/OP_CBRA case above. */
3234 next_code += 1 + LINK_SIZE;
3235 if (!compare_opcodes(next_code, utf, cd, base_list, base_end, rec_limit))
3236 return FALSE;
3237
3238 code += PRIV(OP_lengths)[c];
3239 continue;
3240
3241 default:
3242 break;
3243 }
3244
3245 /* Check for a supported opcode, and load its properties. */
3246
3247 code = get_chr_property_list(code, utf, cd->fcc, list);
3248 if (code == NULL) return FALSE; /* Unsupported */
3249
3250 /* If either opcode is a small character list, set pointers for comparing
3251 characters from that list with another list, or with a property. */
3252
3253 if (base_list[0] == OP_CHAR)
3254 {
3255 chr_ptr = base_list + 2;
3256 list_ptr = list;
3257 }
3258 else if (list[0] == OP_CHAR)
3259 {
3260 chr_ptr = list + 2;
3261 list_ptr = base_list;
3262 }
3263
3264 /* Character bitsets can also be compared to certain opcodes. */
3265
3266 else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3267#ifdef COMPILE_PCRE8
3268 /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3269 || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3270#endif
3271 )
3272 {
3273#ifdef COMPILE_PCRE8
3274 if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3275#else
3276 if (base_list[0] == OP_CLASS)
3277#endif
3278 {
3279 set1 = (pcre_uint8 *)(base_end - base_list[2]);
3280 list_ptr = list;
3281 }
3282 else
3283 {
3284 set1 = (pcre_uint8 *)(code - list[2]);
3285 list_ptr = base_list;
3286 }
3287
3288 invert_bits = FALSE;
3289 switch(list_ptr[0])
3290 {
3291 case OP_CLASS:
3292 case OP_NCLASS:
3293 set2 = (pcre_uint8 *)
3294 ((list_ptr == list ? code : base_end) - list_ptr[2]);
3295 break;
3296
3297#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3298 case OP_XCLASS:
3299 xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
3300 if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
3301 if ((*xclass_flags & XCL_MAP) == 0)
3302 {
3303 /* No bits are set for characters < 256. */
3304 if (list[1] == 0) return TRUE;
3305 /* Might be an empty repeat. */
3306 continue;
3307 }
3308 set2 = (pcre_uint8 *)(xclass_flags + 1);
3309 break;
3310#endif
3311
3312 case OP_NOT_DIGIT:
3313 invert_bits = TRUE;
3314 /* Fall through */
3315 case OP_DIGIT:
3316 set2 = (pcre_uint8 *)(cd->cbits + cbit_digit);
3317 break;
3318
3319 case OP_NOT_WHITESPACE:
3320 invert_bits = TRUE;
3321 /* Fall through */
3322 case OP_WHITESPACE:
3323 set2 = (pcre_uint8 *)(cd->cbits + cbit_space);
3324 break;
3325
3326 case OP_NOT_WORDCHAR:
3327 invert_bits = TRUE;
3328 /* Fall through */
3329 case OP_WORDCHAR:
3330 set2 = (pcre_uint8 *)(cd->cbits + cbit_word);
3331 break;
3332
3333 default:
3334 return FALSE;
3335 }
3336
3337 /* Because the sets are unaligned, we need
3338 to perform byte comparison here. */
3339 set_end = set1 + 32;
3340 if (invert_bits)
3341 {
3342 do
3343 {
3344 if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3345 }
3346 while (set1 < set_end);
3347 }
3348 else
3349 {
3350 do
3351 {
3352 if ((*set1++ & *set2++) != 0) return FALSE;
3353 }
3354 while (set1 < set_end);
3355 }
3356
3357 if (list[1] == 0) return TRUE;
3358 /* Might be an empty repeat. */
3359 continue;
3360 }
3361
3362 /* Some property combinations also acceptable. Unicode property opcodes are
3363 processed specially; the rest can be handled with a lookup table. */
3364
3365 else
3366 {
3367 pcre_uint32 leftop, rightop;
3368
3369 leftop = base_list[0];
3370 rightop = list[0];
3371
3372#ifdef SUPPORT_UCP
3373 accepted = FALSE; /* Always set in non-unicode case. */
3374 if (leftop == OP_PROP || leftop == OP_NOTPROP)
3375 {
3376 if (rightop == OP_EOD)
3377 accepted = TRUE;
3378 else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3379 {
3380 int n;
3381 const pcre_uint8 *p;
3382 BOOL same = leftop == rightop;
3383 BOOL lisprop = leftop == OP_PROP;
3384 BOOL risprop = rightop == OP_PROP;
3385 BOOL bothprop = lisprop && risprop;
3386
3387 /* There's a table that specifies how each combination is to be
3388 processed:
3389 0 Always return FALSE (never auto-possessify)
3390 1 Character groups are distinct (possessify if both are OP_PROP)
3391 2 Check character categories in the same group (general or particular)
3392 3 Return TRUE if the two opcodes are not the same
3393 ... see comments below
3394 */
3395
3396 n = propposstab[base_list[2]][list[2]];
3397 switch(n)
3398 {
3399 case 0: break;
3400 case 1: accepted = bothprop; break;
3401 case 2: accepted = (base_list[3] == list[3]) != same; break;
3402 case 3: accepted = !same; break;
3403
3404 case 4: /* Left general category, right particular category */
3405 accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3406 break;
3407
3408 case 5: /* Right general category, left particular category */
3409 accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3410 break;
3411
3412 /* This code is logically tricky. Think hard before fiddling with it.
3413 The posspropstab table has four entries per row. Each row relates to
3414 one of PCRE's special properties such as ALNUM or SPACE or WORD.
3415 Only WORD actually needs all four entries, but using repeats for the
3416 others means they can all use the same code below.
3417
3418 The first two entries in each row are Unicode general categories, and
3419 apply always, because all the characters they include are part of the
3420 PCRE character set. The third and fourth entries are a general and a
3421 particular category, respectively, that include one or more relevant
3422 characters. One or the other is used, depending on whether the check
3423 is for a general or a particular category. However, in both cases the
3424 category contains more characters than the specials that are defined
3425 for the property being tested against. Therefore, it cannot be used
3426 in a NOTPROP case.
3427
3428 Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3429 Underscore is covered by ucp_P or ucp_Po. */
3430
3431 case 6: /* Left alphanum vs right general category */
3432 case 7: /* Left space vs right general category */
3433 case 8: /* Left word vs right general category */
3434 p = posspropstab[n-6];
3435 accepted = risprop && lisprop ==
3436 (list[3] != p[0] &&
3437 list[3] != p[1] &&
3438 (list[3] != p[2] || !lisprop));
3439 break;
3440
3441 case 9: /* Right alphanum vs left general category */
3442 case 10: /* Right space vs left general category */
3443 case 11: /* Right word vs left general category */
3444 p = posspropstab[n-9];
3445 accepted = lisprop && risprop ==
3446 (base_list[3] != p[0] &&
3447 base_list[3] != p[1] &&
3448 (base_list[3] != p[2] || !risprop));
3449 break;
3450
3451 case 12: /* Left alphanum vs right particular category */
3452 case 13: /* Left space vs right particular category */
3453 case 14: /* Left word vs right particular category */
3454 p = posspropstab[n-12];
3455 accepted = risprop && lisprop ==
3456 (catposstab[p[0]][list[3]] &&
3457 catposstab[p[1]][list[3]] &&
3458 (list[3] != p[3] || !lisprop));
3459 break;
3460
3461 case 15: /* Right alphanum vs left particular category */
3462 case 16: /* Right space vs left particular category */
3463 case 17: /* Right word vs left particular category */
3464 p = posspropstab[n-15];
3465 accepted = lisprop && risprop ==
3466 (catposstab[p[0]][base_list[3]] &&
3467 catposstab[p[1]][base_list[3]] &&
3468 (base_list[3] != p[3] || !risprop));
3469 break;
3470 }
3471 }
3472 }
3473
3474 else
3475#endif /* SUPPORT_UCP */
3476
3477 accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3478 rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3479 autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3480
3481 if (!accepted) return FALSE;
3482
3483 if (list[1] == 0) return TRUE;
3484 /* Might be an empty repeat. */
3485 continue;
3486 }
3487
3488 /* Control reaches here only if one of the items is a small character list.
3489 All characters are checked against the other side. */
3490
3491 do
3492 {
3493 chr = *chr_ptr;
3494
3495 switch(list_ptr[0])
3496 {
3497 case OP_CHAR:
3498 ochr_ptr = list_ptr + 2;
3499 do
3500 {
3501 if (chr == *ochr_ptr) return FALSE;
3502 ochr_ptr++;
3503 }
3504 while(*ochr_ptr != NOTACHAR);
3505 break;
3506
3507 case OP_NOT:
3508 ochr_ptr = list_ptr + 2;
3509 do
3510 {
3511 if (chr == *ochr_ptr)
3512 break;
3513 ochr_ptr++;
3514 }
3515 while(*ochr_ptr != NOTACHAR);
3516 if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */
3517 break;
3518
3519 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3520 set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3521
3522 case OP_DIGIT:
3523 if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3524 break;
3525
3526 case OP_NOT_DIGIT:
3527 if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3528 break;
3529
3530 case OP_WHITESPACE:
3531 if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3532 break;
3533
3534 case OP_NOT_WHITESPACE:
3535 if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3536 break;
3537
3538 case OP_WORDCHAR:
3539 if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3540 break;
3541
3542 case OP_NOT_WORDCHAR:
3543 if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3544 break;
3545
3546 case OP_HSPACE:
3547 switch(chr)
3548 {
3549 HSPACE_CASES: return FALSE;
3550 default: break;
3551 }
3552 break;
3553
3554 case OP_NOT_HSPACE:
3555 switch(chr)
3556 {
3557 HSPACE_CASES: break;
3558 default: return FALSE;
3559 }
3560 break;
3561
3562 case OP_ANYNL:
3563 case OP_VSPACE:
3564 switch(chr)
3565 {
3566 VSPACE_CASES: return FALSE;
3567 default: break;
3568 }
3569 break;
3570
3571 case OP_NOT_VSPACE:
3572 switch(chr)
3573 {
3574 VSPACE_CASES: break;
3575 default: return FALSE;
3576 }
3577 break;
3578
3579 case OP_DOLL:
3580 case OP_EODN:
3581 switch (chr)
3582 {
3583 case CHAR_CR:
3584 case CHAR_LF:
3585 case CHAR_VT:
3586 case CHAR_FF:
3587 case CHAR_NEL:
3588#ifndef EBCDIC
3589 case 0x2028:
3590 case 0x2029:
3591#endif /* Not EBCDIC */
3592 return FALSE;
3593 }
3594 break;
3595
3596 case OP_EOD: /* Can always possessify before \z */
3597 break;
3598
3599#ifdef SUPPORT_UCP
3600 case OP_PROP:
3601 case OP_NOTPROP:
3602 if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3603 list_ptr[0] == OP_NOTPROP))
3604 return FALSE;
3605 break;
3606#endif
3607
3608 case OP_NCLASS:
3609 if (chr > 255) return FALSE;
3610 /* Fall through */
3611
3612 case OP_CLASS:
3613 if (chr > 255) break;
3614 class_bitset = (pcre_uint8 *)
3615 ((list_ptr == list ? code : base_end) - list_ptr[2]);
3616 if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
3617 break;
3618
3619#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3620 case OP_XCLASS:
3621 if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3622 list_ptr[2] + LINK_SIZE, utf)) return FALSE;
3623 break;
3624#endif
3625
3626 default:
3627 return FALSE;
3628 }
3629
3630 chr_ptr++;
3631 }
3632 while(*chr_ptr != NOTACHAR);
3633
3634 /* At least one character must be matched from this opcode. */
3635
3636 if (list[1] == 0) return TRUE;
3637 }
3638
3639/* Control never reaches here. There used to be a fail-save return FALSE; here,
3640but some compilers complain about an unreachable statement. */
3641
3642}
3643
3644
3645
3646/*************************************************
3647* Scan compiled regex for auto-possession *
3648*************************************************/
3649
3650/* Replaces single character iterations with their possessive alternatives
3651if appropriate. This function modifies the compiled opcode!
3652
3653Arguments:
3654 code points to start of the byte code
3655 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3656 cd static compile data
3657
3658Returns: nothing
3659*/
3660
3661static void
3662auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3663{
3664register pcre_uchar c;
3665const pcre_uchar *end;
3666pcre_uchar *repeat_opcode;
3667pcre_uint32 list[8];
3668int rec_limit;
3669
3670for (;;)
3671 {
3672 c = *code;
3673
3674 /* When a pattern with bad UTF-8 encoding is compiled with NO_UTF_CHECK,
3675 it may compile without complaining, but may get into a loop here if the code
3676 pointer points to a bad value. This is, of course a documentated possibility,
3677 when NO_UTF_CHECK is set, so it isn't a bug, but we can detect this case and
3678 just give up on this optimization. */
3679
3680 if (c >= OP_TABLE_LENGTH) return;
3681
3682 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3683 {
3684 c -= get_repeat_base(c) - OP_STAR;
3685 end = (c <= OP_MINUPTO) ?
3686 get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3687 list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3688
3689 rec_limit = 1000;
3690 if (end != NULL && compare_opcodes(end, utf, cd, list, end, &rec_limit))
3691 {
3692 switch(c)
3693 {
3694 case OP_STAR:
3695 *code += OP_POSSTAR - OP_STAR;
3696 break;
3697
3698 case OP_MINSTAR:
3699 *code += OP_POSSTAR - OP_MINSTAR;
3700 break;
3701
3702 case OP_PLUS:
3703 *code += OP_POSPLUS - OP_PLUS;
3704 break;
3705
3706 case OP_MINPLUS:
3707 *code += OP_POSPLUS - OP_MINPLUS;
3708 break;
3709
3710 case OP_QUERY:
3711 *code += OP_POSQUERY - OP_QUERY;
3712 break;
3713
3714 case OP_MINQUERY:
3715 *code += OP_POSQUERY - OP_MINQUERY;
3716 break;
3717
3718 case OP_UPTO:
3719 *code += OP_POSUPTO - OP_UPTO;
3720 break;
3721
3722 case OP_MINUPTO:
3723 *code += OP_POSUPTO - OP_MINUPTO;
3724 break;
3725 }
3726 }
3727 c = *code;
3728 }
3729 else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3730 {
3731#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3732 if (c == OP_XCLASS)
3733 repeat_opcode = code + GET(code, 1);
3734 else
3735#endif
3736 repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3737
3738 c = *repeat_opcode;
3739 if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
3740 {
3741 /* end must not be NULL. */
3742 end = get_chr_property_list(code, utf, cd->fcc, list);
3743
3744 list[1] = (c & 1) == 0;
3745
3746 rec_limit = 1000;
3747 if (compare_opcodes(end, utf, cd, list, end, &rec_limit))
3748 {
3749 switch (c)
3750 {
3751 case OP_CRSTAR:
3752 case OP_CRMINSTAR:
3753 *repeat_opcode = OP_CRPOSSTAR;
3754 break;
3755
3756 case OP_CRPLUS:
3757 case OP_CRMINPLUS:
3758 *repeat_opcode = OP_CRPOSPLUS;
3759 break;
3760
3761 case OP_CRQUERY:
3762 case OP_CRMINQUERY:
3763 *repeat_opcode = OP_CRPOSQUERY;
3764 break;
3765
3766 case OP_CRRANGE:
3767 case OP_CRMINRANGE:
3768 *repeat_opcode = OP_CRPOSRANGE;
3769 break;
3770 }
3771 }
3772 }
3773 c = *code;
3774 }
3775
3776 switch(c)
3777 {
3778 case OP_END:
3779 return;
3780
3781 case OP_TYPESTAR:
3782 case OP_TYPEMINSTAR:
3783 case OP_TYPEPLUS:
3784 case OP_TYPEMINPLUS:
3785 case OP_TYPEQUERY:
3786 case OP_TYPEMINQUERY:
3787 case OP_TYPEPOSSTAR:
3788 case OP_TYPEPOSPLUS:
3789 case OP_TYPEPOSQUERY:
3790 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
3791 break;
3792
3793 case OP_TYPEUPTO:
3794 case OP_TYPEMINUPTO:
3795 case OP_TYPEEXACT:
3796 case OP_TYPEPOSUPTO:
3797 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
3798 code += 2;
3799 break;
3800
3801#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3802 case OP_XCLASS:
3803 code += GET(code, 1);
3804 break;
3805#endif
3806
3807 case OP_MARK:
3808 case OP_PRUNE_ARG:
3809 case OP_SKIP_ARG:
3810 case OP_THEN_ARG:
3811 code += code[1];
3812 break;
3813 }
3814
3815 /* Add in the fixed length from the table */
3816
3817 code += PRIV(OP_lengths)[c];
3818
3819 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3820 a multi-byte character. The length in the table is a minimum, so we have to
3821 arrange to skip the extra bytes. */
3822
3823#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3824 if (utf) switch(c)
3825 {
3826 case OP_CHAR:
3827 case OP_CHARI:
3828 case OP_NOT:
3829 case OP_NOTI:
3830 case OP_STAR:
3831 case OP_MINSTAR:
3832 case OP_PLUS:
3833 case OP_MINPLUS:
3834 case OP_QUERY:
3835 case OP_MINQUERY:
3836 case OP_UPTO:
3837 case OP_MINUPTO:
3838 case OP_EXACT:
3839 case OP_POSSTAR:
3840 case OP_POSPLUS:
3841 case OP_POSQUERY:
3842 case OP_POSUPTO:
3843 case OP_STARI:
3844 case OP_MINSTARI:
3845 case OP_PLUSI:
3846 case OP_MINPLUSI:
3847 case OP_QUERYI:
3848 case OP_MINQUERYI:
3849 case OP_UPTOI:
3850 case OP_MINUPTOI:
3851 case OP_EXACTI:
3852 case OP_POSSTARI:
3853 case OP_POSPLUSI:
3854 case OP_POSQUERYI:
3855 case OP_POSUPTOI:
3856 case OP_NOTSTAR:
3857 case OP_NOTMINSTAR:
3858 case OP_NOTPLUS:
3859 case OP_NOTMINPLUS:
3860 case OP_NOTQUERY:
3861 case OP_NOTMINQUERY:
3862 case OP_NOTUPTO:
3863 case OP_NOTMINUPTO:
3864 case OP_NOTEXACT:
3865 case OP_NOTPOSSTAR:
3866 case OP_NOTPOSPLUS:
3867 case OP_NOTPOSQUERY:
3868 case OP_NOTPOSUPTO:
3869 case OP_NOTSTARI:
3870 case OP_NOTMINSTARI:
3871 case OP_NOTPLUSI:
3872 case OP_NOTMINPLUSI:
3873 case OP_NOTQUERYI:
3874 case OP_NOTMINQUERYI:
3875 case OP_NOTUPTOI:
3876 case OP_NOTMINUPTOI:
3877 case OP_NOTEXACTI:
3878 case OP_NOTPOSSTARI:
3879 case OP_NOTPOSPLUSI:
3880 case OP_NOTPOSQUERYI:
3881 case OP_NOTPOSUPTOI:
3882 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3883 break;
3884 }
3885#else
3886 (void)(utf); /* Keep compiler happy by referencing function argument */
3887#endif
3888 }
3889}
3890
3891
3892
3893/*************************************************
3894* Check for POSIX class syntax *
3895*************************************************/
3896
3897/* This function is called when the sequence "[:" or "[." or "[=" is
3898encountered in a character class. It checks whether this is followed by a
3899sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3900reach an unescaped ']' without the special preceding character, return FALSE.
3901
3902Originally, this function only recognized a sequence of letters between the
3903terminators, but it seems that Perl recognizes any sequence of characters,
3904though of course unknown POSIX names are subsequently rejected. Perl gives an
3905"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3906didn't consider this to be a POSIX class. Likewise for [:1234:].
3907
3908The problem in trying to be exactly like Perl is in the handling of escapes. We
3909have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3910class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3911below handles the special cases \\ and \], but does not try to do any other
3912escape processing. This makes it different from Perl for cases such as
3913[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
3914not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
3915when Perl does, I think.
3916
3917A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3918It seems that the appearance of a nested POSIX class supersedes an apparent
3919external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3920a digit.
3921
3922In Perl, unescaped square brackets may also appear as part of class names. For
3923example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3924[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3925seem right at all. PCRE does not allow closing square brackets in POSIX class
3926names.
3927
3928Arguments:
3929 ptr pointer to the initial [
3930 endptr where to return the end pointer
3931
3932Returns: TRUE or FALSE
3933*/
3934
3935static BOOL
3936check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3937{
3938pcre_uchar terminator; /* Don't combine these lines; the Solaris cc */
3939terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
3940for (++ptr; *ptr != CHAR_NULL; ptr++)
3941 {
3942 if (*ptr == CHAR_BACKSLASH &&
3943 (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET ||
3944 ptr[1] == CHAR_BACKSLASH))
3945 ptr++;
3946 else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
3947 *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3948 else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3949 {
3950 *endptr = ptr;
3951 return TRUE;
3952 }
3953 }
3954return FALSE;
3955}
3956
3957
3958
3959
3960/*************************************************
3961* Check POSIX class name *
3962*************************************************/
3963
3964/* This function is called to check the name given in a POSIX-style class entry
3965such as [:alnum:].
3966
3967Arguments:
3968 ptr points to the first letter
3969 len the length of the name
3970
3971Returns: a value representing the name, or -1 if unknown
3972*/
3973
3974static int
3975check_posix_name(const pcre_uchar *ptr, int len)
3976{
3977const char *pn = posix_names;
3978register int yield = 0;
3979while (posix_name_lengths[yield] != 0)
3980 {
3981 if (len == posix_name_lengths[yield] &&
3982 STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3983 pn += posix_name_lengths[yield] + 1;
3984 yield++;
3985 }
3986return -1;
3987}
3988
3989
3990/*************************************************
3991* Adjust OP_RECURSE items in repeated group *
3992*************************************************/
3993
3994/* OP_RECURSE items contain an offset from the start of the regex to the group
3995that is referenced. This means that groups can be replicated for fixed
3996repetition simply by copying (because the recursion is allowed to refer to
3997earlier groups that are outside the current group). However, when a group is
3998optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3999inserted before it, after it has been compiled. This means that any OP_RECURSE
4000items within it that refer to the group itself or any contained groups have to
have their offsets adjusted. That is one of the jobs of this function. Before it
4002is called, the partially compiled regex must be temporarily terminated with
4003OP_END.
4004
4005This function has been extended to cope with forward references for recursions
4006and subroutine calls. It must check the list of such references for the
4007group we are dealing with. If it finds that one of the recursions in the
4008current group is on this list, it does not adjust the value in the reference
4009(which is a group number). After the group has been scanned, all the offsets in
4010the forward reference list for the group are adjusted.
4011
4012Arguments:
4013 group points to the start of the group
4014 adjust the amount by which the group is to be moved
4015 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
4016 cd contains pointers to tables etc.
4017 save_hwm_offset the hwm forward reference offset at the start of the group
4018
4019Returns: nothing
4020*/
4021
4022static void
4023adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
4024 size_t save_hwm_offset)
4025{
4026int offset;
4027pcre_uchar *hc;
4028pcre_uchar *ptr = group;
4029
4030while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
4031 {
4032 for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4033 hc += LINK_SIZE)
4034 {
4035 offset = (int)GET(hc, 0);
4036 if (cd->start_code + offset == ptr + 1) break;
4037 }
4038
4039 /* If we have not found this recursion on the forward reference list, adjust
4040 the recursion's offset if it's after the start of this group. */
4041
4042 if (hc >= cd->hwm)
4043 {
4044 offset = (int)GET(ptr, 1);
4045 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
4046 }
4047
4048 ptr += 1 + LINK_SIZE;
4049 }
4050
4051/* Now adjust all forward reference offsets for the group. */
4052
4053for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4054 hc += LINK_SIZE)
4055 {
4056 offset = (int)GET(hc, 0);
4057 PUT(hc, 0, offset + adjust);
4058 }
4059}
4060
4061
4062
4063/*************************************************
4064* Insert an automatic callout point *
4065*************************************************/
4066
4067/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
4068callout points before each pattern item.
4069
4070Arguments:
4071 code current code pointer
4072 ptr current pattern pointer
4073 cd pointers to tables etc
4074
4075Returns: new code pointer
4076*/
4077
4078static pcre_uchar *
4079auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
4080{
4081*code++ = OP_CALLOUT;
4082*code++ = 255;
4083PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
4084PUT(code, LINK_SIZE, 0); /* Default length */
4085return code + 2 * LINK_SIZE;
4086}
4087
4088
4089
4090/*************************************************
4091* Complete a callout item *
4092*************************************************/
4093
4094/* A callout item contains the length of the next item in the pattern, which
4095we can't fill in till after we have reached the relevant point. This is used
4096for both automatic and manual callouts.
4097
4098Arguments:
4099 previous_callout points to previous callout item
4100 ptr current pattern pointer
4101 cd pointers to tables etc
4102
4103Returns: nothing
4104*/
4105
4106static void
4107complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
4108{
4109int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
4110PUT(previous_callout, 2 + LINK_SIZE, length);
4111}
4112
4113
4114
4115#ifdef SUPPORT_UCP
4116/*************************************************
4117* Get othercase range *
4118*************************************************/
4119
4120/* This function is passed the start and end of a class range, in UTF-8 mode
4121with UCP support. It searches up the characters, looking for ranges of
4122characters in the "other" case. Each call returns the next one, updating the
4123start address. A character with multiple other cases is returned on its own
4124with a special return value.
4125
4126Arguments:
4127 cptr points to starting character value; updated
4128 d end value
4129 ocptr where to put start of othercase range
4130 odptr where to put end of othercase range
4131
4132Yield: -1 when no more
4133 0 when a range is returned
4134 >0 the CASESET offset for char with multiple other cases
4135 in this case, ocptr contains the original
4136*/
4137
4138static int
4139get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
4140 pcre_uint32 *odptr)
4141{
4142pcre_uint32 c, othercase, next;
4143unsigned int co;
4144
4145/* Find the first character that has an other case. If it has multiple other
4146cases, return its case offset value. */
4147
4148for (c = *cptr; c <= d; c++)
4149 {
4150 if ((co = UCD_CASESET(c)) != 0)
4151 {
4152 *ocptr = c++; /* Character that has the set */
4153 *cptr = c; /* Rest of input range */
4154 return (int)co;
4155 }
4156 if ((othercase = UCD_OTHERCASE(c)) != c) break;
4157 }
4158
4159if (c > d) return -1; /* Reached end of range */
4160
4161/* Found a character that has a single other case. Search for the end of the
4162range, which is either the end of the input range, or a character that has zero
4163or more than one other cases. */
4164
4165*ocptr = othercase;
4166next = othercase + 1;
4167
4168for (++c; c <= d; c++)
4169 {
4170 if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
4171 next++;
4172 }
4173
4174*odptr = next - 1; /* End of othercase range */
4175*cptr = c; /* Rest of input range */
4176return 0;
4177}
4178#endif /* SUPPORT_UCP */
4179
4180
4181
4182/*************************************************
4183* Add a character or range to a class *
4184*************************************************/
4185
4186/* This function packages up the logic of adding a character or range of
4187characters to a class. The character values in the arguments will be within the
4188valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4189mutually recursive with the function immediately below.
4190
4191Arguments:
4192 classbits the bit map for characters < 256
4193 uchardptr points to the pointer for extra data
4194 options the options word
4195 cd contains pointers to tables etc.
4196 start start of range character
4197 end end of range character
4198
4199Returns: the number of < 256 characters added
4200 the pointer to extra data is updated
4201*/
4202
4203static int
4204add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4205 compile_data *cd, pcre_uint32 start, pcre_uint32 end)
4206{
4207pcre_uint32 c;
4208pcre_uint32 classbits_end = (end <= 0xff ? end : 0xff);
4209int n8 = 0;
4210
4211/* If caseless matching is required, scan the range and process alternate
4212cases. In Unicode, there are 8-bit characters that have alternate cases that
4213are greater than 255 and vice-versa. Sometimes we can just extend the original
4214range. */
4215
4216if ((options & PCRE_CASELESS) != 0)
4217 {
4218#ifdef SUPPORT_UCP
4219 if ((options & PCRE_UTF8) != 0)
4220 {
4221 int rc;
4222 pcre_uint32 oc, od;
4223
4224 options &= ~PCRE_CASELESS; /* Remove for recursive calls */
4225 c = start;
4226
4227 while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
4228 {
4229 /* Handle a single character that has more than one other case. */
4230
4231 if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
4232 PRIV(ucd_caseless_sets) + rc, oc);
4233
4234 /* Do nothing if the other case range is within the original range. */
4235
4236 else if (oc >= start && od <= end) continue;
4237
4238 /* Extend the original range if there is overlap, noting that if oc < c, we
4239 can't have od > end because a subrange is always shorter than the basic
4240 range. Otherwise, use a recursive call to add the additional range. */
4241
4242 else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4243 else if (od > end && oc <= end + 1)
4244 {
4245 end = od; /* Extend upwards */
4246 if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
4247 }
4248 else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
4249 }
4250 }
4251 else
4252#endif /* SUPPORT_UCP */
4253
4254 /* Not UTF-mode, or no UCP */
4255
4256 for (c = start; c <= classbits_end; c++)
4257 {
4258 SETBIT(classbits, cd->fcc[c]);
4259 n8++;
4260 }
4261 }
4262
4263/* Now handle the original range. Adjust the final value according to the bit
4264length - this means that the same lists of (e.g.) horizontal spaces can be used
4265in all cases. */
4266
4267#if defined COMPILE_PCRE8
4268#ifdef SUPPORT_UTF
4269 if ((options & PCRE_UTF8) == 0)
4270#endif
4271 if (end > 0xff) end = 0xff;
4272
4273#elif defined COMPILE_PCRE16
4274#ifdef SUPPORT_UTF
4275 if ((options & PCRE_UTF16) == 0)
4276#endif
4277 if (end > 0xffff) end = 0xffff;
4278
4279#endif /* COMPILE_PCRE[8|16] */
4280
4281/* Use the bitmap for characters < 256. Otherwise use extra data.*/
4282
4283for (c = start; c <= classbits_end; c++)
4284 {
4285 /* Regardless of start, c will always be <= 255. */
4286 SETBIT(classbits, c);
4287 n8++;
4288 }
4289
4290#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4291if (start <= 0xff) start = 0xff + 1;
4292
4293if (end >= start)
4294 {
4295 pcre_uchar *uchardata = *uchardptr;
4296#ifdef SUPPORT_UTF
4297 if ((options & PCRE_UTF8) != 0) /* All UTFs use the same flag bit */
4298 {
4299 if (start < end)
4300 {
4301 *uchardata++ = XCL_RANGE;
4302 uchardata += PRIV(ord2utf)(start, uchardata);
4303 uchardata += PRIV(ord2utf)(end, uchardata);
4304 }
4305 else if (start == end)
4306 {
4307 *uchardata++ = XCL_SINGLE;
4308 uchardata += PRIV(ord2utf)(start, uchardata);
4309 }
4310 }
4311 else
4312#endif /* SUPPORT_UTF */
4313
4314 /* Without UTF support, character values are constrained by the bit length,
4315 and can only be > 256 for 16-bit and 32-bit libraries. */
4316
4317#ifdef COMPILE_PCRE8
4318 {}
4319#else
4320 if (start < end)
4321 {
4322 *uchardata++ = XCL_RANGE;
4323 *uchardata++ = start;
4324 *uchardata++ = end;
4325 }
4326 else if (start == end)
4327 {
4328 *uchardata++ = XCL_SINGLE;
4329 *uchardata++ = start;
4330 }
4331#endif
4332
4333 *uchardptr = uchardata; /* Updata extra data pointer */
4334 }
4335#endif /* SUPPORT_UTF || !COMPILE_PCRE8 */
4336
4337return n8; /* Number of 8-bit characters */
4338}
4339
4340
4341
4342
4343/*************************************************
4344* Add a list of characters to a class *
4345*************************************************/
4346
4347/* This function is used for adding a list of case-equivalent characters to a
4348class, and also for adding a list of horizontal or vertical whitespace. If the
4349list is in order (which it should be), ranges of characters are detected and
4350handled appropriately. This function is mutually recursive with the function
4351above.
4352
4353Arguments:
4354 classbits the bit map for characters < 256
4355 uchardptr points to the pointer for extra data
4356 options the options word
4357 cd contains pointers to tables etc.
4358 p points to row of 32-bit values, terminated by NOTACHAR
4359 except character to omit; this is used when adding lists of
4360 case-equivalent characters to avoid including the one we
4361 already know about
4362
4363Returns: the number of < 256 characters added
4364 the pointer to extra data is updated
4365*/
4366
4367static int
4368add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4369 compile_data *cd, const pcre_uint32 *p, unsigned int except)
4370{
4371int n8 = 0;
4372while (p[0] < NOTACHAR)
4373 {
4374 int n = 0;
4375 if (p[0] != except)
4376 {
4377 while(p[n+1] == p[0] + n + 1) n++;
4378 n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
4379 }
4380 p += n + 1;
4381 }
4382return n8;
4383}
4384
4385
4386
4387/*************************************************
4388* Add characters not in a list to a class *
4389*************************************************/
4390
4391/* This function is used for adding the complement of a list of horizontal or
4392vertical whitespace to a class. The list must be in order.
4393
4394Arguments:
4395 classbits the bit map for characters < 256
4396 uchardptr points to the pointer for extra data
4397 options the options word
4398 cd contains pointers to tables etc.
4399 p points to row of 32-bit values, terminated by NOTACHAR
4400
4401Returns: the number of < 256 characters added
4402 the pointer to extra data is updated
4403*/
4404
4405static int
4406add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
4407 int options, compile_data *cd, const pcre_uint32 *p)
4408{
4409BOOL utf = (options & PCRE_UTF8) != 0;
4410int n8 = 0;
4411if (p[0] > 0)
4412 n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
4413while (p[0] < NOTACHAR)
4414 {
4415 while (p[1] == p[0] + 1) p++;
4416 n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
4417 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
4418 p++;
4419 }
4420return n8;
4421}
4422
4423
4424
4425/*************************************************
4426* Compile one branch *
4427*************************************************/
4428
/* Scan the pattern, compiling it into a vector. If the options are
4430changed during the branch, the pointer is used to change the external options
4431bits. This function is used during the pre-compile phase when we are trying
4432to find out the amount of memory needed, as well as during the real compile
4433phase. The value of lengthptr distinguishes the two phases.
4434
4435Arguments:
4436 optionsptr pointer to the option bits
4437 codeptr points to the pointer to the current code point
4438 ptrptr points to the current pattern pointer
4439 errorcodeptr points to error code variable
4440 firstcharptr place to put the first required character
4441 firstcharflagsptr place to put the first character flags, or a negative number
4442 reqcharptr place to put the last required character
4443 reqcharflagsptr place to put the last required character flags, or a negative number
4444 bcptr points to current branch chain
4445 cond_depth conditional nesting depth
4446 cd contains pointers to tables etc.
4447 lengthptr NULL during the real compile phase
4448 points to length accumulator during pre-compile phase
4449
4450Returns: TRUE on success
4451 FALSE, with *errorcodeptr set non-zero on error
4452*/
4453
4454static BOOL
4455compile_branch(int *optionsptr, pcre_uchar **codeptr,
4456 const pcre_uchar **ptrptr, int *errorcodeptr,
4457 pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
4458 pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
4459 branch_chain *bcptr, int cond_depth,
4460 compile_data *cd, int *lengthptr)
4461{
4462int repeat_type, op_type;
4463int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
4464int bravalue = 0;
4465int greedy_default, greedy_non_default;
4466pcre_uint32 firstchar, reqchar;
4467pcre_int32 firstcharflags, reqcharflags;
4468pcre_uint32 zeroreqchar, zerofirstchar;
4469pcre_int32 zeroreqcharflags, zerofirstcharflags;
4470pcre_int32 req_caseopt, reqvary, tempreqvary;
4471int options = *optionsptr; /* May change dynamically */
4472int after_manual_callout = 0;
4473int length_prevgroup = 0;
4474register pcre_uint32 c;
4475int escape;
4476register pcre_uchar *code = *codeptr;
4477pcre_uchar *last_code = code;
4478pcre_uchar *orig_code = code;
4479pcre_uchar *tempcode;
4480BOOL inescq = FALSE;
4481BOOL groupsetfirstchar = FALSE;
4482const pcre_uchar *ptr = *ptrptr;
4483const pcre_uchar *tempptr;
4484const pcre_uchar *nestptr = NULL;
4485pcre_uchar *previous = NULL;
4486pcre_uchar *previous_callout = NULL;
4487size_t item_hwm_offset = 0;
4488pcre_uint8 classbits[32];
4489
4490/* We can fish out the UTF-8 setting once and for all into a BOOL, but we
4491must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4492dynamically as we process the pattern. */
4493
4494#ifdef SUPPORT_UTF
4495/* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4496BOOL utf = (options & PCRE_UTF8) != 0;
4497#ifndef COMPILE_PCRE32
4498pcre_uchar utf_chars[6];
4499#endif
4500#else
4501BOOL utf = FALSE;
4502#endif
4503
4504/* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4505class_uchardata always so that it can be passed to add_to_class() always,
4506though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4507alternative calls for the different cases. */
4508
4509pcre_uchar *class_uchardata;
4510#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4511BOOL xclass;
4512pcre_uchar *class_uchardata_base;
4513#endif
4514
4515#ifdef PCRE_DEBUG
4516if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4517#endif
4518
4519/* Set up the default and non-default settings for greediness */
4520
4521greedy_default = ((options & PCRE_UNGREEDY) != 0);
4522greedy_non_default = greedy_default ^ 1;
4523
4524/* Initialize no first byte, no required byte. REQ_UNSET means "no char
4525matching encountered yet". It gets changed to REQ_NONE if we hit something that
4526matches a non-fixed char first char; reqchar just remains unset if we never
4527find one.
4528
4529When we hit a repeat whose minimum is zero, we may have to adjust these values
4530to take the zero repeat into account. This is implemented by setting them to
zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
4532item types that can be repeated set these backoff variables appropriately. */
4533
4534firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4535firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4536
4537/* The variable req_caseopt contains either the REQ_CASELESS value
4538or zero, according to the current setting of the caseless flag. The
4539REQ_CASELESS leaves the lower 28 bit empty. It is added into the
4540firstchar or reqchar variables to record the case status of the
4541value. This is used only for ASCII characters. */
4542
4543req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
4544
4545/* Switch on next character until the end of the branch */
4546
4547for (;; ptr++)
4548 {
4549 BOOL negate_class;
4550 BOOL should_flip_negation;
4551 BOOL possessive_quantifier;
4552 BOOL is_quantifier;
4553 BOOL is_recurse;
4554 BOOL reset_bracount;
4555 int class_has_8bitchar;
4556 int class_one_char;
4557#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4558 BOOL xclass_has_prop;
4559#endif
4560 int newoptions;
4561 int recno;
4562 int refsign;
4563 int skipbytes;
4564 pcre_uint32 subreqchar, subfirstchar;
4565 pcre_int32 subreqcharflags, subfirstcharflags;
4566 int terminator;
4567 unsigned int mclength;
4568 unsigned int tempbracount;
4569 pcre_uint32 ec;
4570 pcre_uchar mcbuffer[8];
4571
4572 /* Come here to restart the loop without advancing the pointer. */
4573
4574 REDO_LOOP:
4575
4576 /* Get next character in the pattern */
4577
4578 c = *ptr;
4579
4580 /* If we are at the end of a nested substitution, revert to the outer level
4581 string. Nesting only happens one level deep. */
4582
4583 if (c == CHAR_NULL && nestptr != NULL)
4584 {
4585 ptr = nestptr;
4586 nestptr = NULL;
4587 c = *ptr;
4588 }
4589
4590 /* If we are in the pre-compile phase, accumulate the length used for the
4591 previous cycle of this loop. */
4592
4593 if (lengthptr != NULL)
4594 {
4595#ifdef PCRE_DEBUG
4596 if (code > cd->hwm) cd->hwm = code; /* High water info */
4597#endif
4598 if (code > cd->start_workspace + cd->workspace_size -
4599 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
4600 {
4601 *errorcodeptr = (code >= cd->start_workspace + cd->workspace_size)?
4602 ERR52 : ERR87;
4603 goto FAILED;
4604 }
4605
4606 /* There is at least one situation where code goes backwards: this is the
4607 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4608 the class is simply eliminated. However, it is created first, so we have to
4609 allow memory for it. Therefore, don't ever reduce the length at this point.
4610 */
4611
4612 if (code < last_code) code = last_code;
4613
4614 /* Paranoid check for integer overflow */
4615
4616 if (OFLOW_MAX - *lengthptr < code - last_code)
4617 {
4618 *errorcodeptr = ERR20;
4619 goto FAILED;
4620 }
4621
4622 *lengthptr += (int)(code - last_code);
4623 DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4624 (int)(code - last_code), c, c));
4625
4626 /* If "previous" is set and it is not at the start of the work space, move
4627 it back to there, in order to avoid filling up the work space. Otherwise,
4628 if "previous" is NULL, reset the current code pointer to the start. */
4629
4630 if (previous != NULL)
4631 {
4632 if (previous > orig_code)
4633 {
4634 memmove(orig_code, previous, IN_UCHARS(code - previous));
4635 code -= previous - orig_code;
4636 previous = orig_code;
4637 }
4638 }
4639 else code = orig_code;
4640
4641 /* Remember where this code item starts so we can pick up the length
4642 next time round. */
4643
4644 last_code = code;
4645 }
4646
4647 /* In the real compile phase, just check the workspace used by the forward
4648 reference list. */
4649
4650 else if (cd->hwm > cd->start_workspace + cd->workspace_size)
4651 {
4652 *errorcodeptr = ERR52;
4653 goto FAILED;
4654 }
4655
4656 /* If in \Q...\E, check for the end; if not, we have a literal. Otherwise an
4657 isolated \E is ignored. */
4658
4659 if (c != CHAR_NULL)
4660 {
4661 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4662 {
4663 inescq = FALSE;
4664 ptr++;
4665 continue;
4666 }
4667 else if (inescq)
4668 {
4669 if (previous_callout != NULL)
4670 {
4671 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4672 complete_callout(previous_callout, ptr, cd);
4673 previous_callout = NULL;
4674 }
4675 if ((options & PCRE_AUTO_CALLOUT) != 0)
4676 {
4677 previous_callout = code;
4678 code = auto_callout(code, ptr, cd);
4679 }
4680 goto NORMAL_CHAR;
4681 }
4682
4683 /* Check for the start of a \Q...\E sequence. We must do this here rather
4684 than later in case it is immediately followed by \E, which turns it into a
4685 "do nothing" sequence. */
4686
4687 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4688 {
4689 inescq = TRUE;
4690 ptr++;
4691 continue;
4692 }
4693 }
4694
4695 /* In extended mode, skip white space and comments. */
4696
4697 if ((options & PCRE_EXTENDED) != 0)
4698 {
4699 const pcre_uchar *wscptr = ptr;
4700 while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
4701 if (c == CHAR_NUMBER_SIGN)
4702 {
4703 ptr++;
4704 while (*ptr != CHAR_NULL)
4705 {
4706 if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */
4707 { /* IS_NEWLINE sets cd->nllen. */
4708 ptr += cd->nllen;
4709 break;
4710 }
4711 ptr++;
4712#ifdef SUPPORT_UTF
4713 if (utf) FORWARDCHAR(ptr);
4714#endif
4715 }
4716 }
4717
4718 /* If we skipped any characters, restart the loop. Otherwise, we didn't see
4719 a comment. */
4720
4721 if (ptr > wscptr) goto REDO_LOOP;
4722 }
4723
4724 /* Skip over (?# comments. We need to do this here because we want to know if
4725 the next thing is a quantifier, and these comments may come between an item
4726 and its quantifier. */
4727
4728 if (c == CHAR_LEFT_PARENTHESIS && ptr[1] == CHAR_QUESTION_MARK &&
4729 ptr[2] == CHAR_NUMBER_SIGN)
4730 {
4731 ptr += 3;
4732 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4733 if (*ptr == CHAR_NULL)
4734 {
4735 *errorcodeptr = ERR18;
4736 goto FAILED;
4737 }
4738 continue;
4739 }
4740
4741 /* See if the next thing is a quantifier. */
4742
4743 is_quantifier =
4744 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4745 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4746
4747 /* Fill in length of a previous callout, except when the next thing is a
4748 quantifier or when processing a property substitution string in UCP mode. */
4749
4750 if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4751 after_manual_callout-- <= 0)
4752 {
4753 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4754 complete_callout(previous_callout, ptr, cd);
4755 previous_callout = NULL;
4756 }
4757
4758 /* Create auto callout, except for quantifiers, or while processing property
4759 strings that are substituted for \w etc in UCP mode. */
4760
4761 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4762 {
4763 previous_callout = code;
4764 code = auto_callout(code, ptr, cd);
4765 }
4766
4767 /* Process the next pattern item. */
4768
4769 switch(c)
4770 {
4771 /* ===================================================================*/
4772 case CHAR_NULL: /* The branch terminates at string end */
4773 case CHAR_VERTICAL_LINE: /* or | or ) */
4774 case CHAR_RIGHT_PARENTHESIS:
4775 *firstcharptr = firstchar;
4776 *firstcharflagsptr = firstcharflags;
4777 *reqcharptr = reqchar;
4778 *reqcharflagsptr = reqcharflags;
4779 *codeptr = code;
4780 *ptrptr = ptr;
4781 if (lengthptr != NULL)
4782 {
4783 if (OFLOW_MAX - *lengthptr < code - last_code)
4784 {
4785 *errorcodeptr = ERR20;
4786 goto FAILED;
4787 }
4788 *lengthptr += (int)(code - last_code); /* To include callout length */
4789 DPRINTF((">> end branch\n"));
4790 }
4791 return TRUE;
4792
4793
4794 /* ===================================================================*/
4795 /* Handle single-character metacharacters. In multiline mode, ^ disables
4796 the setting of any following char as a first character. */
4797
4798 case CHAR_CIRCUMFLEX_ACCENT:
4799 previous = NULL;
4800 if ((options & PCRE_MULTILINE) != 0)
4801 {
4802 if (firstcharflags == REQ_UNSET)
4803 zerofirstcharflags = firstcharflags = REQ_NONE;
4804 *code++ = OP_CIRCM;
4805 }
4806 else *code++ = OP_CIRC;
4807 break;
4808
4809 case CHAR_DOLLAR_SIGN:
4810 previous = NULL;
4811 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4812 break;
4813
4814 /* There can never be a first char if '.' is first, whatever happens about
4815 repeats. The value of reqchar doesn't change either. */
4816
4817 case CHAR_DOT:
4818 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4819 zerofirstchar = firstchar;
4820 zerofirstcharflags = firstcharflags;
4821 zeroreqchar = reqchar;
4822 zeroreqcharflags = reqcharflags;
4823 previous = code;
4824 item_hwm_offset = cd->hwm - cd->start_workspace;
4825 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4826 break;
4827
4828
4829 /* ===================================================================*/
4830 /* Character classes. If the included characters are all < 256, we build a
4831 32-byte bitmap of the permitted characters, except in the special case
4832 where there is only one such character. For negated classes, we build the
4833 map as usual, then invert it at the end. However, we use a different opcode
4834 so that data characters > 255 can be handled correctly.
4835
4836 If the class contains characters outside the 0-255 range, a different
4837 opcode is compiled. It may optionally have a bit map for characters < 256,
 but those above are explicitly listed afterwards. A flag byte tells
4839 whether the bitmap is present, and whether this is a negated class or not.
4840
4841 In JavaScript compatibility mode, an isolated ']' causes an error. In
4842 default (Perl) mode, it is treated as a data character. */
4843
4844 case CHAR_RIGHT_SQUARE_BRACKET:
4845 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4846 {
4847 *errorcodeptr = ERR64;
4848 goto FAILED;
4849 }
4850 goto NORMAL_CHAR;
4851
4852 /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
4853 used for "start of word" and "end of word". As these are otherwise illegal
4854 sequences, we don't break anything by recognizing them. They are replaced
4855 by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
4856 erroneous and are handled by the normal code below. */
4857
4858 case CHAR_LEFT_SQUARE_BRACKET:
4859 if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
4860 {
4861 nestptr = ptr + 7;
4862 ptr = sub_start_of_word;
4863 goto REDO_LOOP;
4864 }
4865
4866 if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
4867 {
4868 nestptr = ptr + 7;
4869 ptr = sub_end_of_word;
4870 goto REDO_LOOP;
4871 }
4872
4873 /* Handle a real character class. */
4874
4875 previous = code;
4876 item_hwm_offset = cd->hwm - cd->start_workspace;
4877
4878 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4879 they are encountered at the top level, so we'll do that too. */
4880
4881 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4882 ptr[1] == CHAR_EQUALS_SIGN) &&
4883 check_posix_syntax(ptr, &tempptr))
4884 {
4885 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4886 goto FAILED;
4887 }
4888
4889 /* If the first character is '^', set the negation flag and skip it. Also,
4890 if the first few characters (either before or after ^) are \Q\E or \E we
4891 skip them too. This makes for compatibility with Perl. */
4892
4893 negate_class = FALSE;
4894 for (;;)
4895 {
4896 c = *(++ptr);
4897 if (c == CHAR_BACKSLASH)
4898 {
4899 if (ptr[1] == CHAR_E)
4900 ptr++;
4901 else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4902 ptr += 3;
4903 else
4904 break;
4905 }
4906 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4907 negate_class = TRUE;
4908 else break;
4909 }
4910
4911 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4912 an initial ']' is taken as a data character -- the code below handles
4913 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4914 [^] must match any character, so generate OP_ALLANY. */
4915
4916 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4917 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4918 {
4919 *code++ = negate_class? OP_ALLANY : OP_FAIL;
4920 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4921 zerofirstchar = firstchar;
4922 zerofirstcharflags = firstcharflags;
4923 break;
4924 }
4925
4926 /* If a class contains a negative special such as \S, we need to flip the
4927 negation flag at the end, so that support for characters > 255 works
4928 correctly (they are all included in the class). */
4929
4930 should_flip_negation = FALSE;
4931
4932 /* Extended class (xclass) will be used when characters > 255
4933 might match. */
4934
4935#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4936 xclass = FALSE;
4937 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
4938 class_uchardata_base = class_uchardata; /* Save the start */
4939#endif
4940
4941 /* For optimization purposes, we track some properties of the class:
4942 class_has_8bitchar will be non-zero if the class contains at least one <
4943 256 character; class_one_char will be 1 if the class contains just one
4944 character; xclass_has_prop will be TRUE if unicode property checks
4945 are present in the class. */
4946
4947 class_has_8bitchar = 0;
4948 class_one_char = 0;
4949#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4950 xclass_has_prop = FALSE;
4951#endif
4952
4953 /* Initialize the 32-char bit map to all zeros. We build the map in a
4954 temporary bit of memory, in case the class contains fewer than two
4955 8-bit characters because in that case the compiled code doesn't use the bit
4956 map. */
4957
4958 memset(classbits, 0, 32 * sizeof(pcre_uint8));
4959
4960 /* Process characters until ] is reached. By writing this as a "do" it
4961 means that an initial ] is taken as a data character. At the start of the
4962 loop, c contains the first byte of the character. */
4963
4964 if (c != CHAR_NULL) do
4965 {
4966 const pcre_uchar *oldptr;
4967
4968#ifdef SUPPORT_UTF
4969 if (utf && HAS_EXTRALEN(c))
4970 { /* Braces are required because the */
4971 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
4972 }
4973#endif
4974
4975#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4976 /* In the pre-compile phase, accumulate the length of any extra
4977 data and reset the pointer. This is so that very large classes that
4978 contain a zillion > 255 characters no longer overwrite the work space
4979 (which is on the stack). We have to remember that there was XCLASS data,
4980 however. */
4981
4982 if (class_uchardata > class_uchardata_base) xclass = TRUE;
4983
4984 if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4985 {
4986 *lengthptr += (int)(class_uchardata - class_uchardata_base);
4987 class_uchardata = class_uchardata_base;
4988 }
4989#endif
4990
4991 /* Inside \Q...\E everything is literal except \E */
4992
4993 if (inescq)
4994 {
4995 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
4996 {
4997 inescq = FALSE; /* Reset literal state */
4998 ptr++; /* Skip the 'E' */
4999 continue; /* Carry on with next */
5000 }
5001 goto CHECK_RANGE; /* Could be range if \E follows */
5002 }
5003
5004 /* Handle POSIX class names. Perl allows a negation extension of the
5005 form [:^name:]. A square bracket that doesn't match the syntax is
5006 treated as a literal. We also recognize the POSIX constructions
5007 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
5008 5.6 and 5.8 do. */
5009
5010 if (c == CHAR_LEFT_SQUARE_BRACKET &&
5011 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5012 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
5013 {
5014 BOOL local_negate = FALSE;
5015 int posix_class, taboffset, tabopt;
5016 register const pcre_uint8 *cbits = cd->cbits;
5017 pcre_uint8 pbits[32];
5018
5019 if (ptr[1] != CHAR_COLON)
5020 {
5021 *errorcodeptr = ERR31;
5022 goto FAILED;
5023 }
5024
5025 ptr += 2;
5026 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
5027 {
5028 local_negate = TRUE;
5029 should_flip_negation = TRUE; /* Note negative special */
5030 ptr++;
5031 }
5032
5033 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
5034 if (posix_class < 0)
5035 {
5036 *errorcodeptr = ERR30;
5037 goto FAILED;
5038 }
5039
5040 /* If matching is caseless, upper and lower are converted to
5041 alpha. This relies on the fact that the class table starts with
5042 alpha, lower, upper as the first 3 entries. */
5043
5044 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
5045 posix_class = 0;
5046
5047 /* When PCRE_UCP is set, some of the POSIX classes are converted to
5048 different escape sequences that use Unicode properties \p or \P. Others
5049 that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
5050 directly. */
5051
5052#ifdef SUPPORT_UCP
5053 if ((options & PCRE_UCP) != 0)
5054 {
5055 unsigned int ptype = 0;
5056 int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
5057
5058 /* The posix_substitutes table specifies which POSIX classes can be
5059 converted to \p or \P items. */
5060
5061 if (posix_substitutes[pc] != NULL)
5062 {
5063 nestptr = tempptr + 1;
5064 ptr = posix_substitutes[pc] - 1;
5065 continue;
5066 }
5067
5068 /* There are three other classes that generate special property calls
5069 that are recognized only in an XCLASS. */
5070
5071 else switch(posix_class)
5072 {
5073 case PC_GRAPH:
5074 ptype = PT_PXGRAPH;
5075 /* Fall through */
5076 case PC_PRINT:
5077 if (ptype == 0) ptype = PT_PXPRINT;
5078 /* Fall through */
5079 case PC_PUNCT:
5080 if (ptype == 0) ptype = PT_PXPUNCT;
5081 *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5082 *class_uchardata++ = ptype;
5083 *class_uchardata++ = 0;
5084 xclass_has_prop = TRUE;
5085 ptr = tempptr + 1;
5086 continue;
5087
5088 /* For the other POSIX classes (ascii, cntrl, xdigit) we are going
5089 to fall through to the non-UCP case and build a bit map for
5090 characters with code points less than 256. If we are in a negated
5091 POSIX class, characters with code points greater than 255 must
5092 either all match or all not match. In the special case where we
5093 have not yet generated any xclass data, and this is the final item
5094 in the overall class, we need do nothing: later on, the opcode
5095 OP_NCLASS will be used to indicate that characters greater than 255
5096 are acceptable. If we have already seen an xclass item or one may
5097 follow (we have to assume that it might if this is not the end of
5098 the class), explicitly list all wide codepoints, which will then
5099 either not match or match, depending on whether the class is or is
5100 not negated. */
5101
5102 default:
5103 if (local_negate &&
5104 (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET))
5105 {
5106 *class_uchardata++ = XCL_RANGE;
5107 class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5108 class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
5109 }
5110 break;
5111 }
5112 }
5113#endif
5114 /* In the non-UCP case, or when UCP makes no difference, we build the
5115 bit map for the POSIX class in a chunk of local store because we may be
5116 adding and subtracting from it, and we don't want to subtract bits that
5117 may be in the main map already. At the end we or the result into the
5118 bit map that is being built. */
5119
5120 posix_class *= 3;
5121
5122 /* Copy in the first table (always present) */
5123
5124 memcpy(pbits, cbits + posix_class_maps[posix_class],
5125 32 * sizeof(pcre_uint8));
5126
5127 /* If there is a second table, add or remove it as required. */
5128
5129 taboffset = posix_class_maps[posix_class + 1];
5130 tabopt = posix_class_maps[posix_class + 2];
5131
5132 if (taboffset >= 0)
5133 {
5134 if (tabopt >= 0)
5135 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
5136 else
5137 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
5138 }
5139
5140 /* Now see if we need to remove any special characters. An option
5141 value of 1 removes vertical space and 2 removes underscore. */
5142
5143 if (tabopt < 0) tabopt = -tabopt;
5144 if (tabopt == 1) pbits[1] &= ~0x3c;
5145 else if (tabopt == 2) pbits[11] &= 0x7f;
5146
5147 /* Add the POSIX table or its complement into the main table that is
5148 being built and we are done. */
5149
5150 if (local_negate)
5151 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
5152 else
5153 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
5154
5155 ptr = tempptr + 1;
5156 /* Every class contains at least one < 256 character. */
5157 class_has_8bitchar = 1;
5158 /* Every class contains at least two characters. */
5159 class_one_char = 2;
5160 continue; /* End of POSIX syntax handling */
5161 }
5162
5163 /* Backslash may introduce a single character, or it may introduce one
5164 of the specials, which just set a flag. The sequence \b is a special
5165 case. Inside a class (and only there) it is treated as backspace. We
5166 assume that other escapes have more than one character in them, so
5167 speculatively set both class_has_8bitchar and class_one_char bigger
5168 than one. Unrecognized escapes fall through and are either treated
5169 as literal characters (by default), or are faulted if
5170 PCRE_EXTRA is set. */
5171
5172 if (c == CHAR_BACKSLASH)
5173 {
5174 escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
5175 TRUE);
5176 if (*errorcodeptr != 0) goto FAILED;
5177 if (escape == 0) c = ec;
5178 else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
5179 else if (escape == ESC_N) /* \N is not supported in a class */
5180 {
5181 *errorcodeptr = ERR71;
5182 goto FAILED;
5183 }
5184 else if (escape == ESC_Q) /* Handle start of quoted string */
5185 {
5186 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5187 {
5188 ptr += 2; /* avoid empty string */
5189 }
5190 else inescq = TRUE;
5191 continue;
5192 }
5193 else if (escape == ESC_E) continue; /* Ignore orphan \E */
5194
5195 else
5196 {
5197 register const pcre_uint8 *cbits = cd->cbits;
5198 /* Every class contains at least two < 256 characters. */
5199 class_has_8bitchar++;
5200 /* Every class contains at least two characters. */
5201 class_one_char += 2;
5202
5203 switch (escape)
5204 {
5205#ifdef SUPPORT_UCP
5206 case ESC_du: /* These are the values given for \d etc */
5207 case ESC_DU: /* when PCRE_UCP is set. We replace the */
5208 case ESC_wu: /* escape sequence with an appropriate \p */
5209 case ESC_WU: /* or \P to test Unicode properties instead */
5210 case ESC_su: /* of the default ASCII testing. */
5211 case ESC_SU:
5212 nestptr = ptr;
5213 ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
5214 class_has_8bitchar--; /* Undo! */
5215 continue;
5216#endif
5217 case ESC_d:
5218 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
5219 continue;
5220
5221 case ESC_D:
5222 should_flip_negation = TRUE;
5223 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
5224 continue;
5225
5226 case ESC_w:
5227 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
5228 continue;
5229
5230 case ESC_W:
5231 should_flip_negation = TRUE;
5232 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
5233 continue;
5234
5235 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5236 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5237 previously set by something earlier in the character class.
5238 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5239 we could just adjust the appropriate bit. From PCRE 8.34 we no
5240 longer treat \s and \S specially. */
5241
5242 case ESC_s:
5243 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
5244 continue;
5245
5246 case ESC_S:
5247 should_flip_negation = TRUE;
5248 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
5249 continue;
5250
5251 /* The rest apply in both UCP and non-UCP cases. */
5252
5253 case ESC_h:
5254 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5255 PRIV(hspace_list), NOTACHAR);
5256 continue;
5257
5258 case ESC_H:
5259 (void)add_not_list_to_class(classbits, &class_uchardata, options,
5260 cd, PRIV(hspace_list));
5261 continue;
5262
5263 case ESC_v:
5264 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5265 PRIV(vspace_list), NOTACHAR);
5266 continue;
5267
5268 case ESC_V:
5269 (void)add_not_list_to_class(classbits, &class_uchardata, options,
5270 cd, PRIV(vspace_list));
5271 continue;
5272
5273 case ESC_p:
5274 case ESC_P:
5275#ifdef SUPPORT_UCP
5276 {
5277 BOOL negated;
5278 unsigned int ptype = 0, pdata = 0;
5279 if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
5280 goto FAILED;
5281 *class_uchardata++ = ((escape == ESC_p) != negated)?
5282 XCL_PROP : XCL_NOTPROP;
5283 *class_uchardata++ = ptype;
5284 *class_uchardata++ = pdata;
5285 xclass_has_prop = TRUE;
5286 class_has_8bitchar--; /* Undo! */
5287 continue;
5288 }
5289#else
5290 *errorcodeptr = ERR45;
5291 goto FAILED;
5292#endif
5293 /* Unrecognized escapes are faulted if PCRE is running in its
5294 strict mode. By default, for compatibility with Perl, they are
5295 treated as literals. */
5296
5297 default:
5298 if ((options & PCRE_EXTRA) != 0)
5299 {
5300 *errorcodeptr = ERR7;
5301 goto FAILED;
5302 }
5303 class_has_8bitchar--; /* Undo the speculative increase. */
5304 class_one_char -= 2; /* Undo the speculative increase. */
5305 c = *ptr; /* Get the final character and fall through */
5306 break;
5307 }
5308 }
5309
5310 /* Fall through if the escape just defined a single character (c >= 0).
5311 This may be greater than 256. */
5312
5313 escape = 0;
5314
5315 } /* End of backslash handling */
5316
5317 /* A character may be followed by '-' to form a range. However, Perl does
5318 not permit ']' to be the end of the range. A '-' character at the end is
5319 treated as a literal. Perl ignores orphaned \E sequences entirely. The
5320 code for handling \Q and \E is messy. */
5321
5322 CHECK_RANGE:
5323 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5324 {
5325 inescq = FALSE;
5326 ptr += 2;
5327 }
5328 oldptr = ptr;
5329
5330 /* Remember if \r or \n were explicitly used */
5331
5332 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5333
5334 /* Check for range */
5335
5336 if (!inescq && ptr[1] == CHAR_MINUS)
5337 {
5338 pcre_uint32 d;
5339 ptr += 2;
5340 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
5341
5342 /* If we hit \Q (not followed by \E) at this point, go into escaped
5343 mode. */
5344
5345 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
5346 {
5347 ptr += 2;
5348 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
5349 { ptr += 2; continue; }
5350 inescq = TRUE;
5351 break;
5352 }
5353
5354 /* Minus (hyphen) at the end of a class is treated as a literal, so put
5355 back the pointer and jump to handle the character that preceded it. */
5356
5357 if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
5358 {
5359 ptr = oldptr;
5360 goto CLASS_SINGLE_CHARACTER;
5361 }
5362
5363 /* Otherwise, we have a potential range; pick up the next character */
5364
5365#ifdef SUPPORT_UTF
5366 if (utf)
5367 { /* Braces are required because the */
5368 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
5369 }
5370 else
5371#endif
5372 d = *ptr; /* Not UTF-8 mode */
5373
5374 /* The second part of a range can be a single-character escape
5375 sequence, but not any of the other escapes. Perl treats a hyphen as a
5376 literal in such circumstances. However, in Perl's warning mode, a
5377 warning is given, so PCRE now faults it as it is almost certainly a
5378 mistake on the user's part. */
5379
5380 if (!inescq)
5381 {
5382 if (d == CHAR_BACKSLASH)
5383 {
5384 int descape;
5385 descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5386 if (*errorcodeptr != 0) goto FAILED;
5387
5388 /* 0 means a character was put into d; \b is backspace; any other
5389 special causes an error. */
5390
5391 if (descape != 0)
5392 {
5393 if (descape == ESC_b) d = CHAR_BS; else
5394 {
5395 *errorcodeptr = ERR83;
5396 goto FAILED;
5397 }
5398 }
5399 }
5400
5401 /* A hyphen followed by a POSIX class is treated in the same way. */
5402
5403 else if (d == CHAR_LEFT_SQUARE_BRACKET &&
5404 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5405 ptr[1] == CHAR_EQUALS_SIGN) &&
5406 check_posix_syntax(ptr, &tempptr))
5407 {
5408 *errorcodeptr = ERR83;
5409 goto FAILED;
5410 }
5411 }
5412
5413 /* Check that the two values are in the correct order. Optimize
5414 one-character ranges. */
5415
5416 if (d < c)
5417 {
5418 *errorcodeptr = ERR8;
5419 goto FAILED;
5420 }
5421 if (d == c) goto CLASS_SINGLE_CHARACTER; /* A few lines below */
5422
5423 /* We have found a character range, so single character optimizations
5424 cannot be done anymore. Any value greater than 1 indicates that there
5425 is more than one character. */
5426
5427 class_one_char = 2;
5428
5429 /* Remember an explicit \r or \n, and add the range to the class. */
5430
5431 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5432
5433 class_has_8bitchar +=
5434 add_to_class(classbits, &class_uchardata, options, cd, c, d);
5435
5436 continue; /* Go get the next char in the class */
5437 }
5438
5439 /* Handle a single character - we can get here for a normal non-escape
5440 char, or after \ that introduces a single character or for an apparent
5441 range that isn't. Only the value 1 matters for class_one_char, so don't
5442 increase it if it is already 2 or more ... just in case there's a class
5443 with a zillion characters in it. */
5444
5445 CLASS_SINGLE_CHARACTER:
5446 if (class_one_char < 2) class_one_char++;
5447
5448 /* If xclass_has_prop is false and class_one_char is 1, we have the first
5449 single character in the class, and there have been no prior ranges, or
5450 XCLASS items generated by escapes. If this is the final character in the
5451 class, we can optimize by turning the item into a 1-character OP_CHAR[I]
5452 if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
5453 can cause firstchar to be set. Otherwise, there can be no first char if
5454 this item is first, whatever repeat count may follow. In the case of
5455 reqchar, save the previous value for reinstating. */
5456
5457 if (!inescq &&
5458#ifdef SUPPORT_UCP
5459 !xclass_has_prop &&
5460#endif
5461 class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
5462 {
5463 ptr++;
5464 zeroreqchar = reqchar;
5465 zeroreqcharflags = reqcharflags;
5466
5467 if (negate_class)
5468 {
5469#ifdef SUPPORT_UCP
5470 int d;
5471#endif
5472 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5473 zerofirstchar = firstchar;
5474 zerofirstcharflags = firstcharflags;
5475
5476 /* For caseless UTF-8 mode when UCP support is available, check
5477 whether this character has more than one other case. If so, generate
5478 a special OP_NOTPROP item instead of OP_NOTI. */
5479
5480#ifdef SUPPORT_UCP
5481 if (utf && (options & PCRE_CASELESS) != 0 &&
5482 (d = UCD_CASESET(c)) != 0)
5483 {
5484 *code++ = OP_NOTPROP;
5485 *code++ = PT_CLIST;
5486 *code++ = d;
5487 }
5488 else
5489#endif
5490 /* Char has only one other case, or UCP not available */
5491
5492 {
5493 *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
5494#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5495 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5496 code += PRIV(ord2utf)(c, code);
5497 else
5498#endif
5499 *code++ = c;
5500 }
5501
5502 /* We are finished with this character class */
5503
5504 goto END_CLASS;
5505 }
5506
5507 /* For a single, positive character, get the value into mcbuffer, and
5508 then we can handle this with the normal one-character code. */
5509
5510#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5511 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5512 mclength = PRIV(ord2utf)(c, mcbuffer);
5513 else
5514#endif
5515 {
5516 mcbuffer[0] = c;
5517 mclength = 1;
5518 }
5519 goto ONE_CHAR;
5520 } /* End of 1-char optimization */
5521
5522 /* There is more than one character in the class, or an XCLASS item
5523 has been generated. Add this character to the class. */
5524
5525 class_has_8bitchar +=
5526 add_to_class(classbits, &class_uchardata, options, cd, c, c);
5527 }
5528
5529 /* Loop until ']' reached. This "while" is the end of the "do" far above.
5530 If we are at the end of an internal nested string, revert to the outer
5531 string. */
5532
5533 while (((c = *(++ptr)) != CHAR_NULL ||
5534 (nestptr != NULL &&
5535 (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
5536 (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
5537
5538 /* Check for missing terminating ']' */
5539
5540 if (c == CHAR_NULL)
5541 {
5542 *errorcodeptr = ERR6;
5543 goto FAILED;
5544 }
5545
5546 /* We will need an XCLASS if data has been placed in class_uchardata. In
5547 the second phase this is a sufficient test. However, in the pre-compile
5548 phase, class_uchardata gets emptied to prevent workspace overflow, so
5549 only if the very last character in the class needs XCLASS will it contain
5550 anything at this point. For this reason, xclass gets set TRUE above when
5551 class_uchardata is emptied, and that's why this code is the way it is here
5552 instead of just doing a test on class_uchardata below. */
5553
5554#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5555 if (class_uchardata > class_uchardata_base) xclass = TRUE;
5556#endif
5557
5558 /* If this is the first thing in the branch, there can be no first char
5559 setting, whatever the repeat count. Any reqchar setting must remain
5560 unchanged after any kind of repeat. */
5561
5562 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5563 zerofirstchar = firstchar;
5564 zerofirstcharflags = firstcharflags;
5565 zeroreqchar = reqchar;
5566 zeroreqcharflags = reqcharflags;
5567
5568 /* If there are characters with values > 255, we have to compile an
5569 extended class, with its own opcode, unless there was a negated special
5570 such as \S in the class, and PCRE_UCP is not set, because in that case all
5571 characters > 255 are in the class, so any that were explicitly given as
5572 well can be ignored. If (when there are explicit characters > 255 that must
5573 be listed) there are no characters < 256, we can omit the bitmap in the
5574 actual compiled code. */
5575
5576#ifdef SUPPORT_UTF
5577 if (xclass && (xclass_has_prop || !should_flip_negation ||
5578 (options & PCRE_UCP) != 0))
5579#elif !defined COMPILE_PCRE8
5580 if (xclass && (xclass_has_prop || !should_flip_negation))
5581#endif
5582#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5583 {
5584 /* For non-UCP wide characters, in a non-negative class containing \S or
5585 similar (should_flip_negation is set), all characters greater than 255
5586 must be in the class. */
5587
5588 if (
5589#if defined COMPILE_PCRE8
5590 utf &&
5591#endif
5592 should_flip_negation && !negate_class && (options & PCRE_UCP) == 0)
5593 {
5594 *class_uchardata++ = XCL_RANGE;
5595 if (utf) /* Will always be utf in the 8-bit library */
5596 {
5597 class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5598 class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
5599 }
5600 else /* Can only happen for the 16-bit & 32-bit libraries */
5601 {
5602#if defined COMPILE_PCRE16
5603 *class_uchardata++ = 0x100;
5604 *class_uchardata++ = 0xffffu;
5605#elif defined COMPILE_PCRE32
5606 *class_uchardata++ = 0x100;
5607 *class_uchardata++ = 0xffffffffu;
5608#endif
5609 }
5610 }
5611
5612 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
5613 *code++ = OP_XCLASS;
5614 code += LINK_SIZE;
5615 *code = negate_class? XCL_NOT:0;
5616 if (xclass_has_prop) *code |= XCL_HASPROP;
5617
5618 /* If the map is required, move up the extra data to make room for it;
5619 otherwise just move the code pointer to the end of the extra data. */
5620
5621 if (class_has_8bitchar > 0)
5622 {
5623 *code++ |= XCL_MAP;
5624 memmove(code + (32 / sizeof(pcre_uchar)), code,
5625 IN_UCHARS(class_uchardata - code));
5626 if (negate_class && !xclass_has_prop)
5627 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5628 memcpy(code, classbits, 32);
5629 code = class_uchardata + (32 / sizeof(pcre_uchar));
5630 }
5631 else code = class_uchardata;
5632
5633 /* Now fill in the complete length of the item */
5634
5635 PUT(previous, 1, (int)(code - previous));
5636 break; /* End of class handling */
5637 }
5638
5639 /* Even though any XCLASS list is now discarded, we must allow for
5640 its memory. */
5641
5642 if (lengthptr != NULL)
5643 *lengthptr += (int)(class_uchardata - class_uchardata_base);
5644#endif
5645
5646 /* If there are no characters > 255, or they are all to be included or
5647 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5648 whole class was negated and whether there were negative specials such as \S
5649 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5650 negating it if necessary. */
5651
5652 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5653 if (lengthptr == NULL) /* Save time in the pre-compile phase */
5654 {
5655 if (negate_class)
5656 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5657 memcpy(code, classbits, 32);
5658 }
5659 code += 32 / sizeof(pcre_uchar);
5660
5661 END_CLASS:
5662 break;
5663
5664
5665 /* ===================================================================*/
5666 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5667 has been tested above. */
5668
5669 case CHAR_LEFT_CURLY_BRACKET:
5670 if (!is_quantifier) goto NORMAL_CHAR;
5671 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5672 if (*errorcodeptr != 0) goto FAILED;
5673 goto REPEAT;
5674
5675 case CHAR_ASTERISK:
5676 repeat_min = 0;
5677 repeat_max = -1;
5678 goto REPEAT;
5679
5680 case CHAR_PLUS:
5681 repeat_min = 1;
5682 repeat_max = -1;
5683 goto REPEAT;
5684
5685 case CHAR_QUESTION_MARK:
5686 repeat_min = 0;
5687 repeat_max = 1;
5688
5689 REPEAT:
5690 if (previous == NULL)
5691 {
5692 *errorcodeptr = ERR9;
5693 goto FAILED;
5694 }
5695
5696 if (repeat_min == 0)
5697 {
5698 firstchar = zerofirstchar; /* Adjust for zero repeat */
5699 firstcharflags = zerofirstcharflags;
5700 reqchar = zeroreqchar; /* Ditto */
5701 reqcharflags = zeroreqcharflags;
5702 }
5703
5704 /* Remember whether this is a variable length repeat */
5705
5706 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5707
5708 op_type = 0; /* Default single-char op codes */
5709 possessive_quantifier = FALSE; /* Default not possessive quantifier */
5710
5711 /* Save start of previous item, in case we have to move it up in order to
5712 insert something before it. */
5713
5714 tempcode = previous;
5715
5716 /* Before checking for a possessive quantifier, we must skip over
5717 whitespace and comments in extended mode because Perl allows white space at
5718 this point. */
5719
5720 if ((options & PCRE_EXTENDED) != 0)
5721 {
5722 const pcre_uchar *p = ptr + 1;
5723 for (;;)
5724 {
5725 while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++;
5726 if (*p != CHAR_NUMBER_SIGN) break;
5727 p++;
5728 while (*p != CHAR_NULL)
5729 {
5730 if (IS_NEWLINE(p)) /* For non-fixed-length newline cases, */
5731 { /* IS_NEWLINE sets cd->nllen. */
5732 p += cd->nllen;
5733 break;
5734 }
5735 p++;
5736#ifdef SUPPORT_UTF
5737 if (utf) FORWARDCHAR(p);
5738#endif
5739 } /* Loop for comment characters */
5740 } /* Loop for multiple comments */
5741 ptr = p - 1; /* Character before the next significant one. */
5742 }
5743
5744 /* If the next character is '+', we have a possessive quantifier. This
5745 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5746 If the next character is '?' this is a minimizing repeat, by default,
5747 but if PCRE_UNGREEDY is set, it works the other way round. We change the
5748 repeat type to the non-default. */
5749
5750 if (ptr[1] == CHAR_PLUS)
5751 {
5752 repeat_type = 0; /* Force greedy */
5753 possessive_quantifier = TRUE;
5754 ptr++;
5755 }
5756 else if (ptr[1] == CHAR_QUESTION_MARK)
5757 {
5758 repeat_type = greedy_non_default;
5759 ptr++;
5760 }
5761 else repeat_type = greedy_default;
5762
5763 /* If previous was a recursion call, wrap it in atomic brackets so that
5764 previous becomes the atomic group. All recursions were so wrapped in the
5765 past, but it no longer happens for non-repeated recursions. In fact, the
5766 repeated ones could be re-implemented independently so as not to need this,
5767 but for the moment we rely on the code for repeating groups. */
5768
5769 if (*previous == OP_RECURSE)
5770 {
5771 memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
5772 *previous = OP_ONCE;
5773 PUT(previous, 1, 2 + 2*LINK_SIZE);
5774 previous[2 + 2*LINK_SIZE] = OP_KET;
5775 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5776 code += 2 + 2 * LINK_SIZE;
5777 length_prevgroup = 3 + 3*LINK_SIZE;
5778
5779 /* When actually compiling, we need to check whether this was a forward
5780 reference, and if so, adjust the offset. */
5781
5782 if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
5783 {
5784 int offset = GET(cd->hwm, -LINK_SIZE);
5785 if (offset == previous + 1 - cd->start_code)
5786 PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
5787 }
5788 }
5789
5790 /* Now handle repetition for the different types of item. */
5791
5792 /* If previous was a character or negated character match, abolish the item
5793 and generate a repeat item instead. If a char item has a minimum of more
5794 than one, ensure that it is set in reqchar - it might not be if a sequence
5795 such as x{3} is the first thing in a branch because the x will have gone
5796 into firstchar instead. */
5797
5798 if (*previous == OP_CHAR || *previous == OP_CHARI
5799 || *previous == OP_NOT || *previous == OP_NOTI)
5800 {
5801 switch (*previous)
5802 {
5803 default: /* Make compiler happy. */
5804 case OP_CHAR: op_type = OP_STAR - OP_STAR; break;
5805 case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5806 case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break;
5807 case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break;
5808 }
5809
5810 /* Deal with UTF characters that take up more than one character. It's
5811 easier to write this out separately than try to macrify it. Use c to
5812 hold the length of the character in bytes, plus UTF_LENGTH to flag that
5813 it's a length rather than a small character. */
5814
5815#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5816 if (utf && NOT_FIRSTCHAR(code[-1]))
5817 {
5818 pcre_uchar *lastchar = code - 1;
5819 BACKCHAR(lastchar);
5820 c = (int)(code - lastchar); /* Length of UTF-8 character */
5821 memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
5822 c |= UTF_LENGTH; /* Flag c as a length */
5823 }
5824 else
5825#endif /* SUPPORT_UTF */
5826
5827 /* Handle the case of a single character - either with no UTF support, or
5828 with UTF disabled, or for a single character UTF character. */
5829 {
5830 c = code[-1];
5831 if (*previous <= OP_CHARI && repeat_min > 1)
5832 {
5833 reqchar = c;
5834 reqcharflags = req_caseopt | cd->req_varyopt;
5835 }
5836 }
5837
5838 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
5839 }
5840
5841 /* If previous was a character type match (\d or similar), abolish it and
5842 create a suitable repeat item. The code is shared with single-character
5843 repeats by setting op_type to add a suitable offset into repeat_type. Note
5844 that the Unicode property types will be present only when SUPPORT_UCP is
5845 defined, but we don't wrap the little bits of code here because it just
5846 makes it horribly messy. */
5847
5848 else if (*previous < OP_EODN)
5849 {
5850 pcre_uchar *oldcode;
5851 int prop_type, prop_value;
5852 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
5853 c = *previous;
5854
5855 OUTPUT_SINGLE_REPEAT:
5856 if (*previous == OP_PROP || *previous == OP_NOTPROP)
5857 {
5858 prop_type = previous[1];
5859 prop_value = previous[2];
5860 }
5861 else prop_type = prop_value = -1;
5862
5863 oldcode = code;
5864 code = previous; /* Usually overwrite previous item */
5865
5866 /* If the maximum is zero then the minimum must also be zero; Perl allows
5867 this case, so we do too - by simply omitting the item altogether. */
5868
5869 if (repeat_max == 0) goto END_REPEAT;
5870
5871 /* Combine the op_type with the repeat_type */
5872
5873 repeat_type += op_type;
5874
5875 /* A minimum of zero is handled either as the special case * or ?, or as
5876 an UPTO, with the maximum given. */
5877
5878 if (repeat_min == 0)
5879 {
5880 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5881 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5882 else
5883 {
5884 *code++ = OP_UPTO + repeat_type;
5885 PUT2INC(code, 0, repeat_max);
5886 }
5887 }
5888
5889 /* A repeat minimum of 1 is optimized into some special cases. If the
5890 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5891 left in place and, if the maximum is greater than 1, we use OP_UPTO with
5892 one less than the maximum. */
5893
5894 else if (repeat_min == 1)
5895 {
5896 if (repeat_max == -1)
5897 *code++ = OP_PLUS + repeat_type;
5898 else
5899 {
5900 code = oldcode; /* leave previous item in place */
5901 if (repeat_max == 1) goto END_REPEAT;
5902 *code++ = OP_UPTO + repeat_type;
5903 PUT2INC(code, 0, repeat_max - 1);
5904 }
5905 }
5906
5907 /* The case {n,n} is just an EXACT, while the general case {n,m} is
5908 handled as an EXACT followed by an UPTO. */
5909
5910 else
5911 {
5912 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
5913 PUT2INC(code, 0, repeat_min);
5914
5915 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
5916 we have to insert the character for the previous code. For a repeated
5917 Unicode property match, there are two extra bytes that define the
5918 required property. In UTF-8 mode, long characters have their length in
5919 c, with the UTF_LENGTH bit as a flag. */
5920
5921 if (repeat_max < 0)
5922 {
5923#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5924 if (utf && (c & UTF_LENGTH) != 0)
5925 {
5926 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5927 code += c & 7;
5928 }
5929 else
5930#endif
5931 {
5932 *code++ = c;
5933 if (prop_type >= 0)
5934 {
5935 *code++ = prop_type;
5936 *code++ = prop_value;
5937 }
5938 }
5939 *code++ = OP_STAR + repeat_type;
5940 }
5941
5942 /* Else insert an UPTO if the max is greater than the min, again
5943 preceded by the character, for the previously inserted code. If the
5944 UPTO is just for 1 instance, we can use QUERY instead. */
5945
5946 else if (repeat_max != repeat_min)
5947 {
5948#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5949 if (utf && (c & UTF_LENGTH) != 0)
5950 {
5951 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5952 code += c & 7;
5953 }
5954 else
5955#endif
5956 *code++ = c;
5957 if (prop_type >= 0)
5958 {
5959 *code++ = prop_type;
5960 *code++ = prop_value;
5961 }
5962 repeat_max -= repeat_min;
5963
5964 if (repeat_max == 1)
5965 {
5966 *code++ = OP_QUERY + repeat_type;
5967 }
5968 else
5969 {
5970 *code++ = OP_UPTO + repeat_type;
5971 PUT2INC(code, 0, repeat_max);
5972 }
5973 }
5974 }
5975
5976 /* The character or character type itself comes last in all cases. */
5977
5978#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5979 if (utf && (c & UTF_LENGTH) != 0)
5980 {
5981 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5982 code += c & 7;
5983 }
5984 else
5985#endif
5986 *code++ = c;
5987
5988 /* For a repeated Unicode property match, there are two extra bytes that
5989 define the required property. */
5990
5991#ifdef SUPPORT_UCP
5992 if (prop_type >= 0)
5993 {
5994 *code++ = prop_type;
5995 *code++ = prop_value;
5996 }
5997#endif
5998 }
5999
6000 /* If previous was a character class or a back reference, we put the repeat
6001 stuff after it, but just skip the item if the repeat was {0,0}. */
6002
6003 else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
6004#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6005 *previous == OP_XCLASS ||
6006#endif
6007 *previous == OP_REF || *previous == OP_REFI ||
6008 *previous == OP_DNREF || *previous == OP_DNREFI)
6009 {
6010 if (repeat_max == 0)
6011 {
6012 code = previous;
6013 goto END_REPEAT;
6014 }
6015
6016 if (repeat_min == 0 && repeat_max == -1)
6017 *code++ = OP_CRSTAR + repeat_type;
6018 else if (repeat_min == 1 && repeat_max == -1)
6019 *code++ = OP_CRPLUS + repeat_type;
6020 else if (repeat_min == 0 && repeat_max == 1)
6021 *code++ = OP_CRQUERY + repeat_type;
6022 else
6023 {
6024 *code++ = OP_CRRANGE + repeat_type;
6025 PUT2INC(code, 0, repeat_min);
6026 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
6027 PUT2INC(code, 0, repeat_max);
6028 }
6029 }
6030
6031 /* If previous was a bracket group, we may have to replicate it in certain
6032 cases. Note that at this point we can encounter only the "basic" bracket
6033 opcodes such as BRA and CBRA, as this is the place where they get converted
6034 into the more special varieties such as BRAPOS and SBRA. A test for >=
6035 OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
6036 ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
6037 Originally, PCRE did not allow repetition of assertions, but now it does,
6038 for Perl compatibility. */
6039
6040 else if (*previous >= OP_ASSERT && *previous <= OP_COND)
6041 {
6042 register int i;
6043 int len = (int)(code - previous);
6044 size_t base_hwm_offset = item_hwm_offset;
6045 pcre_uchar *bralink = NULL;
6046 pcre_uchar *brazeroptr = NULL;
6047
6048 /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
6049 we just ignore the repeat. */
6050
6051 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
6052 goto END_REPEAT;
6053
6054 /* There is no sense in actually repeating assertions. The only potential
6055 use of repetition is in cases when the assertion is optional. Therefore,
6056 if the minimum is greater than zero, just ignore the repeat. If the
6057 maximum is not zero or one, set it to 1. */
6058
6059 if (*previous < OP_ONCE) /* Assertion */
6060 {
6061 if (repeat_min > 0) goto END_REPEAT;
6062 if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
6063 }
6064
6065 /* The case of a zero minimum is special because of the need to stick
6066 OP_BRAZERO in front of it, and because the group appears once in the
6067 data, whereas in other cases it appears the minimum number of times. For
6068 this reason, it is simplest to treat this case separately, as otherwise
6069 the code gets far too messy. There are several special subcases when the
6070 minimum is zero. */
6071
6072 if (repeat_min == 0)
6073 {
6074 /* If the maximum is also zero, we used to just omit the group from the
6075 output altogether, like this:
6076
6077 ** if (repeat_max == 0)
6078 ** {
6079 ** code = previous;
6080 ** goto END_REPEAT;
6081 ** }
6082
6083 However, that fails when a group or a subgroup within it is referenced
6084 as a subroutine from elsewhere in the pattern, so now we stick in
6085 OP_SKIPZERO in front of it so that it is skipped on execution. As we
6086 don't have a list of which groups are referenced, we cannot do this
6087 selectively.
6088
6089 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
6090 and do no more at this point. However, we do need to adjust any
6091 OP_RECURSE calls inside the group that refer to the group itself or any
6092 internal or forward referenced group, because the offset is from the
6093 start of the whole regex. Temporarily terminate the pattern while doing
6094 this. */
6095
6096 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
6097 {
6098 *code = OP_END;
6099 adjust_recurse(previous, 1, utf, cd, item_hwm_offset);
6100 memmove(previous + 1, previous, IN_UCHARS(len));
6101 code++;
6102 if (repeat_max == 0)
6103 {
6104 *previous++ = OP_SKIPZERO;
6105 goto END_REPEAT;
6106 }
6107 brazeroptr = previous; /* Save for possessive optimizing */
6108 *previous++ = OP_BRAZERO + repeat_type;
6109 }
6110
6111 /* If the maximum is greater than 1 and limited, we have to replicate
6112 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
6113 The first one has to be handled carefully because it's the original
6114 copy, which has to be moved up. The remainder can be handled by code
6115 that is common with the non-zero minimum case below. We have to
6116 adjust the value of repeat_max, since one less copy is required. Once
6117 again, we may have to adjust any OP_RECURSE calls inside the group. */
6118
6119 else
6120 {
6121 int offset;
6122 *code = OP_END;
6123 adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, item_hwm_offset);
6124 memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
6125 code += 2 + LINK_SIZE;
6126 *previous++ = OP_BRAZERO + repeat_type;
6127 *previous++ = OP_BRA;
6128
6129 /* We chain together the bracket offset fields that have to be
6130 filled in later when the ends of the brackets are reached. */
6131
6132 offset = (bralink == NULL)? 0 : (int)(previous - bralink);
6133 bralink = previous;
6134 PUTINC(previous, 0, offset);
6135 }
6136
6137 repeat_max--;
6138 }
6139
6140 /* If the minimum is greater than zero, replicate the group as many
6141 times as necessary, and adjust the maximum to the number of subsequent
6142 copies that we need. If we set a first char from the group, and didn't
6143 set a required char, copy the latter from the former. If there are any
6144 forward reference subroutine calls in the group, there will be entries on
6145 the workspace list; replicate these with an appropriate increment. */
6146
6147 else
6148 {
6149 if (repeat_min > 1)
6150 {
6151 /* In the pre-compile phase, we don't actually do the replication. We
6152 just adjust the length as if we had. Do some paranoid checks for
6153 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
6154 integer type when available, otherwise double. */
6155
6156 if (lengthptr != NULL)
6157 {
6158 int delta = (repeat_min - 1)*length_prevgroup;
6159 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
6160 (INT64_OR_DOUBLE)length_prevgroup >
6161 (INT64_OR_DOUBLE)INT_MAX ||
6162 OFLOW_MAX - *lengthptr < delta)
6163 {
6164 *errorcodeptr = ERR20;
6165 goto FAILED;
6166 }
6167 *lengthptr += delta;
6168 }
6169
6170 /* This is compiling for real. If there is a set first byte for
6171 the group, and we have not yet set a "required byte", set it. Make
6172 sure there is enough workspace for copying forward references before
6173 doing the copy. */
6174
6175 else
6176 {
6177 if (groupsetfirstchar && reqcharflags < 0)
6178 {
6179 reqchar = firstchar;
6180 reqcharflags = firstcharflags;
6181 }
6182
6183 for (i = 1; i < repeat_min; i++)
6184 {
6185 pcre_uchar *hc;
6186 size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6187 memcpy(code, previous, IN_UCHARS(len));
6188
6189 while (cd->hwm > cd->start_workspace + cd->workspace_size -
6190 WORK_SIZE_SAFETY_MARGIN -
6191 (this_hwm_offset - base_hwm_offset))
6192 {
6193 *errorcodeptr = expand_workspace(cd);
6194 if (*errorcodeptr != 0) goto FAILED;
6195 }
6196
6197 for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6198 hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6199 hc += LINK_SIZE)
6200 {
6201 PUT(cd->hwm, 0, GET(hc, 0) + len);
6202 cd->hwm += LINK_SIZE;
6203 }
6204 base_hwm_offset = this_hwm_offset;
6205 code += len;
6206 }
6207 }
6208 }
6209
6210 if (repeat_max > 0) repeat_max -= repeat_min;
6211 }
6212
6213 /* This code is common to both the zero and non-zero minimum cases. If
6214 the maximum is limited, it replicates the group in a nested fashion,
6215 remembering the bracket starts on a stack. In the case of a zero minimum,
6216 the first one was set up above. In all cases the repeat_max now specifies
6217 the number of additional copies needed. Again, we must remember to
6218 replicate entries on the forward reference list. */
6219
6220 if (repeat_max >= 0)
6221 {
6222 /* In the pre-compile phase, we don't actually do the replication. We
6223 just adjust the length as if we had. For each repetition we must add 1
6224 to the length for BRAZERO and for all but the last repetition we must
6225 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
6226 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
6227 a 64-bit integer type when available, otherwise double. */
6228
6229 if (lengthptr != NULL && repeat_max > 0)
6230 {
6231 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
6232 2 - 2*LINK_SIZE; /* Last one doesn't nest */
6233 if ((INT64_OR_DOUBLE)repeat_max *
6234 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
6235 > (INT64_OR_DOUBLE)INT_MAX ||
6236 OFLOW_MAX - *lengthptr < delta)
6237 {
6238 *errorcodeptr = ERR20;
6239 goto FAILED;
6240 }
6241 *lengthptr += delta;
6242 }
6243
6244 /* This is compiling for real */
6245
6246 else for (i = repeat_max - 1; i >= 0; i--)
6247 {
6248 pcre_uchar *hc;
6249 size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6250
6251 *code++ = OP_BRAZERO + repeat_type;
6252
6253 /* All but the final copy start a new nesting, maintaining the
6254 chain of brackets outstanding. */
6255
6256 if (i != 0)
6257 {
6258 int offset;
6259 *code++ = OP_BRA;
6260 offset = (bralink == NULL)? 0 : (int)(code - bralink);
6261 bralink = code;
6262 PUTINC(code, 0, offset);
6263 }
6264
6265 memcpy(code, previous, IN_UCHARS(len));
6266
6267 /* Ensure there is enough workspace for forward references before
6268 copying them. */
6269
6270 while (cd->hwm > cd->start_workspace + cd->workspace_size -
6271 WORK_SIZE_SAFETY_MARGIN -
6272 (this_hwm_offset - base_hwm_offset))
6273 {
6274 *errorcodeptr = expand_workspace(cd);
6275 if (*errorcodeptr != 0) goto FAILED;
6276 }
6277
6278 for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6279 hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6280 hc += LINK_SIZE)
6281 {
6282 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
6283 cd->hwm += LINK_SIZE;
6284 }
6285 base_hwm_offset = this_hwm_offset;
6286 code += len;
6287 }
6288
6289 /* Now chain through the pending brackets, and fill in their length
6290 fields (which are holding the chain links pro tem). */
6291
6292 while (bralink != NULL)
6293 {
6294 int oldlinkoffset;
6295 int offset = (int)(code - bralink + 1);
6296 pcre_uchar *bra = code - offset;
6297 oldlinkoffset = GET(bra, 1);
6298 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
6299 *code++ = OP_KET;
6300 PUTINC(code, 0, offset);
6301 PUT(bra, 1, offset);
6302 }
6303 }
6304
6305 /* If the maximum is unlimited, set a repeater in the final copy. For
6306 ONCE brackets, that's all we need to do. However, possessively repeated
6307 ONCE brackets can be converted into non-capturing brackets, as the
6308 behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
6309 deal with possessive ONCEs specially.
6310
6311 Otherwise, when we are doing the actual compile phase, check to see
6312 whether this group is one that could match an empty string. If so,
6313 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
6314 that runtime checking can be done. [This check is also applied to ONCE
6315 groups at runtime, but in a different way.]
6316
6317 Then, if the quantifier was possessive and the bracket is not a
6318 conditional, we convert the BRA code to the POS form, and the KET code to
6319 KETRPOS. (It turns out to be convenient at runtime to detect this kind of
6320 subpattern at both the start and at the end.) The use of special opcodes
6321 makes it possible to reduce greatly the stack usage in pcre_exec(). If
6322 the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
6323
6324 Then, if the minimum number of matches is 1 or 0, cancel the possessive
6325 flag so that the default action below, of wrapping everything inside
6326 atomic brackets, does not happen. When the minimum is greater than 1,
6327 there will be earlier copies of the group, and so we still have to wrap
6328 the whole thing. */
6329
6330 else
6331 {
6332 pcre_uchar *ketcode = code - 1 - LINK_SIZE;
6333 pcre_uchar *bracode = ketcode - GET(ketcode, 1);
6334
6335 /* Convert possessive ONCE brackets to non-capturing */
6336
6337 if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
6338 possessive_quantifier) *bracode = OP_BRA;
6339
6340 /* For non-possessive ONCE brackets, all we need to do is to
6341 set the KET. */
6342
6343 if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
6344 *ketcode = OP_KETRMAX + repeat_type;
6345
6346 /* Handle non-ONCE brackets and possessive ONCEs (which have been
6347 converted to non-capturing above). */
6348
6349 else
6350 {
6351 /* In the compile phase, check for empty string matching. */
6352
6353 if (lengthptr == NULL)
6354 {
6355 pcre_uchar *scode = bracode;
6356 do
6357 {
6358 if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
6359 {
6360 *bracode += OP_SBRA - OP_BRA;
6361 break;
6362 }
6363 scode += GET(scode, 1);
6364 }
6365 while (*scode == OP_ALT);
6366 }
6367
6368 /* A conditional group with only one branch has an implicit empty
6369 alternative branch. */
6370
6371 if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
6372 *bracode = OP_SCOND;
6373
6374 /* Handle possessive quantifiers. */
6375
6376 if (possessive_quantifier)
6377 {
6378 /* For COND brackets, we wrap the whole thing in a possessively
6379 repeated non-capturing bracket, because we have not invented POS
6380 versions of the COND opcodes. Because we are moving code along, we
6381 must ensure that any pending recursive references are updated. */
6382
6383 if (*bracode == OP_COND || *bracode == OP_SCOND)
6384 {
6385 int nlen = (int)(code - bracode);
6386 *code = OP_END;
6387 adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6388 memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
6389 code += 1 + LINK_SIZE;
6390 nlen += 1 + LINK_SIZE;
6391 *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
6392 *code++ = OP_KETRPOS;
6393 PUTINC(code, 0, nlen);
6394 PUT(bracode, 1, nlen);
6395 }
6396
6397 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
6398
6399 else
6400 {
6401 *bracode += 1; /* Switch to xxxPOS opcodes */
6402 *ketcode = OP_KETRPOS;
6403 }
6404
6405 /* If the minimum is zero, mark it as possessive, then unset the
6406 possessive flag when the minimum is 0 or 1. */
6407
6408 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
6409 if (repeat_min < 2) possessive_quantifier = FALSE;
6410 }
6411
6412 /* Non-possessive quantifier */
6413
6414 else *ketcode = OP_KETRMAX + repeat_type;
6415 }
6416 }
6417 }
6418
6419 /* If previous is OP_FAIL, it was generated by an empty class [] in
6420 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
6421 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
6422 error above. We can just ignore the repeat in the JS case. */
6423
6424 else if (*previous == OP_FAIL) goto END_REPEAT;
6425
6426 /* Else there's some kind of shambles */
6427
6428 else
6429 {
6430 *errorcodeptr = ERR11;
6431 goto FAILED;
6432 }
6433
6434 /* If the character following a repeat is '+', possessive_quantifier is
6435 TRUE. For some opcodes, there are special alternative opcodes for this
6436 case. For anything else, we wrap the entire repeated item inside OP_ONCE
6437 brackets. Logically, the '+' notation is just syntactic sugar, taken from
6438 Sun's Java package, but the special opcodes can optimize it.
6439
6440 Some (but not all) possessively repeated subpatterns have already been
6441 completely handled in the code just above. For them, possessive_quantifier
6442 is always FALSE at this stage. Note that the repeated item starts at
6443 tempcode, not at previous, which might be the first part of a string whose
6444 (former) last char we repeated. */
6445
6446 if (possessive_quantifier)
6447 {
6448 int len;
6449
6450 /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6451 However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6452 {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6453 remains is greater than zero, there's a further opcode that can be
6454 handled. If not, do nothing, leaving the EXACT alone. */
6455
6456 switch(*tempcode)
6457 {
6458 case OP_TYPEEXACT:
6459 tempcode += PRIV(OP_lengths)[*tempcode] +
6460 ((tempcode[1 + IMM2_SIZE] == OP_PROP
6461 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6462 break;
6463
6464 /* CHAR opcodes are used for exacts whose count is 1. */
6465
6466 case OP_CHAR:
6467 case OP_CHARI:
6468 case OP_NOT:
6469 case OP_NOTI:
6470 case OP_EXACT:
6471 case OP_EXACTI:
6472 case OP_NOTEXACT:
6473 case OP_NOTEXACTI:
6474 tempcode += PRIV(OP_lengths)[*tempcode];
6475#ifdef SUPPORT_UTF
6476 if (utf && HAS_EXTRALEN(tempcode[-1]))
6477 tempcode += GET_EXTRALEN(tempcode[-1]);
6478#endif
6479 break;
6480
6481 /* For the class opcodes, the repeat operator appears at the end;
6482 adjust tempcode to point to it. */
6483
6484 case OP_CLASS:
6485 case OP_NCLASS:
6486 tempcode += 1 + 32/sizeof(pcre_uchar);
6487 break;
6488
6489#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6490 case OP_XCLASS:
6491 tempcode += GET(tempcode, 1);
6492 break;
6493#endif
6494 }
6495
6496 /* If tempcode is equal to code (which points to the end of the repeated
6497 item), it means we have skipped an EXACT item but there is no following
6498 QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6499 all other cases, tempcode will be pointing to the repeat opcode, and will
6500 be less than code, so the value of len will be greater than 0. */
6501
6502 len = (int)(code - tempcode);
6503 if (len > 0)
6504 {
6505 unsigned int repcode = *tempcode;
6506
6507 /* There is a table for possessifying opcodes, all of which are less
6508 than OP_CALLOUT. A zero entry means there is no possessified version.
6509 */
6510
6511 if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6512 *tempcode = opcode_possessify[repcode];
6513
6514 /* For an opcode without a special possessified version, wrap the item in
6515 ONCE brackets. Because we are moving code along, we must ensure that any
6516 pending recursive references are updated. */
6517
6518 else
6519 {
6520 *code = OP_END;
6521 adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6522 memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6523 code += 1 + LINK_SIZE;
6524 len += 1 + LINK_SIZE;
6525 tempcode[0] = OP_ONCE;
6526 *code++ = OP_KET;
6527 PUTINC(code, 0, len);
6528 PUT(tempcode, 1, len);
6529 }
6530 }
6531
6532#ifdef NEVER
6533 if (len > 0) switch (*tempcode)
6534 {
6535 case OP_STAR: *tempcode = OP_POSSTAR; break;
6536 case OP_PLUS: *tempcode = OP_POSPLUS; break;
6537 case OP_QUERY: *tempcode = OP_POSQUERY; break;
6538 case OP_UPTO: *tempcode = OP_POSUPTO; break;
6539
6540 case OP_STARI: *tempcode = OP_POSSTARI; break;
6541 case OP_PLUSI: *tempcode = OP_POSPLUSI; break;
6542 case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
6543 case OP_UPTOI: *tempcode = OP_POSUPTOI; break;
6544
6545 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
6546 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
6547 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
6548 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
6549
6550 case OP_NOTSTARI: *tempcode = OP_NOTPOSSTARI; break;
6551 case OP_NOTPLUSI: *tempcode = OP_NOTPOSPLUSI; break;
6552 case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
6553 case OP_NOTUPTOI: *tempcode = OP_NOTPOSUPTOI; break;
6554
6555 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
6556 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
6557 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6558 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
6559
6560 case OP_CRSTAR: *tempcode = OP_CRPOSSTAR; break;
6561 case OP_CRPLUS: *tempcode = OP_CRPOSPLUS; break;
6562 case OP_CRQUERY: *tempcode = OP_CRPOSQUERY; break;
6563 case OP_CRRANGE: *tempcode = OP_CRPOSRANGE; break;
6564
6565 /* Because we are moving code along, we must ensure that any
6566 pending recursive references are updated. */
6567
6568 default:
6569 *code = OP_END;
6570 adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6571 memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6572 code += 1 + LINK_SIZE;
6573 len += 1 + LINK_SIZE;
6574 tempcode[0] = OP_ONCE;
6575 *code++ = OP_KET;
6576 PUTINC(code, 0, len);
6577 PUT(tempcode, 1, len);
6578 break;
6579 }
6580#endif
6581 }
6582
6583 /* In all cases we no longer have a previous item. We also set the
6584 "follows varying string" flag for subsequently encountered reqchars if
6585 it isn't already set and we have just passed a varying length item. */
6586
6587 END_REPEAT:
6588 previous = NULL;
6589 cd->req_varyopt |= reqvary;
6590 break;
6591
6592
6593 /* ===================================================================*/
6594 /* Start of nested parenthesized sub-expression, or comment or lookahead or
6595 lookbehind or option setting or condition or all the other extended
6596 parenthesis forms. */
6597
6598 case CHAR_LEFT_PARENTHESIS:
6599 ptr++;
6600
6601 /* Now deal with various "verbs" that can be introduced by '*'. */
6602
6603 if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
6604 || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
6605 {
6606 int i, namelen;
6607 int arglen = 0;
6608 const char *vn = verbnames;
6609 const pcre_uchar *name = ptr + 1;
6610 const pcre_uchar *arg = NULL;
6611 previous = NULL;
6612 ptr++;
6613 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
6614 namelen = (int)(ptr - name);
6615
6616 /* It appears that Perl allows any characters whatsoever, other than
6617 a closing parenthesis, to appear in arguments, so we no longer insist on
6618 letters, digits, and underscores. */
6619
6620 if (*ptr == CHAR_COLON)
6621 {
6622 arg = ++ptr;
6623 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6624 arglen = (int)(ptr - arg);
6625 if ((unsigned int)arglen > MAX_MARK)
6626 {
6627 *errorcodeptr = ERR75;
6628 goto FAILED;
6629 }
6630 }
6631
6632 if (*ptr != CHAR_RIGHT_PARENTHESIS)
6633 {
6634 *errorcodeptr = ERR60;
6635 goto FAILED;
6636 }
6637
6638 /* Scan the table of verb names */
6639
6640 for (i = 0; i < verbcount; i++)
6641 {
6642 if (namelen == verbs[i].len &&
6643 STRNCMP_UC_C8(name, vn, namelen) == 0)
6644 {
6645 int setverb;
6646
6647 /* Check for open captures before ACCEPT and convert it to
6648 ASSERT_ACCEPT if in an assertion. */
6649
6650 if (verbs[i].op == OP_ACCEPT)
6651 {
6652 open_capitem *oc;
6653 if (arglen != 0)
6654 {
6655 *errorcodeptr = ERR59;
6656 goto FAILED;
6657 }
6658 cd->had_accept = TRUE;
6659 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6660 {
6661 if (lengthptr != NULL)
6662 {
6663#ifdef COMPILE_PCRE8
6664 *lengthptr += 1 + IMM2_SIZE;
6665#elif defined COMPILE_PCRE16
6666 *lengthptr += 2 + IMM2_SIZE;
6667#elif defined COMPILE_PCRE32
6668 *lengthptr += 4 + IMM2_SIZE;
6669#endif
6670 }
6671 else
6672 {
6673 *code++ = OP_CLOSE;
6674 PUT2INC(code, 0, oc->number);
6675 }
6676 }
6677 setverb = *code++ =
6678 (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6679
6680 /* Do not set firstchar after *ACCEPT */
6681 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6682 }
6683
6684 /* Handle other cases with/without an argument */
6685
6686 else if (arglen == 0)
6687 {
6688 if (verbs[i].op < 0) /* Argument is mandatory */
6689 {
6690 *errorcodeptr = ERR66;
6691 goto FAILED;
6692 }
6693 setverb = *code++ = verbs[i].op;
6694 }
6695
6696 else
6697 {
6698 if (verbs[i].op_arg < 0) /* Argument is forbidden */
6699 {
6700 *errorcodeptr = ERR59;
6701 goto FAILED;
6702 }
6703 setverb = *code++ = verbs[i].op_arg;
6704 if (lengthptr != NULL) /* In pass 1 just add in the length */
6705 { /* to avoid potential workspace */
6706 *lengthptr += arglen; /* overflow. */
6707 *code++ = 0;
6708 }
6709 else
6710 {
6711 *code++ = arglen;
6712 memcpy(code, arg, IN_UCHARS(arglen));
6713 code += arglen;
6714 }
6715 *code++ = 0;
6716 }
6717
6718 switch (setverb)
6719 {
6720 case OP_THEN:
6721 case OP_THEN_ARG:
6722 cd->external_flags |= PCRE_HASTHEN;
6723 break;
6724
6725 case OP_PRUNE:
6726 case OP_PRUNE_ARG:
6727 case OP_SKIP:
6728 case OP_SKIP_ARG:
6729 cd->had_pruneorskip = TRUE;
6730 break;
6731 }
6732
6733 break; /* Found verb, exit loop */
6734 }
6735
6736 vn += verbs[i].len + 1;
6737 }
6738
6739 if (i < verbcount) continue; /* Successfully handled a verb */
6740 *errorcodeptr = ERR60; /* Verb not recognized */
6741 goto FAILED;
6742 }
6743
6744 /* Initialize for "real" parentheses */
6745
6746 newoptions = options;
6747 skipbytes = 0;
6748 bravalue = OP_CBRA;
6749 item_hwm_offset = cd->hwm - cd->start_workspace;
6750 reset_bracount = FALSE;
6751
6752 /* Deal with the extended parentheses; all are introduced by '?', and the
6753 appearance of any of them means that this is not a capturing group. */
6754
6755 if (*ptr == CHAR_QUESTION_MARK)
6756 {
6757 int i, set, unset, namelen;
6758 int *optset;
6759 const pcre_uchar *name;
6760 pcre_uchar *slot;
6761
6762 switch (*(++ptr))
6763 {
6764 /* ------------------------------------------------------------ */
6765 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
6766 reset_bracount = TRUE;
6767 cd->dupgroups = TRUE; /* Record (?| encountered */
6768 /* Fall through */
6769
6770 /* ------------------------------------------------------------ */
6771 case CHAR_COLON: /* Non-capturing bracket */
6772 bravalue = OP_BRA;
6773 ptr++;
6774 break;
6775
6776
6777 /* ------------------------------------------------------------ */
6778 case CHAR_LEFT_PARENTHESIS:
6779 bravalue = OP_COND; /* Conditional group */
6780 tempptr = ptr;
6781
6782 /* A condition can be an assertion, a number (referring to a numbered
6783 group's having been set), a name (referring to a named group), or 'R',
6784 referring to recursion. R<digits> and R&name are also permitted for
6785 recursion tests.
6786
6787 There are ways of testing a named group: (?(name)) is used by Python;
6788 Perl 5.10 onwards uses (?(<name>) or (?('name')).
6789
6790 There is one unfortunate ambiguity, caused by history. 'R' can be the
6791 recursive thing or the name 'R' (and similarly for 'R' followed by
6792 digits). We look for a name first; if not found, we try the other case.
6793
6794 For compatibility with auto-callouts, we allow a callout to be
6795 specified before a condition that is an assertion. First, check for the
6796 syntax of a callout; if found, adjust the temporary pointer that is
6797 used to check for an assertion condition. That's all that is needed! */
6798
6799 if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6800 {
6801 for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6802 if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6803 tempptr += i + 1;
6804
6805 /* tempptr should now be pointing to the opening parenthesis of the
6806 assertion condition. */
6807
6808 if (*tempptr != CHAR_LEFT_PARENTHESIS)
6809 {
6810 *errorcodeptr = ERR28;
6811 goto FAILED;
6812 }
6813 }
6814
6815 /* For conditions that are assertions, check the syntax, and then exit
6816 the switch. This will take control down to where bracketed groups,
6817 including assertions, are processed. */
6818
6819 if (tempptr[1] == CHAR_QUESTION_MARK &&
6820 (tempptr[2] == CHAR_EQUALS_SIGN ||
6821 tempptr[2] == CHAR_EXCLAMATION_MARK ||
6822 (tempptr[2] == CHAR_LESS_THAN_SIGN &&
6823 (tempptr[3] == CHAR_EQUALS_SIGN ||
6824 tempptr[3] == CHAR_EXCLAMATION_MARK))))
6825 {
6826 cd->iscondassert = TRUE;
6827 break;
6828 }
6829
6830 /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6831 need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6832
6833 code[1+LINK_SIZE] = OP_CREF;
6834 skipbytes = 1+IMM2_SIZE;
6835 refsign = -1; /* => not a number */
6836 namelen = -1; /* => not a name; must set to avoid warning */
6837 name = NULL; /* Always set to avoid warning */
6838 recno = 0; /* Always set to avoid warning */
6839
6840 /* Check for a test for recursion in a named group. */
6841
6842 ptr++;
6843 if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
6844 {
6845 terminator = -1;
6846 ptr += 2;
6847 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
6848 }
6849
6850 /* Check for a test for a named group's having been set, using the Perl
6851 syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6852 syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
6853
6854 else if (*ptr == CHAR_LESS_THAN_SIGN)
6855 {
6856 terminator = CHAR_GREATER_THAN_SIGN;
6857 ptr++;
6858 }
6859 else if (*ptr == CHAR_APOSTROPHE)
6860 {
6861 terminator = CHAR_APOSTROPHE;
6862 ptr++;
6863 }
6864 else
6865 {
6866 terminator = CHAR_NULL;
6867 if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
6868 else if (IS_DIGIT(*ptr)) refsign = 0;
6869 }
6870
6871 /* Handle a number */
6872
6873 if (refsign >= 0)
6874 {
6875 while (IS_DIGIT(*ptr))
6876 {
6877 if (recno > INT_MAX / 10 - 1) /* Integer overflow */
6878 {
6879 while (IS_DIGIT(*ptr)) ptr++;
6880 *errorcodeptr = ERR61;
6881 goto FAILED;
6882 }
6883 recno = recno * 10 + (int)(*ptr - CHAR_0);
6884 ptr++;
6885 }
6886 }
6887
6888 /* Otherwise we expect to read a name; anything else is an error. When
6889 a name is one of a number of duplicates, a different opcode is used and
6890 it needs more memory. Unfortunately we cannot tell whether a name is a
6891 duplicate in the first pass, so we have to allow for more memory. */
6892
6893 else
6894 {
6895 if (IS_DIGIT(*ptr))
6896 {
6897 *errorcodeptr = ERR84;
6898 goto FAILED;
6899 }
6900 if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_word) == 0)
6901 {
6902 *errorcodeptr = ERR28; /* Assertion expected */
6903 goto FAILED;
6904 }
6905 name = ptr++;
6906 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6907 {
6908 ptr++;
6909 }
6910 namelen = (int)(ptr - name);
6911 if (lengthptr != NULL) skipbytes += IMM2_SIZE;
6912 }
6913
6914 /* Check the terminator */
6915
6916 if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6917 *ptr++ != CHAR_RIGHT_PARENTHESIS)
6918 {
6919 ptr--; /* Error offset */
6920 *errorcodeptr = ERR26; /* Malformed number or name */
6921 goto FAILED;
6922 }
6923
6924 /* Do no further checking in the pre-compile phase. */
6925
6926 if (lengthptr != NULL) break;
6927
6928 /* In the real compile we do the work of looking for the actual
6929 reference. If refsign is not negative, it means we have a number in
6930 recno. */
6931
6932 if (refsign >= 0)
6933 {
6934 if (recno <= 0)
6935 {
6936 *errorcodeptr = ERR35;
6937 goto FAILED;
6938 }
6939 if (refsign != 0) recno = (refsign == CHAR_MINUS)?
6940 cd->bracount - recno + 1 : recno + cd->bracount;
6941 if (recno <= 0 || recno > cd->final_bracount)
6942 {
6943 *errorcodeptr = ERR15;
6944 goto FAILED;
6945 }
6946 PUT2(code, 2+LINK_SIZE, recno);
6947 if (recno > cd->top_backref) cd->top_backref = recno;
6948 break;
6949 }
6950
6951 /* Otherwise look for the name. */
6952
6953 slot = cd->name_table;
6954 for (i = 0; i < cd->names_found; i++)
6955 {
6956 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
6957 slot[IMM2_SIZE+namelen] == 0) break;
6958 slot += cd->name_entry_size;
6959 }
6960
6961 /* Found the named subpattern. If the name is duplicated, add one to
6962 the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6963 appropriate data values. Otherwise, just insert the unique subpattern
6964 number. */
6965
6966 if (i < cd->names_found)
6967 {
6968 int offset = i++;
6969 int count = 1;
6970 recno = GET2(slot, 0); /* Number from first found */
6971 if (recno > cd->top_backref) cd->top_backref = recno;
6972 for (; i < cd->names_found; i++)
6973 {
6974 slot += cd->name_entry_size;
6975 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0 ||
6976 (slot+IMM2_SIZE)[namelen] != 0) break;
6977 count++;
6978 }
6979
6980 if (count > 1)
6981 {
6982 PUT2(code, 2+LINK_SIZE, offset);
6983 PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6984 skipbytes += IMM2_SIZE;
6985 code[1+LINK_SIZE]++;
6986 }
6987 else /* Not a duplicated name */
6988 {
6989 PUT2(code, 2+LINK_SIZE, recno);
6990 }
6991 }
6992
6993 /* If terminator == CHAR_NULL it means that the name followed directly
6994 after the opening parenthesis [e.g. (?(abc)...] and in this case there
6995 are some further alternatives to try. For the cases where terminator !=
6996 CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
6997 we have now checked all the possibilities, so give an error. */
6998
6999 else if (terminator != CHAR_NULL)
7000 {
7001 *errorcodeptr = ERR15;
7002 goto FAILED;
7003 }
7004
7005 /* Check for (?(R) for recursion. Allow digits after R to specify a
7006 specific group number. */
7007
7008 else if (*name == CHAR_R)
7009 {
7010 recno = 0;
7011 for (i = 1; i < namelen; i++)
7012 {
7013 if (!IS_DIGIT(name[i]))
7014 {
7015 *errorcodeptr = ERR15;
7016 goto FAILED;
7017 }
7018 if (recno > INT_MAX / 10 - 1) /* Integer overflow */
7019 {
7020 *errorcodeptr = ERR61;
7021 goto FAILED;
7022 }
7023 recno = recno * 10 + name[i] - CHAR_0;
7024 }
7025 if (recno == 0) recno = RREF_ANY;
7026 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
7027 PUT2(code, 2+LINK_SIZE, recno);
7028 }
7029
7030 /* Similarly, check for the (?(DEFINE) "condition", which is always
7031 false. */
7032
7033 else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
7034 {
7035 code[1+LINK_SIZE] = OP_DEF;
7036 skipbytes = 1;
7037 }
7038
7039 /* Reference to an unidentified subpattern. */
7040
7041 else
7042 {
7043 *errorcodeptr = ERR15;
7044 goto FAILED;
7045 }
7046 break;
7047
7048
7049 /* ------------------------------------------------------------ */
7050 case CHAR_EQUALS_SIGN: /* Positive lookahead */
7051 bravalue = OP_ASSERT;
7052 cd->assert_depth += 1;
7053 ptr++;
7054 break;
7055
7056 /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
7057 thing to do, but Perl allows all assertions to be quantified, and when
7058 they contain capturing parentheses there may be a potential use for
7059 this feature. Not that that applies to a quantified (?!) but we allow
7060 it for uniformity. */
7061
7062 /* ------------------------------------------------------------ */
7063 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
7064 ptr++;
7065 if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
7066 ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
7067 (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
7068 {
7069 *code++ = OP_FAIL;
7070 previous = NULL;
7071 continue;
7072 }
7073 bravalue = OP_ASSERT_NOT;
7074 cd->assert_depth += 1;
7075 break;
7076
7077
7078 /* ------------------------------------------------------------ */
7079 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
7080 switch (ptr[1])
7081 {
7082 case CHAR_EQUALS_SIGN: /* Positive lookbehind */
7083 bravalue = OP_ASSERTBACK;
7084 cd->assert_depth += 1;
7085 ptr += 2;
7086 break;
7087
7088 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
7089 bravalue = OP_ASSERTBACK_NOT;
7090 cd->assert_depth += 1;
7091 ptr += 2;
7092 break;
7093
7094 default: /* Could be name define, else bad */
7095 if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
7096 goto DEFINE_NAME;
7097 ptr++; /* Correct offset for error */
7098 *errorcodeptr = ERR24;
7099 goto FAILED;
7100 }
7101 break;
7102
7103
7104 /* ------------------------------------------------------------ */
7105 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
7106 bravalue = OP_ONCE;
7107 ptr++;
7108 break;
7109
7110
7111 /* ------------------------------------------------------------ */
7112 case CHAR_C: /* Callout - may be followed by digits; */
7113 previous_callout = code; /* Save for later completion */
7114 after_manual_callout = 1; /* Skip one item before completing */
7115 *code++ = OP_CALLOUT;
7116 {
7117 int n = 0;
7118 ptr++;
7119 while(IS_DIGIT(*ptr))
7120 n = n * 10 + *ptr++ - CHAR_0;
7121 if (*ptr != CHAR_RIGHT_PARENTHESIS)
7122 {
7123 *errorcodeptr = ERR39;
7124 goto FAILED;
7125 }
7126 if (n > 255)
7127 {
7128 *errorcodeptr = ERR38;
7129 goto FAILED;
7130 }
7131 *code++ = n;
7132 PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
7133 PUT(code, LINK_SIZE, 0); /* Default length */
7134 code += 2 * LINK_SIZE;
7135 }
7136 previous = NULL;
7137 continue;
7138
7139
7140 /* ------------------------------------------------------------ */
7141 case CHAR_P: /* Python-style named subpattern handling */
7142 if (*(++ptr) == CHAR_EQUALS_SIGN ||
7143 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
7144 {
7145 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
7146 terminator = CHAR_RIGHT_PARENTHESIS;
7147 goto NAMED_REF_OR_RECURSE;
7148 }
7149 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
7150 {
7151 *errorcodeptr = ERR41;
7152 goto FAILED;
7153 }
7154 /* Fall through to handle (?P< as (?< is handled */
7155
7156
7157 /* ------------------------------------------------------------ */
7158 DEFINE_NAME: /* Come here from (?< handling */
7159 case CHAR_APOSTROPHE:
7160 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
7161 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7162 name = ++ptr;
7163 if (IS_DIGIT(*ptr))
7164 {
7165 *errorcodeptr = ERR84; /* Group name must start with non-digit */
7166 goto FAILED;
7167 }
7168 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7169 namelen = (int)(ptr - name);
7170
7171 /* In the pre-compile phase, do a syntax check, remember the longest
7172 name, and then remember the group in a vector, expanding it if
7173 necessary. Duplicates for the same number are skipped; other duplicates
7174 are checked for validity. In the actual compile, there is nothing to
7175 do. */
7176
7177 if (lengthptr != NULL)
7178 {
7179 named_group *ng;
7180 pcre_uint32 number = cd->bracount + 1;
7181
7182 if (*ptr != (pcre_uchar)terminator)
7183 {
7184 *errorcodeptr = ERR42;
7185 goto FAILED;
7186 }
7187
7188 if (cd->names_found >= MAX_NAME_COUNT)
7189 {
7190 *errorcodeptr = ERR49;
7191 goto FAILED;
7192 }
7193
7194 if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
7195 {
7196 cd->name_entry_size = namelen + IMM2_SIZE + 1;
7197 if (namelen > MAX_NAME_SIZE)
7198 {
7199 *errorcodeptr = ERR48;
7200 goto FAILED;
7201 }
7202 }
7203
7204 /* Scan the list to check for duplicates. For duplicate names, if the
7205 number is the same, break the loop, which causes the name to be
7206 discarded; otherwise, if DUPNAMES is not set, give an error.
7207 If it is set, allow the name with a different number, but continue
7208 scanning in case this is a duplicate with the same number. For
7209 non-duplicate names, give an error if the number is duplicated. */
7210
7211 ng = cd->named_groups;
7212 for (i = 0; i < cd->names_found; i++, ng++)
7213 {
7214 if (namelen == ng->length &&
7215 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7216 {
7217 if (ng->number == number) break;
7218 if ((options & PCRE_DUPNAMES) == 0)
7219 {
7220 *errorcodeptr = ERR43;
7221 goto FAILED;
7222 }
7223 cd->dupnames = TRUE; /* Duplicate names exist */
7224 }
7225 else if (ng->number == number)
7226 {
7227 *errorcodeptr = ERR65;
7228 goto FAILED;
7229 }
7230 }
7231
7232 if (i >= cd->names_found) /* Not a duplicate with same number */
7233 {
7234 /* Increase the list size if necessary */
7235
7236 if (cd->names_found >= cd->named_group_list_size)
7237 {
7238 int newsize = cd->named_group_list_size * 2;
7239 named_group *newspace = (PUBL(malloc))
7240 (newsize * sizeof(named_group));
7241
7242 if (newspace == NULL)
7243 {
7244 *errorcodeptr = ERR21;
7245 goto FAILED;
7246 }
7247
7248 memcpy(newspace, cd->named_groups,
7249 cd->named_group_list_size * sizeof(named_group));
7250 if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
7251 (PUBL(free))((void *)cd->named_groups);
7252 cd->named_groups = newspace;
7253 cd->named_group_list_size = newsize;
7254 }
7255
7256 cd->named_groups[cd->names_found].name = name;
7257 cd->named_groups[cd->names_found].length = namelen;
7258 cd->named_groups[cd->names_found].number = number;
7259 cd->names_found++;
7260 }
7261 }
7262
7263 ptr++; /* Move past > or ' in both passes. */
7264 goto NUMBERED_GROUP;
7265
7266
7267 /* ------------------------------------------------------------ */
7268 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
7269 terminator = CHAR_RIGHT_PARENTHESIS;
7270 is_recurse = TRUE;
7271 /* Fall through */
7272
7273 /* We come here from the Python syntax above that handles both
7274 references (?P=name) and recursion (?P>name), as well as falling
7275 through from the Perl recursion syntax (?&name). We also come here from
7276 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
7277 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
7278
7279 NAMED_REF_OR_RECURSE:
7280 name = ++ptr;
7281 if (IS_DIGIT(*ptr))
7282 {
7283 *errorcodeptr = ERR84; /* Group name must start with non-digit */
7284 goto FAILED;
7285 }
7286 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7287 namelen = (int)(ptr - name);
7288
7289 /* In the pre-compile phase, do a syntax check. We used to just set
7290 a dummy reference number, because it was not used in the first pass.
7291 However, with the change of recursive back references to be atomic,
7292 we have to look for the number so that this state can be identified, as
7293 otherwise the incorrect length is computed. If it's not a backwards
7294 reference, the dummy number will do. */
7295
7296 if (lengthptr != NULL)
7297 {
7298 named_group *ng;
7299 recno = 0;
7300
7301 if (namelen == 0)
7302 {
7303 *errorcodeptr = ERR62;
7304 goto FAILED;
7305 }
7306 if (*ptr != (pcre_uchar)terminator)
7307 {
7308 *errorcodeptr = ERR42;
7309 goto FAILED;
7310 }
7311 if (namelen > MAX_NAME_SIZE)
7312 {
7313 *errorcodeptr = ERR48;
7314 goto FAILED;
7315 }
7316
7317 /* Count named back references. */
7318
7319 if (!is_recurse) cd->namedrefcount++;
7320
7321 /* We have to allow for a named reference to a duplicated name (this
7322 cannot be determined until the second pass). This needs an extra
7323 16-bit data item. */
7324
7325 *lengthptr += IMM2_SIZE;
7326
7327 /* If this is a forward reference and we are within a (?|...) group,
7328 the reference may end up as the number of a group which we are
7329 currently inside, that is, it could be a recursive reference. In the
7330 real compile this will be picked up and the reference wrapped with
7331 OP_ONCE to make it atomic, so we must allow space in case this occurs. */
7332
7333 /* In fact, this can happen for a non-forward reference because
7334 another group with the same number might be created later. This
7335 issue is fixed "properly" in PCRE2. As PCRE1 is now in maintenance
7336 only mode, we finesse the bug by allowing more memory always. */
7337
7338 *lengthptr += 4 + 4*LINK_SIZE;
7339
7340 /* It is even worse than that. The current reference may be to an
7341 existing named group with a different number (so apparently not
7342 recursive) but which later on is also attached to a group with the
7343 current number. This can only happen if (?| has been previously
7344 encountered. In that case, we allow yet more memory, just in case.
7345 (Again, this is fixed "properly" in PCRE2.) */
7346
7347 if (cd->dupgroups) *lengthptr += 4 + 4*LINK_SIZE;
7348
7349 /* Otherwise, check for recursion here. The name table does not exist
7350 in the first pass; instead we must scan the list of names encountered
7351 so far in order to get the number. If the name is not found, leave
7352 the value of recno as 0 for a forward reference. */
7353
7354 /* This patch (removing "else") fixes a problem when a reference is
7355 to multiple identically named nested groups from within the nest.
7356 Once again, it is not the "proper" fix, and it results in an
7357 over-allocation of memory. */
7358
7359 /* else */
7360 {
7361 ng = cd->named_groups;
7362 for (i = 0; i < cd->names_found; i++, ng++)
7363 {
7364 if (namelen == ng->length &&
7365 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7366 {
7367 open_capitem *oc;
7368 recno = ng->number;
7369 if (is_recurse) break;
7370 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7371 {
7372 if (oc->number == recno)
7373 {
7374 oc->flag = TRUE;
7375 break;
7376 }
7377 }
7378 }
7379 }
7380 }
7381 }
7382
7383 /* In the real compile, search the name table. We check the name
7384 first, and then check that we have reached the end of the name in the
7385 table. That way, if the name is longer than any in the table, the
7386 comparison will fail without reading beyond the table entry. */
7387
7388 else
7389 {
7390 slot = cd->name_table;
7391 for (i = 0; i < cd->names_found; i++)
7392 {
7393 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
7394 slot[IMM2_SIZE+namelen] == 0)
7395 break;
7396 slot += cd->name_entry_size;
7397 }
7398
7399 if (i < cd->names_found)
7400 {
7401 recno = GET2(slot, 0);
7402 }
7403 else
7404 {
7405 *errorcodeptr = ERR15;
7406 goto FAILED;
7407 }
7408 }
7409
7410 /* In both phases, for recursions, we can now go to the code that
7411 handles numerical recursion. */
7412
7413 if (is_recurse) goto HANDLE_RECURSION;
7414
7415 /* In the second pass we must see if the name is duplicated. If so, we
7416 generate a different opcode. */
7417
7418 if (lengthptr == NULL && cd->dupnames)
7419 {
7420 int count = 1;
7421 unsigned int index = i;
7422 pcre_uchar *cslot = slot + cd->name_entry_size;
7423
7424 for (i++; i < cd->names_found; i++)
7425 {
7426 if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
7427 count++;
7428 cslot += cd->name_entry_size;
7429 }
7430
7431 if (count > 1)
7432 {
7433 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7434 previous = code;
7435 item_hwm_offset = cd->hwm - cd->start_workspace;
7436 *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7437 PUT2INC(code, 0, index);
7438 PUT2INC(code, 0, count);
7439
7440 /* Process each potentially referenced group. */
7441
7442 for (; slot < cslot; slot += cd->name_entry_size)
7443 {
7444 open_capitem *oc;
7445 recno = GET2(slot, 0);
7446 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
7447 if (recno > cd->top_backref) cd->top_backref = recno;
7448
7449 /* Check to see if this back reference is recursive, that is, it
7450 is inside the group that it references. A flag is set so that the
7451 group can be made atomic. */
7452
7453 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7454 {
7455 if (oc->number == recno)
7456 {
7457 oc->flag = TRUE;
7458 break;
7459 }
7460 }
7461 }
7462
7463 continue; /* End of back ref handling */
7464 }
7465 }
7466
7467 /* First pass, or a non-duplicated name. */
7468
7469 goto HANDLE_REFERENCE;
7470
7471
7472 /* ------------------------------------------------------------ */
7473 case CHAR_R: /* Recursion, same as (?0) */
7474 recno = 0;
7475 if (*(++ptr) != CHAR_RIGHT_PARENTHESIS)
7476 {
7477 *errorcodeptr = ERR29;
7478 goto FAILED;
7479 }
7480 goto HANDLE_RECURSION;
7481
7482
7483 /* ------------------------------------------------------------ */
7484 case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
7485 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
7486 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
7487 {
7488 const pcre_uchar *called;
7489 terminator = CHAR_RIGHT_PARENTHESIS;
7490
7491 /* Come here from the \g<...> and \g'...' code (Oniguruma
7492 compatibility). However, the syntax has been checked to ensure that
7493 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
7494 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
7495 ever be taken. */
7496
7497 HANDLE_NUMERICAL_RECURSION:
7498
7499 if ((refsign = *ptr) == CHAR_PLUS)
7500 {
7501 ptr++;
7502 if (!IS_DIGIT(*ptr))
7503 {
7504 *errorcodeptr = ERR63;
7505 goto FAILED;
7506 }
7507 }
7508 else if (refsign == CHAR_MINUS)
7509 {
7510 if (!IS_DIGIT(ptr[1]))
7511 goto OTHER_CHAR_AFTER_QUERY;
7512 ptr++;
7513 }
7514
7515 recno = 0;
7516 while(IS_DIGIT(*ptr))
7517 {
7518 if (recno > INT_MAX / 10 - 1) /* Integer overflow */
7519 {
7520 while (IS_DIGIT(*ptr)) ptr++;
7521 *errorcodeptr = ERR61;
7522 goto FAILED;
7523 }
7524 recno = recno * 10 + *ptr++ - CHAR_0;
7525 }
7526
7527 if (*ptr != (pcre_uchar)terminator)
7528 {
7529 *errorcodeptr = ERR29;
7530 goto FAILED;
7531 }
7532
7533 if (refsign == CHAR_MINUS)
7534 {
7535 if (recno == 0)
7536 {
7537 *errorcodeptr = ERR58;
7538 goto FAILED;
7539 }
7540 recno = cd->bracount - recno + 1;
7541 if (recno <= 0)
7542 {
7543 *errorcodeptr = ERR15;
7544 goto FAILED;
7545 }
7546 }
7547 else if (refsign == CHAR_PLUS)
7548 {
7549 if (recno == 0)
7550 {
7551 *errorcodeptr = ERR58;
7552 goto FAILED;
7553 }
7554 recno += cd->bracount;
7555 }
7556
7557 /* Come here from code above that handles a named recursion */
7558
7559 HANDLE_RECURSION:
7560
7561 previous = code;
7562 item_hwm_offset = cd->hwm - cd->start_workspace;
7563 called = cd->start_code;
7564
7565 /* When we are actually compiling, find the bracket that is being
7566 referenced. Temporarily end the regex in case it doesn't exist before
7567 this point. If we end up with a forward reference, first check that
7568 the bracket does occur later so we can give the error (and position)
7569 now. Then remember this forward reference in the workspace so it can
7570 be filled in at the end. */
7571
7572 if (lengthptr == NULL)
7573 {
7574 *code = OP_END;
7575 if (recno != 0)
7576 called = PRIV(find_bracket)(cd->start_code, utf, recno);
7577
7578 /* Forward reference */
7579
7580 if (called == NULL)
7581 {
7582 if (recno > cd->final_bracount)
7583 {
7584 *errorcodeptr = ERR15;
7585 goto FAILED;
7586 }
7587
7588 /* Fudge the value of "called" so that when it is inserted as an
7589 offset below, what is actually inserted is the reference number
7590 of the group. Then remember the forward reference. */
7591
7592 called = cd->start_code + recno;
7593 if (cd->hwm >= cd->start_workspace + cd->workspace_size -
7594 WORK_SIZE_SAFETY_MARGIN)
7595 {
7596 *errorcodeptr = expand_workspace(cd);
7597 if (*errorcodeptr != 0) goto FAILED;
7598 }
7599 PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
7600 }
7601
7602 /* If not a forward reference, and the subpattern is still open,
7603 this is a recursive call. We check to see if this is a left
7604 recursion that could loop for ever, and diagnose that case. We
7605 must not, however, do this check if we are in a conditional
7606 subpattern because the condition might be testing for recursion in
7607 a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
7608 Forever loops are also detected at runtime, so those that occur in
7609 conditional subpatterns will be picked up then. */
7610
7611 else if (GET(called, 1) == 0 && cond_depth <= 0 &&
7612 could_be_empty(called, code, bcptr, utf, cd))
7613 {
7614 *errorcodeptr = ERR40;
7615 goto FAILED;
7616 }
7617 }
7618
7619 /* Insert the recursion/subroutine item. It does not have a set first
7620 character (relevant if it is repeated, because it will then be
7621 wrapped with ONCE brackets). */
7622
7623 *code = OP_RECURSE;
7624 PUT(code, 1, (int)(called - cd->start_code));
7625 code += 1 + LINK_SIZE;
7626 groupsetfirstchar = FALSE;
7627 }
7628
7629 /* Can't determine a first byte now */
7630
7631 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7632 continue;
7633
7634
7635 /* ------------------------------------------------------------ */
7636 default: /* Other characters: check option setting */
7637 OTHER_CHAR_AFTER_QUERY:
7638 set = unset = 0;
7639 optset = &set;
7640
7641 while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
7642 {
7643 switch (*ptr++)
7644 {
7645 case CHAR_MINUS: optset = &unset; break;
7646
7647 case CHAR_J: /* Record that it changed in the external options */
7648 *optset |= PCRE_DUPNAMES;
7649 cd->external_flags |= PCRE_JCHANGED;
7650 break;
7651
7652 case CHAR_i: *optset |= PCRE_CASELESS; break;
7653 case CHAR_m: *optset |= PCRE_MULTILINE; break;
7654 case CHAR_s: *optset |= PCRE_DOTALL; break;
7655 case CHAR_x: *optset |= PCRE_EXTENDED; break;
7656 case CHAR_U: *optset |= PCRE_UNGREEDY; break;
7657 case CHAR_X: *optset |= PCRE_EXTRA; break;
7658
7659 default: *errorcodeptr = ERR12;
7660 ptr--; /* Correct the offset */
7661 goto FAILED;
7662 }
7663 }
7664
7665 /* Set up the changed option bits, but don't change anything yet. */
7666
7667 newoptions = (options | set) & (~unset);
7668
7669 /* If the options ended with ')' this is not the start of a nested
7670 group with option changes, so the options change at this level.
7671 If we are not at the pattern start, reset the greedy defaults and the
7672 case value for firstchar and reqchar. */
7673
7674 if (*ptr == CHAR_RIGHT_PARENTHESIS)
7675 {
7676 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
7677 greedy_non_default = greedy_default ^ 1;
7678 req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
7679
7680 /* Change options at this level, and pass them back for use
7681 in subsequent branches. */
7682
7683 *optionsptr = options = newoptions;
7684 previous = NULL; /* This item can't be repeated */
7685 continue; /* It is complete */
7686 }
7687
7688 /* If the options ended with ':' we are heading into a nested group
7689 with possible change of options. Such groups are non-capturing and are
7690 not assertions of any kind. All we need to do is skip over the ':';
7691 the newoptions value is handled below. */
7692
7693 bravalue = OP_BRA;
7694 ptr++;
7695 } /* End of switch for character following (? */
7696 } /* End of (? handling */
7697
7698 /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
7699 is set, all unadorned brackets become non-capturing and behave like (?:...)
7700 brackets. */
7701
7702 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
7703 {
7704 bravalue = OP_BRA;
7705 }
7706
7707 /* Else we have a capturing group. */
7708
7709 else
7710 {
7711 NUMBERED_GROUP:
7712 cd->bracount += 1;
7713 PUT2(code, 1+LINK_SIZE, cd->bracount);
7714 skipbytes = IMM2_SIZE;
7715 }
7716
7717 /* Process nested bracketed regex. First check for parentheses nested too
7718 deeply. */
7719
7720 if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT)
7721 {
7722 *errorcodeptr = ERR82;
7723 goto FAILED;
7724 }
7725
7726 /* All assertions used not to be repeatable, but this was changed for Perl
7727 compatibility. All kinds can now be repeated except for assertions that are
7728 conditions (Perl also forbids these to be repeated). We copy code into a
7729 non-register variable (tempcode) in order to be able to pass its address
7730 because some compilers complain otherwise. At the start of a conditional
7731 group whose condition is an assertion, cd->iscondassert is set. We unset it
7732 here so as to allow assertions later in the group to be quantified. */
7733
7734 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT &&
7735 cd->iscondassert)
7736 {
7737 previous = NULL;
7738 cd->iscondassert = FALSE;
7739 }
7740 else
7741 {
7742 previous = code;
7743 item_hwm_offset = cd->hwm - cd->start_workspace;
7744 }
7745
7746 *code = bravalue;
7747 tempcode = code;
7748 tempreqvary = cd->req_varyopt; /* Save value before bracket */
7749 tempbracount = cd->bracount; /* Save value before bracket */
7750 length_prevgroup = 0; /* Initialize for pre-compile phase */
7751
7752 if (!compile_regex(
7753 newoptions, /* The complete new option state */
7754 &tempcode, /* Where to put code (updated) */
7755 &ptr, /* Input pointer (updated) */
7756 errorcodeptr, /* Where to put an error message */
7757 (bravalue == OP_ASSERTBACK ||
7758 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
7759 reset_bracount, /* True if (?| group */
7760 skipbytes, /* Skip over bracket number */
7761 cond_depth +
7762 ((bravalue == OP_COND)?1:0), /* Depth of condition subpatterns */
7763 &subfirstchar, /* For possible first char */
7764 &subfirstcharflags,
7765 &subreqchar, /* For possible last char */
7766 &subreqcharflags,
7767 bcptr, /* Current branch chain */
7768 cd, /* Tables block */
7769 (lengthptr == NULL)? NULL : /* Actual compile phase */
7770 &length_prevgroup /* Pre-compile phase */
7771 ))
7772 goto FAILED;
7773
7774 cd->parens_depth -= 1;
7775
7776 /* If this was an atomic group and there are no capturing groups within it,
7777 generate OP_ONCE_NC instead of OP_ONCE. */
7778
7779 if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
7780 *code = OP_ONCE_NC;
7781
7782 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
7783 cd->assert_depth -= 1;
7784
7785 /* At the end of compiling, code is still pointing to the start of the
7786 group, while tempcode has been updated to point past the end of the group.
7787 The pattern pointer (ptr) is on the bracket.
7788
7789 If this is a conditional bracket, check that there are no more than
7790 two branches in the group, or just one if it's a DEFINE group. We do this
7791 in the real compile phase, not in the pre-pass, where the whole group may
7792 not be available. */
7793
7794 if (bravalue == OP_COND && lengthptr == NULL)
7795 {
7796 pcre_uchar *tc = code;
7797 int condcount = 0;
7798
7799 do {
7800 condcount++;
7801 tc += GET(tc,1);
7802 }
7803 while (*tc != OP_KET);
7804
7805 /* A DEFINE group is never obeyed inline (the "condition" is always
7806 false). It must have only one branch. */
7807
7808 if (code[LINK_SIZE+1] == OP_DEF)
7809 {
7810 if (condcount > 1)
7811 {
7812 *errorcodeptr = ERR54;
7813 goto FAILED;
7814 }
7815 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
7816 }
7817
7818 /* A "normal" conditional group. If there is just one branch, we must not
7819 make use of its firstchar or reqchar, because this is equivalent to an
7820 empty second branch. */
7821
7822 else
7823 {
7824 if (condcount > 2)
7825 {
7826 *errorcodeptr = ERR27;
7827 goto FAILED;
7828 }
7829 if (condcount == 1) subfirstcharflags = subreqcharflags = REQ_NONE;
7830 }
7831 }
7832
7833 /* Error if hit end of pattern */
7834
7835 if (*ptr != CHAR_RIGHT_PARENTHESIS)
7836 {
7837 *errorcodeptr = ERR14;
7838 goto FAILED;
7839 }
7840
7841 /* In the pre-compile phase, update the length by the length of the group,
7842 less the brackets at either end. Then reduce the compiled code to just a
7843 set of non-capturing brackets so that it doesn't use much memory if it is
7844 duplicated by a quantifier.*/
7845
7846 if (lengthptr != NULL)
7847 {
7848 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
7849 {
7850 *errorcodeptr = ERR20;
7851 goto FAILED;
7852 }
7853 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
7854 code++; /* This already contains bravalue */
7855 PUTINC(code, 0, 1 + LINK_SIZE);
7856 *code++ = OP_KET;
7857 PUTINC(code, 0, 1 + LINK_SIZE);
7858 break; /* No need to waste time with special character handling */
7859 }
7860
7861 /* Otherwise update the main code pointer to the end of the group. */
7862
7863 code = tempcode;
7864
7865 /* For a DEFINE group, required and first character settings are not
7866 relevant. */
7867
7868 if (bravalue == OP_DEF) break;
7869
7870 /* Handle updating of the required and first characters for other types of
7871 group. Update for normal brackets of all kinds, and conditions with two
7872 branches (see code above). If the bracket is followed by a quantifier with
7873 zero repeat, we have to back off. Hence the definition of zeroreqchar and
7874 zerofirstchar outside the main loop so that they can be accessed for the
7875 back off. */
7876
7877 zeroreqchar = reqchar;
7878 zeroreqcharflags = reqcharflags;
7879 zerofirstchar = firstchar;
7880 zerofirstcharflags = firstcharflags;
7881 groupsetfirstchar = FALSE;
7882
7883 if (bravalue >= OP_ONCE)
7884 {
7885 /* If we have not yet set a firstchar in this branch, take it from the
7886 subpattern, remembering that it was set here so that a repeat of more
7887 than one can replicate it as reqchar if necessary. If the subpattern has
7888 no firstchar, set "none" for the whole branch. In both cases, a zero
7889 repeat forces firstchar to "none". */
7890
7891 if (firstcharflags == REQ_UNSET)
7892 {
7893 if (subfirstcharflags >= 0)
7894 {
7895 firstchar = subfirstchar;
7896 firstcharflags = subfirstcharflags;
7897 groupsetfirstchar = TRUE;
7898 }
7899 else firstcharflags = REQ_NONE;
7900 zerofirstcharflags = REQ_NONE;
7901 }
7902
7903 /* If firstchar was previously set, convert the subpattern's firstchar
7904 into reqchar if there wasn't one, using the vary flag that was in
7905 existence beforehand. */
7906
7907 else if (subfirstcharflags >= 0 && subreqcharflags < 0)
7908 {
7909 subreqchar = subfirstchar;
7910 subreqcharflags = subfirstcharflags | tempreqvary;
7911 }
7912
7913 /* If the subpattern set a required byte (or set a first byte that isn't
7914 really the first byte - see above), set it. */
7915
7916 if (subreqcharflags >= 0)
7917 {
7918 reqchar = subreqchar;
7919 reqcharflags = subreqcharflags;
7920 }
7921 }
7922
7923 /* For a forward assertion, we take the reqchar, if set, provided that the
7924 group has also set a first char. This can be helpful if the pattern that
7925 follows the assertion doesn't set a different char. For example, it's
7926 useful for /(?=abcde).+/. We can't set firstchar for an assertion, however
7927 because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
7928 the "real" "a" would then become a reqchar instead of a firstchar. This is
7929 overcome by a scan at the end if there's no firstchar, looking for an
7930 asserted first char. */
7931
7932 else if (bravalue == OP_ASSERT && subreqcharflags >= 0 &&
7933 subfirstcharflags >= 0)
7934 {
7935 reqchar = subreqchar;
7936 reqcharflags = subreqcharflags;
7937 }
7938 break; /* End of processing '(' */
7939
7940
7941 /* ===================================================================*/
7942 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
7943 are arranged to be the negation of the corresponding OP_values in the
7944 default case when PCRE_UCP is not set. For the back references, the values
7945 are the negative of the reference number. Only back references and those types
7946 that consume a character may be repeated. We can test for values between
7947 ESC_b and ESC_Z for the latter; this may have to change if any new ones are
7948 ever created. */
7949
7950 case CHAR_BACKSLASH:
7951 tempptr = ptr;
7952 escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, FALSE);
7953 if (*errorcodeptr != 0) goto FAILED;
7954
7955 if (escape == 0) /* The escape coded a single character */
7956 c = ec;
7957 else
7958 {
7959 /* For metasequences that actually match a character, we disable the
7960 setting of a first character if it hasn't already been set. */
7961
7962 if (firstcharflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
7963 firstcharflags = REQ_NONE;
7964
7965 /* Set values to reset to if this is followed by a zero repeat. */
7966
7967 zerofirstchar = firstchar;
7968 zerofirstcharflags = firstcharflags;
7969 zeroreqchar = reqchar;
7970 zeroreqcharflags = reqcharflags;
7971
7972 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
7973 is a subroutine call by number (Oniguruma syntax). In fact, the value
7974 ESC_g is returned only for these cases. So we don't need to check for <
7975 or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
7976 -n, and for the Perl syntax \g{name} the result is ESC_k (as
7977 that is a synonym for a named back reference). */
7978
7979 if (escape == ESC_g)
7980 {
7981 const pcre_uchar *p;
7982 pcre_uint32 cf;
7983
7984 item_hwm_offset = cd->hwm - cd->start_workspace; /* Normally this is set when '(' is read */
7985 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7986 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7987
/* These two statements stop the compiler from warning about possibly
7989 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
7990 fact, because we do the check for a number below, the paths that
7991 would actually be in error are never taken. */
7992
7993 skipbytes = 0;
7994 reset_bracount = FALSE;
7995
7996 /* If it's not a signed or unsigned number, treat it as a name. */
7997
7998 cf = ptr[1];
7999 if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
8000 {
8001 is_recurse = TRUE;
8002 goto NAMED_REF_OR_RECURSE;
8003 }
8004
8005 /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
8006 or a digit. */
8007
8008 p = ptr + 2;
8009 while (IS_DIGIT(*p)) p++;
8010 if (*p != (pcre_uchar)terminator)
8011 {
8012 *errorcodeptr = ERR57;
8013 goto FAILED;
8014 }
8015 ptr++;
8016 goto HANDLE_NUMERICAL_RECURSION;
8017 }
8018
8019 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
8020 We also support \k{name} (.NET syntax). */
8021
8022 if (escape == ESC_k)
8023 {
8024 if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
8025 ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
8026 {
8027 *errorcodeptr = ERR69;
8028 goto FAILED;
8029 }
8030 is_recurse = FALSE;
8031 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
8032 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
8033 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
8034 goto NAMED_REF_OR_RECURSE;
8035 }
8036
8037 /* Back references are handled specially; must disable firstchar if
8038 not set to cope with cases like (?=(\w+))\1: which would otherwise set
8039 ':' later. */
8040
8041 if (escape < 0)
8042 {
8043 open_capitem *oc;
8044 recno = -escape;
8045
8046 /* Come here from named backref handling when the reference is to a
single group (i.e. not to a duplicated name). */
8048
8049 HANDLE_REFERENCE:
8050 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
8051 previous = code;
8052 item_hwm_offset = cd->hwm - cd->start_workspace;
8053 *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
8054 PUT2INC(code, 0, recno);
8055 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
8056 if (recno > cd->top_backref) cd->top_backref = recno;
8057
/* Check to see if this back reference is recursive, that is, it
8059 is inside the group that it references. A flag is set so that the
8060 group can be made atomic. */
8061
8062 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
8063 {
8064 if (oc->number == recno)
8065 {
8066 oc->flag = TRUE;
8067 break;
8068 }
8069 }
8070 }
8071
8072 /* So are Unicode property matches, if supported. */
8073
8074#ifdef SUPPORT_UCP
8075 else if (escape == ESC_P || escape == ESC_p)
8076 {
8077 BOOL negated;
8078 unsigned int ptype = 0, pdata = 0;
8079 if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
8080 goto FAILED;
8081 previous = code;
8082 item_hwm_offset = cd->hwm - cd->start_workspace;
8083 *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
8084 *code++ = ptype;
8085 *code++ = pdata;
8086 }
8087#else
8088
8089 /* If Unicode properties are not supported, \X, \P, and \p are not
8090 allowed. */
8091
8092 else if (escape == ESC_X || escape == ESC_P || escape == ESC_p)
8093 {
8094 *errorcodeptr = ERR45;
8095 goto FAILED;
8096 }
8097#endif
8098
8099 /* For the rest (including \X when Unicode properties are supported), we
8100 can obtain the OP value by negating the escape value in the default
8101 situation when PCRE_UCP is not set. When it *is* set, we substitute
8102 Unicode property tests. Note that \b and \B do a one-character
8103 lookbehind, and \A also behaves as if it does. */
8104
8105 else
8106 {
8107 if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) &&
8108 cd->max_lookbehind == 0)
8109 cd->max_lookbehind = 1;
8110#ifdef SUPPORT_UCP
8111 if (escape >= ESC_DU && escape <= ESC_wu)
8112 {
8113 nestptr = ptr + 1; /* Where to resume */
8114 ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
8115 }
8116 else
8117#endif
8118 /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
8119 so that it works in DFA mode and in lookbehinds. */
8120
8121 {
8122 previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
8123 item_hwm_offset = cd->hwm - cd->start_workspace;
8124 *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
8125 }
8126 }
8127 continue;
8128 }
8129
8130 /* We have a data character whose value is in c. In UTF-8 mode it may have
8131 a value > 127. We set its representation in the length/buffer, and then
8132 handle it as a data character. */
8133
8134#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
8135 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
8136 mclength = PRIV(ord2utf)(c, mcbuffer);
8137 else
8138#endif
8139
8140 {
8141 mcbuffer[0] = c;
8142 mclength = 1;
8143 }
8144 goto ONE_CHAR;
8145
8146
8147 /* ===================================================================*/
8148 /* Handle a literal character. It is guaranteed not to be whitespace or #
8149 when the extended flag is set. If we are in a UTF mode, it may be a
8150 multi-unit literal character. */
8151
8152 default:
8153 NORMAL_CHAR:
8154 mclength = 1;
8155 mcbuffer[0] = c;
8156
8157#ifdef SUPPORT_UTF
8158 if (utf && HAS_EXTRALEN(c))
8159 ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
8160#endif
8161
8162 /* At this point we have the character's bytes in mcbuffer, and the length
8163 in mclength. When not in UTF-8 mode, the length is always 1. */
8164
8165 ONE_CHAR:
8166 previous = code;
8167 item_hwm_offset = cd->hwm - cd->start_workspace;
8168
8169 /* For caseless UTF-8 mode when UCP support is available, check whether
8170 this character has more than one other case. If so, generate a special
8171 OP_PROP item instead of OP_CHARI. */
8172
8173#ifdef SUPPORT_UCP
8174 if (utf && (options & PCRE_CASELESS) != 0)
8175 {
8176 GETCHAR(c, mcbuffer);
8177 if ((c = UCD_CASESET(c)) != 0)
8178 {
8179 *code++ = OP_PROP;
8180 *code++ = PT_CLIST;
8181 *code++ = c;
8182 if (firstcharflags == REQ_UNSET)
8183 firstcharflags = zerofirstcharflags = REQ_NONE;
8184 break;
8185 }
8186 }
8187#endif
8188
8189 /* Caseful matches, or not one of the multicase characters. */
8190
8191 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8192 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
8193
8194 /* Remember if \r or \n were seen */
8195
8196 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8197 cd->external_flags |= PCRE_HASCRORLF;
8198
8199 /* Set the first and required bytes appropriately. If no previous first
8200 byte, set it from this character, but revert to none on a zero repeat.
8201 Otherwise, leave the firstchar value alone, and don't change it on a zero
8202 repeat. */
8203
8204 if (firstcharflags == REQ_UNSET)
8205 {
8206 zerofirstcharflags = REQ_NONE;
8207 zeroreqchar = reqchar;
8208 zeroreqcharflags = reqcharflags;
8209
8210 /* If the character is more than one byte long, we can set firstchar
8211 only if it is not to be matched caselessly. */
8212
8213 if (mclength == 1 || req_caseopt == 0)
8214 {
8215 firstchar = mcbuffer[0] | req_caseopt;
8216 firstchar = mcbuffer[0];
8217 firstcharflags = req_caseopt;
8218
8219 if (mclength != 1)
8220 {
8221 reqchar = code[-1];
8222 reqcharflags = cd->req_varyopt;
8223 }
8224 }
8225 else firstcharflags = reqcharflags = REQ_NONE;
8226 }
8227
8228 /* firstchar was previously set; we can set reqchar only if the length is
8229 1 or the matching is caseful. */
8230
8231 else
8232 {
8233 zerofirstchar = firstchar;
8234 zerofirstcharflags = firstcharflags;
8235 zeroreqchar = reqchar;
8236 zeroreqcharflags = reqcharflags;
8237 if (mclength == 1 || req_caseopt == 0)
8238 {
8239 reqchar = code[-1];
8240 reqcharflags = req_caseopt | cd->req_varyopt;
8241 }
8242 }
8243
8244 break; /* End of literal character handling */
8245 }
8246 } /* end of big loop */
8247
8248
8249/* Control never reaches here by falling through, only by a goto for all the
8250error states. Pass back the position in the pattern so that it can be displayed
8251to the user for diagnosing the error. */
8252
8253FAILED:
8254*ptrptr = ptr;
8255return FALSE;
8256}
8257
8258
8259
8260/*************************************************
8261* Compile sequence of alternatives *
8262*************************************************/
8263
8264/* On entry, ptr is pointing past the bracket character, but on return it
8265points to the closing bracket, or vertical bar, or end of string. The code
8266variable is pointing at the byte into which the BRA operator has been stored.
8267This function is used during the pre-compile phase when we are trying to find
8268out the amount of memory needed, as well as during the real compile phase. The
8269value of lengthptr distinguishes the two phases.
8270
8271Arguments:
8272 options option bits, including any changes for this subpattern
8273 codeptr -> the address of the current code pointer
8274 ptrptr -> the address of the current pattern pointer
8275 errorcodeptr -> pointer to error code variable
8276 lookbehind TRUE if this is a lookbehind assertion
8277 reset_bracount TRUE to reset the count for each branch
8278 skipbytes skip this many bytes at start (for brackets and OP_COND)
8279 cond_depth depth of nesting for conditional subpatterns
8280 firstcharptr place to put the first required character
8281 firstcharflagsptr place to put the first character flags, or a negative number
8282 reqcharptr place to put the last required character
8283 reqcharflagsptr place to put the last required character flags, or a negative number
8284 bcptr pointer to the chain of currently open branches
8285 cd points to the data block with tables pointers etc.
8286 lengthptr NULL during the real compile phase
8287 points to length accumulator during pre-compile phase
8288
8289Returns: TRUE on success
8290*/
8291
static BOOL
compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
  int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
  int cond_depth,
  pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
  pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
  branch_chain *bcptr, compile_data *cd, int *lengthptr)
{
/* Compile one bracketed group: every alternative branch is compiled in turn
by compile_branch(), the branches are linked together, and the closing KET is
written. The merged first/required character information for the whole group
is passed back through the four output pointers. */

const pcre_uchar *ptr = *ptrptr;
pcre_uchar *code = *codeptr;
pcre_uchar *last_branch = code;      /* Start of the most recent branch item */
pcre_uchar *start_bracket = code;    /* Start of the whole group; never moved */
pcre_uchar *reverse_count = NULL;    /* Where the OP_REVERSE length is stored */
open_capitem capitem;
int capnumber = 0;                   /* Non-zero only for an OP_CBRA group */
pcre_uint32 firstchar, reqchar;
pcre_int32 firstcharflags, reqcharflags;
pcre_uint32 branchfirstchar, branchreqchar;
pcre_int32 branchfirstcharflags, branchreqcharflags;
int length;                          /* Local length accumulator */
unsigned int orig_bracount;          /* Bracket count on entry */
unsigned int max_bracount;           /* Highest count seen in any branch */
branch_chain bc;
size_t save_hwm_offset;              /* Workspace offset on entry */

/* If set, call the external function that checks for stack availability. A
non-zero result from it is reported as ERR85. */

if (PUBL(stack_guard) != NULL && PUBL(stack_guard)())
  {
  *errorcodeptr= ERR85;
  return FALSE;
  }

/* Miscellaneous initialization. The new branch_chain entry is linked to the
enclosing one and records the branch currently being compiled. */

bc.outer = bcptr;
bc.current_branch = code;

firstchar = reqchar = 0;
firstcharflags = reqcharflags = REQ_UNSET;

/* Remembered so that it can be handed to adjust_recurse() if the group has to
be wrapped in atomic brackets below. */

save_hwm_offset = cd->hwm - cd->start_workspace;

/* Accumulate the length for use in the pre-compile phase. Start with the
length of the BRA and KET and any extra bytes that are required at the
beginning. We accumulate in a local variable to save frequent testing of
lengthptr for NULL. We cannot do this by looking at the value of code at the
start and end of each alternative, because compiled items are discarded during
the pre-compile phase so that the work space is not exceeded. */

length = 2 + 2*LINK_SIZE + skipbytes;

/* WARNING: If the above line is changed for any reason, you must also change
the code that abstracts option settings at the start of the pattern and makes
them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
pre-compile phase to find out whether anything has yet been compiled or not. */

/* If this is a capturing subpattern, add to the chain of open capturing items
so that we can detect them if (*ACCEPT) is encountered. This is also used to
detect groups that contain recursive back references to themselves. Note that
only OP_CBRA need be tested here; changing this opcode to one of its variants,
e.g. OP_SCBRAPOS, happens later, after the group has been compiled. The chain
entry lives in this function's stack frame and is unlinked again before the
function returns successfully. */

if (*code == OP_CBRA)
  {
  capnumber = GET2(code, 1 + LINK_SIZE);
  capitem.number = capnumber;
  capitem.next = cd->open_caps;
  capitem.flag = FALSE;
  cd->open_caps = &capitem;
  }

/* Offset is set zero to mark that this bracket is still open */

PUT(code, 1, 0);
code += 1 + LINK_SIZE + skipbytes;

/* Loop for each alternative branch */

orig_bracount = max_bracount = cd->bracount;
for (;;)
  {
  /* For a (?| group, reset the capturing bracket count so that each branch
  uses the same numbers. */

  if (reset_bracount) cd->bracount = orig_bracount;

  /* Set up dummy OP_REVERSE if lookbehind assertion. The zero length written
  here is overwritten with the real fixed length after the branch has been
  compiled (real compile phase only). */

  if (lookbehind)
    {
    *code++ = OP_REVERSE;
    reverse_count = code;
    PUTINC(code, 0, 0);
    length += 1 + LINK_SIZE;
    }

  /* Now compile the branch; in the pre-compile phase its length gets added
  into the length. On failure, pass back the pattern position reached. */

  if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
        &branchfirstcharflags, &branchreqchar, &branchreqcharflags, &bc,
        cond_depth, cd, (lengthptr == NULL)? NULL : &length))
    {
    *ptrptr = ptr;
    return FALSE;
    }

  /* Keep the highest bracket count in case (?| was used and some branch
  has fewer than the rest. */

  if (cd->bracount > max_bracount) max_bracount = cd->bracount;

  /* In the real compile phase, there is some post-processing to be done. */

  if (lengthptr == NULL)
    {
    /* If this is the first branch, the firstchar and reqchar values for the
    branch become the values for the regex. (last_branch still points at the
    opening bracket item, not at an OP_ALT.) */

    if (*last_branch != OP_ALT)
      {
      firstchar = branchfirstchar;
      firstcharflags = branchfirstcharflags;
      reqchar = branchreqchar;
      reqcharflags = branchreqcharflags;
      }

    /* If this is not the first branch, the first char and reqchar have to
    match the values from all the previous branches, except that if the
    previous value for reqchar didn't have REQ_VARY set, it can still match,
    and we set REQ_VARY for the regex. */

    else
      {
      /* If we previously had a firstchar, but it doesn't match the new branch,
      we have to abandon the firstchar for the regex, but if there was
      previously no reqchar, it takes on the value of the old firstchar. */

      if (firstcharflags >= 0 &&
          (firstcharflags != branchfirstcharflags || firstchar != branchfirstchar))
        {
        if (reqcharflags < 0)
          {
          reqchar = firstchar;
          reqcharflags = firstcharflags;
          }
        firstcharflags = REQ_NONE;
        }

      /* If we (now or from before) have no firstchar, a firstchar from the
      branch becomes a reqchar if there isn't a branch reqchar. */

      if (firstcharflags < 0 && branchfirstcharflags >= 0 && branchreqcharflags < 0)
        {
        branchreqchar = branchfirstchar;
        branchreqcharflags = branchfirstcharflags;
        }

      /* Now ensure that the reqchars match. REQ_VARY is masked out of the
      comparison so that it alone never causes a mismatch. */

      if (((reqcharflags & ~REQ_VARY) != (branchreqcharflags & ~REQ_VARY)) ||
          reqchar != branchreqchar)
        reqcharflags = REQ_NONE;
      else
        {
        reqchar = branchreqchar;
        reqcharflags |= branchreqcharflags; /* To "or" REQ_VARY */
        }
      }

    /* If lookbehind, check that this branch matches a fixed-length string, and
    put the length into the OP_REVERSE item. Temporarily mark the end of the
    branch with OP_END. If the branch contains OP_RECURSE, the result is -3
    because there may be forward references that we can't check here. Set a
    flag to cause another lookbehind check at the end. Why not do it all at the
    end? Because common, erroneous checks are picked up here and the offset of
    the problem can be shown. */

    if (lookbehind)
      {
      int fixed_length;
      *code = OP_END;
      fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
        FALSE, cd, NULL);
      DPRINTF(("fixed length = %d\n", fixed_length));
      if (fixed_length == -3)
        {
        cd->check_lookbehind = TRUE;
        }
      else if (fixed_length < 0)
        {
        /* -2 gives ERR36 and -4 gives ERR70; any other negative value gives
        ERR25. */

        *errorcodeptr = (fixed_length == -2)? ERR36 :
                        (fixed_length == -4)? ERR70: ERR25;
        *ptrptr = ptr;
        return FALSE;
        }
      else
        {
        if (fixed_length > cd->max_lookbehind)
          cd->max_lookbehind = fixed_length;
        PUT(reverse_count, 0, fixed_length);
        }
      }
    }

  /* Reached end of expression, either ')' or end of pattern. In the real
  compile phase, go back through the alternative branches and reverse the chain
  of offsets, with the field in the BRA item now becoming an offset to the
  first alternative. If there are no alternatives, it points to the end of the
  group. The length in the terminating ket is always the length of the whole
  bracketed item. Return leaving the pointer at the terminating char. */

  if (*ptr != CHAR_VERTICAL_LINE)
    {
    if (lengthptr == NULL)
      {
      int branch_length = (int)(code - last_branch);

      /* Each branch item currently holds a backward offset to the previous
      branch. Replace it with the forward length and step back, stopping when
      the zero offset stored in the opening bracket item has been read. */

      do
        {
        int prev_length = GET(last_branch, 1);
        PUT(last_branch, 1, branch_length);
        branch_length = prev_length;
        last_branch -= branch_length;
        }
      while (branch_length > 0);
      }

    /* Fill in the ket */

    *code = OP_KET;
    PUT(code, 1, (int)(code - start_bracket));
    code += 1 + LINK_SIZE;

    /* If it was a capturing subpattern, check to see if it contained any
    recursive back references. If so, we must wrap it in atomic brackets.
    Because we are moving code along, we must ensure that any pending recursive
    references are updated. In any event, remove the block from the chain. */

    if (capnumber > 0)
      {
      if (cd->open_caps->flag)
        {
        /* The whole group is moved up by 1 + LINK_SIZE units to make room for
        an OP_ONCE item in front of it, and a second KET is added after it. */

        *code = OP_END;
        adjust_recurse(start_bracket, 1 + LINK_SIZE,
          (options & PCRE_UTF8) != 0, cd, save_hwm_offset);
        memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
          IN_UCHARS(code - start_bracket));
        *start_bracket = OP_ONCE;
        code += 1 + LINK_SIZE;
        PUT(start_bracket, 1, (int)(code - start_bracket));
        *code = OP_KET;
        PUT(code, 1, (int)(code - start_bracket));
        code += 1 + LINK_SIZE;
        length += 2 + 2*LINK_SIZE;
        }
      cd->open_caps = cd->open_caps->next;
      }

    /* Retain the highest bracket number, in case resetting was used. */

    cd->bracount = max_bracount;

    /* Set values to pass back */

    *codeptr = code;
    *ptrptr = ptr;
    *firstcharptr = firstchar;
    *firstcharflagsptr = firstcharflags;
    *reqcharptr = reqchar;
    *reqcharflagsptr = reqcharflags;

    /* In the pre-compile phase, add this group's length to the caller's
    accumulator, giving ERR20 if the total would exceed OFLOW_MAX. */

    if (lengthptr != NULL)
      {
      if (OFLOW_MAX - *lengthptr < length)
        {
        *errorcodeptr = ERR20;
        return FALSE;
        }
      *lengthptr += length;
      }
    return TRUE;
    }

  /* Another branch follows. In the pre-compile phase, we can move the code
  pointer back to where it was for the start of the first branch. (That is,
  pretend that each branch is the only one.)

  In the real compile phase, insert an ALT node. Its length field points back
  to the previous branch while the bracket remains open. At the end the chain
  is reversed. It's done like this so that the start of the bracket has a
  zero offset until it is closed, making it possible to detect recursion. */

  if (lengthptr != NULL)
    {
    code = *codeptr + 1 + LINK_SIZE + skipbytes;
    length += 1 + LINK_SIZE;
    }
  else
    {
    *code = OP_ALT;
    PUT(code, 1, (int)(code - last_branch));
    bc.current_branch = last_branch = code;
    code += 1 + LINK_SIZE;
    }

  /* Step past the vertical bar */

  ptr++;
  }
/* Control never reaches here */
}
8601
8602
8603
8604
8605/*************************************************
8606* Check for anchored expression *
8607*************************************************/
8608
8609/* Try to find out if this is an anchored regular expression. Consider each
8610alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8611all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8612it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8613be found, because ^ generates OP_CIRCM in that mode.
8614
8615We can also consider a regex to be anchored if OP_SOM starts all its branches.
8616This is the code for \G, which means "match at start of match position, taking
8617into account the match offset".
8618
8619A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8620because that will try the rest of the pattern at all possible matching points,
8621so there is no point trying again.... er ....
8622
8623.... except when the .* appears inside capturing parentheses, and there is a
8624subsequent back reference to those parentheses. We haven't enough information
8625to catch that case precisely.
8626
8627At first, the best we could do was to detect when .* was in capturing brackets
8628and the highest back reference was greater than or equal to that level.
8629However, by keeping a bitmap of the first 31 back references, we can catch some
8630of the more common cases more precisely.
8631
8632... A second exception is when the .* appears inside an atomic group, because
8633this prevents the number of characters it matches from being adjusted.
8634
8635Arguments:
8636 code points to start of expression (the bracket)
8637 bracket_map a bitmap of which brackets we are inside while testing; this
8638 handles up to substring 31; after that we just have to take
8639 the less precise approach
8640 cd points to the compile data block
8641 atomcount atomic group level
8642
8643Returns: TRUE or FALSE
8644*/
8645
8646static BOOL
8647is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
8648 compile_data *cd, int atomcount)
8649{
8650do {
8651 const pcre_uchar *scode = first_significant_code(
8652 code + PRIV(OP_lengths)[*code], FALSE);
8653 register int op = *scode;
8654
8655 /* Non-capturing brackets */
8656
8657 if (op == OP_BRA || op == OP_BRAPOS ||
8658 op == OP_SBRA || op == OP_SBRAPOS)
8659 {
8660 if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8661 }
8662
8663 /* Capturing brackets */
8664
8665 else if (op == OP_CBRA || op == OP_CBRAPOS ||
8666 op == OP_SCBRA || op == OP_SCBRAPOS)
8667 {
8668 int n = GET2(scode, 1+LINK_SIZE);
8669 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8670 if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE;
8671 }
8672
8673 /* Positive forward assertions and conditions */
8674
8675 else if (op == OP_ASSERT || op == OP_COND)
8676 {
8677 if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8678 }
8679
8680 /* Atomic groups */
8681
8682 else if (op == OP_ONCE || op == OP_ONCE_NC)
8683 {
8684 if (!is_anchored(scode, bracket_map, cd, atomcount + 1))
8685 return FALSE;
8686 }
8687
8688 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8689 it isn't in brackets that are or may be referenced or inside an atomic
8690 group. */
8691
8692 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8693 op == OP_TYPEPOSSTAR))
8694 {
8695 if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 ||
8696 atomcount > 0 || cd->had_pruneorskip)
8697 return FALSE;
8698 }
8699
8700 /* Check for explicit anchoring */
8701
8702 else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8703
8704 code += GET(code, 1);
8705 }
8706while (*code == OP_ALT); /* Loop for each alternative */
8707return TRUE;
8708}
8709
8710
8711
8712/*************************************************
8713* Check for starting with ^ or .* *
8714*************************************************/
8715
8716/* This is called to find out if every branch starts with ^ or .* so that
8717"first char" processing can be done to speed things up in multiline
8718matching and for non-DOTALL patterns that start with .* (which must start at
8719the beginning or after \n). As in the case of is_anchored() (see above), we
8720have to take account of back references to capturing brackets that contain .*
8721because in that case we can't make the assumption. Also, the appearance of .*
8722inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8723or *SKIP does not count, because once again the assumption no longer holds.
8724
8725Arguments:
8726 code points to start of expression (the bracket)
8727 bracket_map a bitmap of which brackets we are inside while testing; this
8728 handles up to substring 31; after that we just have to take
8729 the less precise approach
8730 cd points to the compile data
8731 atomcount atomic group level
8732 inassert TRUE if in an assertion
8733
8734Returns: TRUE or FALSE
8735*/
8736
8737static BOOL
8738is_startline(const pcre_uchar *code, unsigned int bracket_map,
8739 compile_data *cd, int atomcount, BOOL inassert)
8740{
8741do {
8742 const pcre_uchar *scode = first_significant_code(
8743 code + PRIV(OP_lengths)[*code], FALSE);
8744 register int op = *scode;
8745
8746 /* If we are at the start of a conditional assertion group, *both* the
8747 conditional assertion *and* what follows the condition must satisfy the test
8748 for start of line. Other kinds of condition fail. Note that there may be an
8749 auto-callout at the start of a condition. */
8750
8751 if (op == OP_COND)
8752 {
8753 scode += 1 + LINK_SIZE;
8754 if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8755 switch (*scode)
8756 {
8757 case OP_CREF:
8758 case OP_DNCREF:
8759 case OP_RREF:
8760 case OP_DNRREF:
8761 case OP_DEF:
8762 case OP_FAIL:
8763 return FALSE;
8764
8765 default: /* Assertion */
8766 if (!is_startline(scode, bracket_map, cd, atomcount, TRUE)) return FALSE;
8767 do scode += GET(scode, 1); while (*scode == OP_ALT);
8768 scode += 1 + LINK_SIZE;
8769 break;
8770 }
8771 scode = first_significant_code(scode, FALSE);
8772 op = *scode;
8773 }
8774
8775 /* Non-capturing brackets */
8776
8777 if (op == OP_BRA || op == OP_BRAPOS ||
8778 op == OP_SBRA || op == OP_SBRAPOS)
8779 {
8780 if (!is_startline(scode, bracket_map, cd, atomcount, inassert)) return FALSE;
8781 }
8782
8783 /* Capturing brackets */
8784
8785 else if (op == OP_CBRA || op == OP_CBRAPOS ||
8786 op == OP_SCBRA || op == OP_SCBRAPOS)
8787 {
8788 int n = GET2(scode, 1+LINK_SIZE);
8789 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8790 if (!is_startline(scode, new_map, cd, atomcount, inassert)) return FALSE;
8791 }
8792
8793 /* Positive forward assertions */
8794
8795 else if (op == OP_ASSERT)
8796 {
8797 if (!is_startline(scode, bracket_map, cd, atomcount, TRUE)) return FALSE;
8798 }
8799
8800 /* Atomic brackets */
8801
8802 else if (op == OP_ONCE || op == OP_ONCE_NC)
8803 {
8804 if (!is_startline(scode, bracket_map, cd, atomcount + 1, inassert)) return FALSE;
8805 }
8806
8807 /* .* means "start at start or after \n" if it isn't in atomic brackets or
8808 brackets that may be referenced or an assertion, as long as the pattern does
8809 not contain *PRUNE or *SKIP, because these break the feature. Consider, for
8810 example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e.
8811 not at the start of a line. */
8812
8813 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8814 {
8815 if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
8816 atomcount > 0 || cd->had_pruneorskip || inassert)
8817 return FALSE;
8818 }
8819
8820 /* Check for explicit circumflex; anything else gives a FALSE result. Note
8821 in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC
8822 because the number of characters matched by .* cannot be adjusted inside
8823 them. */
8824
8825 else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8826
8827 /* Move on to the next alternative */
8828
8829 code += GET(code, 1);
8830 }
8831while (*code == OP_ALT); /* Loop for each alternative */
8832return TRUE;
8833}
8834
8835
8836
8837/*************************************************
8838* Check for asserted fixed first char *
8839*************************************************/
8840
8841/* During compilation, the "first char" settings from forward assertions are
8842discarded, because they can cause conflicts with actual literals that follow.
8843However, if we end up without a first char setting for an unanchored pattern,
8844it is worth scanning the regex to see if there is an initial asserted first
8845char. If all branches start with the same asserted char, or with a
8846non-conditional bracket all of whose alternatives start with the same asserted
8847char (recurse ad lib), then we return that char, with the flags set to zero or
8848REQ_CASELESS; otherwise return zero with REQ_NONE in the flags.
8849
8850Arguments:
8851 code points to start of expression (the bracket)
8852 flags points to the first char flags, or to REQ_NONE
8853 inassert TRUE if in an assertion
8854
8855Returns: the fixed first char, or 0 with REQ_NONE in flags
8856*/
8857
8858static pcre_uint32
8859find_firstassertedchar(const pcre_uchar *code, pcre_int32 *flags,
8860 BOOL inassert)
8861{
8862register pcre_uint32 c = 0;
8863int cflags = REQ_NONE;
8864
8865*flags = REQ_NONE;
8866do {
8867 pcre_uint32 d;
8868 int dflags;
8869 int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8870 *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8871 const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl,
8872 TRUE);
8873 register pcre_uchar op = *scode;
8874
8875 switch(op)
8876 {
8877 default:
8878 return 0;
8879
8880 case OP_BRA:
8881 case OP_BRAPOS:
8882 case OP_CBRA:
8883 case OP_SCBRA:
8884 case OP_CBRAPOS:
8885 case OP_SCBRAPOS:
8886 case OP_ASSERT:
8887 case OP_ONCE:
8888 case OP_ONCE_NC:
8889 d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);
8890 if (dflags < 0)
8891 return 0;
8892 if (cflags < 0) { c = d; cflags = dflags; } else if (c != d || cflags != dflags) return 0;
8893 break;
8894
8895 case OP_EXACT:
8896 scode += IMM2_SIZE;
8897 /* Fall through */
8898
8899 case OP_CHAR:
8900 case OP_PLUS:
8901 case OP_MINPLUS:
8902 case OP_POSPLUS:
8903 if (!inassert) return 0;
8904 if (cflags < 0) { c = scode[1]; cflags = 0; }
8905 else if (c != scode[1]) return 0;
8906 break;
8907
8908 case OP_EXACTI:
8909 scode += IMM2_SIZE;
8910 /* Fall through */
8911
8912 case OP_CHARI:
8913 case OP_PLUSI:
8914 case OP_MINPLUSI:
8915 case OP_POSPLUSI:
8916 if (!inassert) return 0;
8917 if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8918 else if (c != scode[1]) return 0;
8919 break;
8920 }
8921
8922 code += GET(code, 1);
8923 }
8924while (*code == OP_ALT);
8925
8926*flags = cflags;
8927return c;
8928}
8929
8930
8931
8932/*************************************************
8933* Add an entry to the name/number table *
8934*************************************************/
8935
8936/* This function is called between compiling passes to add an entry to the
8937name/number table, maintaining alphabetical order. Checking for permitted
8938and forbidden duplicates has already been done.
8939
8940Arguments:
8941 cd the compile data block
8942 name the name to add
8943 length the length of the name
8944 groupno the group number
8945
8946Returns: nothing
8947*/
8948
8949static void
8950add_name(compile_data *cd, const pcre_uchar *name, int length,
8951 unsigned int groupno)
8952{
8953int i;
8954pcre_uchar *slot = cd->name_table;
8955
8956for (i = 0; i < cd->names_found; i++)
8957 {
8958 int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(length));
8959 if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8960 crc = -1; /* Current name is a substring */
8961
8962 /* Make space in the table and break the loop for an earlier name. For a
8963 duplicate or later name, carry on. We do this for duplicates so that in the
8964 simple case (when ?(| is not used) they are in order of their numbers. In all
8965 cases they are in the order in which they appear in the pattern. */
8966
8967 if (crc < 0)
8968 {
8969 memmove(slot + cd->name_entry_size, slot,
8970 IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
8971 break;
8972 }
8973
8974 /* Continue the loop for a later or duplicate name */
8975
8976 slot += cd->name_entry_size;
8977 }
8978
8979PUT2(slot, 0, groupno);
8980memcpy(slot + IMM2_SIZE, name, IN_UCHARS(length));
8981slot[IMM2_SIZE + length] = 0;
8982cd->names_found++;
8983}
8984
8985
8986
/*************************************************
*        Compile a Regular Expression            *
*************************************************/

/* This is the original compile entry point, which has no way of returning a
numeric error code. It is kept for backwards compatibility and does nothing
but call the newer pcre[16|32]_compile2() function with a NULL error code
pointer, returning whatever that function returns.

Arguments:
  pattern       the regular expression
  options       various option bits
  errorcodeptr  pointer to error code variable (pcre_compile2() only)
                  can be NULL if you don't want a code value
  errorptr      pointer to pointer to error text
  erroroffset   ptr offset in pattern where error was detected
  tables        pointer to character tables or NULL

Returns:        pointer to compiled data block, or NULL on error,
                with errorptr and erroroffset set
*/

/* One complete definition is provided for whichever code unit width is being
built. */

#if defined COMPILE_PCRE8

PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile(const char *pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#elif defined COMPILE_PCRE16

PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#elif defined COMPILE_PCRE32

PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
pcre32_compile(PCRE_SPTR32 pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre32_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#endif
9031
9032
9033#if defined COMPILE_PCRE8
9034PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
9035pcre_compile2(const char *pattern, int options, int *errorcodeptr,
9036 const char **errorptr, int *erroroffset, const unsigned char *tables)
9037#elif defined COMPILE_PCRE16
9038PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
9039pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
9040 const char **errorptr, int *erroroffset, const unsigned char *tables)
9041#elif defined COMPILE_PCRE32
9042PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
9043pcre32_compile2(PCRE_SPTR32 pattern, int options, int *errorcodeptr,
9044 const char **errorptr, int *erroroffset, const unsigned char *tables)
9045#endif
9046{
9047REAL_PCRE *re;
9048int length = 1; /* For final END opcode */
9049pcre_int32 firstcharflags, reqcharflags;
9050pcre_uint32 firstchar, reqchar;
9051pcre_uint32 limit_match = PCRE_UINT32_MAX;
9052pcre_uint32 limit_recursion = PCRE_UINT32_MAX;
9053int newline;
9054int errorcode = 0;
9055int skipatstart = 0;
9056BOOL utf;
9057BOOL never_utf = FALSE;
9058size_t size;
9059pcre_uchar *code;
9060const pcre_uchar *codestart;
9061const pcre_uchar *ptr;
9062compile_data compile_block;
9063compile_data *cd = &compile_block;
9064
9065/* This space is used for "compiling" into during the first phase, when we are
9066computing the amount of memory that is needed. Compiled items are thrown away
9067as soon as possible, so that a fairly large buffer should be sufficient for
9068this purpose. The same space is used in the second phase for remembering where
9069to fill in forward references to subpatterns. That may overflow, in which case
9070new memory is obtained from malloc(). */
9071
9072pcre_uchar cworkspace[COMPILE_WORK_SIZE];
9073
9074/* This vector is used for remembering name groups during the pre-compile. In a
9075similar way to cworkspace, it can be expanded using malloc() if necessary. */
9076
9077named_group named_groups[NAMED_GROUP_LIST_SIZE];
9078
9079/* Set this early so that early errors get offset 0. */
9080
9081ptr = (const pcre_uchar *)pattern;
9082
9083/* We can't pass back an error message if errorptr is NULL; I guess the best we
9084can do is just return NULL, but we can set a code value if there is a code
9085pointer. */
9086
9087if (errorptr == NULL)
9088 {
9089 if (errorcodeptr != NULL) *errorcodeptr = 99;
9090 return NULL;
9091 }
9092
9093*errorptr = NULL;
9094if (errorcodeptr != NULL) *errorcodeptr = ERR0;
9095
9096/* However, we can give a message for this error */
9097
9098if (erroroffset == NULL)
9099 {
9100 errorcode = ERR16;
9101 goto PCRE_EARLY_ERROR_RETURN2;
9102 }
9103
9104*erroroffset = 0;
9105
9106/* Set up pointers to the individual character tables */
9107
9108if (tables == NULL) tables = PRIV(default_tables);
9109cd->lcc = tables + lcc_offset;
9110cd->fcc = tables + fcc_offset;
9111cd->cbits = tables + cbits_offset;
9112cd->ctypes = tables + ctypes_offset;
9113
9114/* Check that all undefined public option bits are zero */
9115
9116if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
9117 {
9118 errorcode = ERR17;
9119 goto PCRE_EARLY_ERROR_RETURN;
9120 }
9121
9122/* If PCRE_NEVER_UTF is set, remember it. */
9123
9124if ((options & PCRE_NEVER_UTF) != 0) never_utf = TRUE;
9125
9126/* Check for global one-time settings at the start of the pattern, and remember
9127the offset for later. */
9128
9129cd->external_flags = 0; /* Initialize here for LIMIT_MATCH/RECURSION */
9130
9131while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9132 ptr[skipatstart+1] == CHAR_ASTERISK)
9133 {
9134 int newnl = 0;
9135 int newbsr = 0;
9136
9137/* For completeness and backward compatibility, (*UTFn) is supported in the
9138relevant libraries, but (*UTF) is generic and always supported. Note that
9139PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. */
9140
9141#ifdef COMPILE_PCRE8
9142 if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF8_RIGHTPAR, 5) == 0)
9143 { skipatstart += 7; options |= PCRE_UTF8; continue; }
9144#endif
9145#ifdef COMPILE_PCRE16
9146 if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF16_RIGHTPAR, 6) == 0)
9147 { skipatstart += 8; options |= PCRE_UTF16; continue; }
9148#endif
9149#ifdef COMPILE_PCRE32
9150 if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF32_RIGHTPAR, 6) == 0)
9151 { skipatstart += 8; options |= PCRE_UTF32; continue; }
9152#endif
9153
9154 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 4) == 0)
9155 { skipatstart += 6; options |= PCRE_UTF8; continue; }
9156 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
9157 { skipatstart += 6; options |= PCRE_UCP; continue; }
9158 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_AUTO_POSSESS_RIGHTPAR, 16) == 0)
9159 { skipatstart += 18; options |= PCRE_NO_AUTO_POSSESS; continue; }
9160 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
9161 { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
9162
9163 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_MATCH_EQ, 12) == 0)
9164 {
9165 pcre_uint32 c = 0;
9166 int p = skipatstart + 14;
9167 while (isdigit(ptr[p]))
9168 {
9169 if (c > PCRE_UINT32_MAX / 10 - 1) break; /* Integer overflow */
9170 c = c*10 + ptr[p++] - CHAR_0;
9171 }
9172 if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9173 if (c < limit_match)
9174 {
9175 limit_match = c;
9176 cd->external_flags |= PCRE_MLSET;
9177 }
9178 skipatstart = p;
9179 continue;
9180 }
9181
9182 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_RECURSION_EQ, 16) == 0)
9183 {
9184 pcre_uint32 c = 0;
9185 int p = skipatstart + 18;
9186 while (isdigit(ptr[p]))
9187 {
9188 if (c > PCRE_UINT32_MAX / 10 - 1) break; /* Integer overflow check */
9189 c = c*10 + ptr[p++] - CHAR_0;
9190 }
9191 if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9192 if (c < limit_recursion)
9193 {
9194 limit_recursion = c;
9195 cd->external_flags |= PCRE_RLSET;
9196 }
9197 skipatstart = p;
9198 continue;
9199 }
9200
9201 if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0)
9202 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
9203 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3) == 0)
9204 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
9205 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5) == 0)
9206 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
9207 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0)
9208 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
9209 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0)
9210 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
9211
9212 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
9213 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
9214 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
9215 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
9216
9217 if (newnl != 0)
9218 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
9219 else if (newbsr != 0)
9220 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
9221 else break;
9222 }
9223
9224/* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
9225utf = (options & PCRE_UTF8) != 0;
9226if (utf && never_utf)
9227 {
9228 errorcode = ERR78;
9229 goto PCRE_EARLY_ERROR_RETURN2;
9230 }
9231
9232/* Can't support UTF unless PCRE has been compiled to include the code. The
9233return of an error code from PRIV(valid_utf)() is a new feature, introduced in
9234release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
9235not used here. */
9236
9237#ifdef SUPPORT_UTF
9238if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
9239 (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
9240 {
9241#if defined COMPILE_PCRE8
9242 errorcode = ERR44;
9243#elif defined COMPILE_PCRE16
9244 errorcode = ERR74;
9245#elif defined COMPILE_PCRE32
9246 errorcode = ERR77;
9247#endif
9248 goto PCRE_EARLY_ERROR_RETURN2;
9249 }
9250#else
9251if (utf)
9252 {
9253 errorcode = ERR32;
9254 goto PCRE_EARLY_ERROR_RETURN;
9255 }
9256#endif
9257
9258/* Can't support UCP unless PCRE has been compiled to include the code. */
9259
9260#ifndef SUPPORT_UCP
9261if ((options & PCRE_UCP) != 0)
9262 {
9263 errorcode = ERR67;
9264 goto PCRE_EARLY_ERROR_RETURN;
9265 }
9266#endif
9267
9268/* Check validity of \R options. */
9269
9270if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
9271 (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
9272 {
9273 errorcode = ERR56;
9274 goto PCRE_EARLY_ERROR_RETURN;
9275 }
9276
9277/* Handle different types of newline. The three bits give seven cases. The
9278current code allows for fixed one- or two-byte sequences, plus "any" and
9279"anycrlf". */
9280
9281switch (options & PCRE_NEWLINE_BITS)
9282 {
9283 case 0: newline = NEWLINE; break; /* Build-time default */
9284 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
9285 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
9286 case PCRE_NEWLINE_CR+
9287 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
9288 case PCRE_NEWLINE_ANY: newline = -1; break;
9289 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
9290 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
9291 }
9292
9293if (newline == -2)
9294 {
9295 cd->nltype = NLTYPE_ANYCRLF;
9296 }
9297else if (newline < 0)
9298 {
9299 cd->nltype = NLTYPE_ANY;
9300 }
9301else
9302 {
9303 cd->nltype = NLTYPE_FIXED;
9304 if (newline > 255)
9305 {
9306 cd->nllen = 2;
9307 cd->nl[0] = (newline >> 8) & 255;
9308 cd->nl[1] = newline & 255;
9309 }
9310 else
9311 {
9312 cd->nllen = 1;
9313 cd->nl[0] = newline;
9314 }
9315 }
9316
9317/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9318references to help in deciding whether (.*) can be treated as anchored or not.
9319*/
9320
9321cd->top_backref = 0;
9322cd->backref_map = 0;
9323
9324/* Reflect pattern for debugging output */
9325
9326DPRINTF(("------------------------------------------------------------------\n"));
9327#ifdef PCRE_DEBUG
9328print_puchar(stdout, (PCRE_PUCHAR)pattern);
9329#endif
9330DPRINTF(("\n"));
9331
9332/* Pretend to compile the pattern while actually just accumulating the length
9333of memory required. This behaviour is triggered by passing a non-NULL final
9334argument to compile_regex(). We pass a block of workspace (cworkspace) for it
9335to compile parts of the pattern into; the compiled code is discarded when it is
9336no longer needed, so hopefully this workspace will never overflow, though there
9337is a test for its doing so. */
9338
9339cd->bracount = cd->final_bracount = 0;
9340cd->names_found = 0;
9341cd->name_entry_size = 0;
9342cd->name_table = NULL;
9343cd->dupnames = FALSE;
9344cd->dupgroups = FALSE;
9345cd->namedrefcount = 0;
9346cd->start_code = cworkspace;
9347cd->hwm = cworkspace;
9348cd->iscondassert = FALSE;
9349cd->start_workspace = cworkspace;
9350cd->workspace_size = COMPILE_WORK_SIZE;
9351cd->named_groups = named_groups;
9352cd->named_group_list_size = NAMED_GROUP_LIST_SIZE;
9353cd->start_pattern = (const pcre_uchar *)pattern;
9354cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
9355cd->req_varyopt = 0;
9356cd->parens_depth = 0;
9357cd->assert_depth = 0;
9358cd->max_lookbehind = 0;
9359cd->external_options = options;
9360cd->open_caps = NULL;
9361
9362/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
9363don't need to look at the result of the function here. The initial options have
9364been put into the cd block so that they can be changed if an option setting is
9365found within the regex right at the beginning. Bringing initial option settings
9366outside can help speed up starting point checks. */
9367
9368ptr += skipatstart;
9369code = cworkspace;
9370*code = OP_BRA;
9371
9372(void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
9373 FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,
9374 cd, &length);
9375if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
9376
9377DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
9378 (int)(cd->hwm - cworkspace)));
9379
9380if (length > MAX_PATTERN_SIZE)
9381 {
9382 errorcode = ERR20;
9383 goto PCRE_EARLY_ERROR_RETURN;
9384 }
9385
9386/* Compute the size of the data block for storing the compiled pattern. Integer
9387overflow should no longer be possible because nowadays we limit the maximum
9388value of cd->names_found and cd->name_entry_size. */
9389
9390size = sizeof(REAL_PCRE) +
9391 (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
9392
9393/* Get the memory. */
9394
9395re = (REAL_PCRE *)(PUBL(malloc))(size);
9396if (re == NULL)
9397 {
9398 errorcode = ERR21;
9399 goto PCRE_EARLY_ERROR_RETURN;
9400 }
9401
9402/* Put in the magic number, and save the sizes, initial options, internal
9403flags, and character table pointer. NULL is used for the default character
9404tables. The nullpad field is at the end; it's there to help in the case when a
9405regex compiled on a system with 4-byte pointers is run on another with 8-byte
9406pointers. */
9407
9408re->magic_number = MAGIC_NUMBER;
9409re->size = (int)size;
9410re->options = cd->external_options;
9411re->flags = cd->external_flags;
9412re->limit_match = limit_match;
9413re->limit_recursion = limit_recursion;
9414re->first_char = 0;
9415re->req_char = 0;
9416re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
9417re->name_entry_size = cd->name_entry_size;
9418re->name_count = cd->names_found;
9419re->ref_count = 0;
9420re->tables = (tables == PRIV(default_tables))? NULL : tables;
9421re->nullpad = NULL;
9422#ifdef COMPILE_PCRE32
9423re->dummy = 0;
9424#else
9425re->dummy1 = re->dummy2 = re->dummy3 = 0;
9426#endif
9427
9428/* The starting points of the name/number translation table and of the code are
9429passed around in the compile data block. The start/end pattern and initial
9430options are already set from the pre-compile phase, as is the name_entry_size
9431field. Reset the bracket count and the names_found field. Also reset the hwm
9432field; this time it's used for remembering forward references to subpatterns.
9433*/
9434
9435cd->final_bracount = cd->bracount; /* Save for checking forward references */
9436cd->parens_depth = 0;
9437cd->assert_depth = 0;
9438cd->bracount = 0;
9439cd->max_lookbehind = 0;
9440cd->name_table = (pcre_uchar *)re + re->name_table_offset;
9441codestart = cd->name_table + re->name_entry_size * re->name_count;
9442cd->start_code = codestart;
9443cd->hwm = (pcre_uchar *)(cd->start_workspace);
9444cd->iscondassert = FALSE;
9445cd->req_varyopt = 0;
9446cd->had_accept = FALSE;
9447cd->had_pruneorskip = FALSE;
9448cd->check_lookbehind = FALSE;
9449cd->open_caps = NULL;
9450
9451/* If any named groups were found, create the name/number table from the list
9452created in the first pass. */
9453
9454if (cd->names_found > 0)
9455 {
9456 int i = cd->names_found;
9457 named_group *ng = cd->named_groups;
9458 cd->names_found = 0;
9459 for (; i > 0; i--, ng++)
9460 add_name(cd, ng->name, ng->length, ng->number);
9461 if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
9462 (PUBL(free))((void *)cd->named_groups);
9463 }
9464
9465/* Set up a starting, non-extracting bracket, then compile the expression. On
9466error, errorcode will be set non-zero, so we don't need to look at the result
9467of the function here. */
9468
9469ptr = (const pcre_uchar *)pattern + skipatstart;
9470code = (pcre_uchar *)codestart;
9471*code = OP_BRA;
9472(void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
9473 &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL, cd, NULL);
9474re->top_bracket = cd->bracount;
9475re->top_backref = cd->top_backref;
9476re->max_lookbehind = cd->max_lookbehind;
9477re->flags = cd->external_flags | PCRE_MODE;
9478
9479if (cd->had_accept)
9480 {
9481 reqchar = 0; /* Must disable after (*ACCEPT) */
9482 reqcharflags = REQ_NONE;
9483 }
9484
9485/* If not reached end of pattern on success, there's an excess bracket. */
9486
9487if (errorcode == 0 && *ptr != CHAR_NULL) errorcode = ERR22;
9488
9489/* Fill in the terminating state and check for disastrous overflow, but
9490if debugging, leave the test till after things are printed out. */
9491
9492*code++ = OP_END;
9493
9494#ifndef PCRE_DEBUG
9495if (code - codestart > length) errorcode = ERR23;
9496#endif
9497
9498#ifdef SUPPORT_VALGRIND
9499/* If the estimated length exceeds the really used length, mark the extra
9500allocated memory as unaddressable, so that any out-of-bound reads can be
9501detected. */
9502VALGRIND_MAKE_MEM_NOACCESS(code, (length - (code - codestart)) * sizeof(pcre_uchar));
9503#endif
9504
9505/* Fill in any forward references that are required. There may be repeated
9506references; optimize for them, as searching a large regex takes time. */
9507
9508if (cd->hwm > cd->start_workspace)
9509 {
9510 int prev_recno = -1;
9511 const pcre_uchar *groupptr = NULL;
9512 while (errorcode == 0 && cd->hwm > cd->start_workspace)
9513 {
9514 int offset, recno;
9515 cd->hwm -= LINK_SIZE;
9516 offset = GET(cd->hwm, 0);
9517
9518 /* Check that the hwm handling hasn't gone wrong. This whole area is
9519 rewritten in PCRE2 because there are some obscure cases. */
9520
9521 if (offset == 0 || codestart[offset-1] != OP_RECURSE)
9522 {
9523 errorcode = ERR10;
9524 break;
9525 }
9526
9527 recno = GET(codestart, offset);
9528 if (recno != prev_recno)
9529 {
9530 groupptr = PRIV(find_bracket)(codestart, utf, recno);
9531 prev_recno = recno;
9532 }
9533 if (groupptr == NULL) errorcode = ERR53;
9534 else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
9535 }
9536 }
9537
9538/* If the workspace had to be expanded, free the new memory. Set the pointer to
9539NULL to indicate that forward references have been filled in. */
9540
9541if (cd->workspace_size > COMPILE_WORK_SIZE)
9542 (PUBL(free))((void *)cd->start_workspace);
9543cd->start_workspace = NULL;
9544
9545/* Give an error if there's back reference to a non-existent capturing
9546subpattern. */
9547
9548if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
9549
9550/* Unless disabled, check whether any single character iterators can be
9551auto-possessified. The function overwrites the appropriate opcode values, so
9552the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
9553used in this code because at least one compiler gives a warning about loss of
9554"const" attribute if the cast (pcre_uchar *)codestart is used directly in the
9555function call. */
9556
9557if (errorcode == 0 && (options & PCRE_NO_AUTO_POSSESS) == 0)
9558 {
9559 pcre_uchar *temp = (pcre_uchar *)codestart;
9560 auto_possessify(temp, utf, cd);
9561 }
9562
9563/* If there were any lookbehind assertions that contained OP_RECURSE
9564(recursions or subroutine calls), a flag is set for them to be checked here,
9565because they may contain forward references. Actual recursions cannot be fixed
9566length, but subroutine calls can. It is done like this so that those without
9567OP_RECURSE that are not fixed length get a diagnosic with a useful offset. The
9568exceptional ones forgo this. We scan the pattern to check that they are fixed
9569length, and set their lengths. */
9570
9571if (errorcode == 0 && cd->check_lookbehind)
9572 {
9573 pcre_uchar *cc = (pcre_uchar *)codestart;
9574
9575 /* Loop, searching for OP_REVERSE items, and process those that do not have
9576 their length set. (Actually, it will also re-process any that have a length
9577 of zero, but that is a pathological case, and it does no harm.) When we find
9578 one, we temporarily terminate the branch it is in while we scan it. */
9579
9580 for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
9581 cc != NULL;
9582 cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
9583 {
9584 if (GET(cc, 1) == 0)
9585 {
9586 int fixed_length;
9587 pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
9588 int end_op = *be;
9589 *be = OP_END;
9590 fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
9591 cd, NULL);
9592 *be = end_op;
9593 DPRINTF(("fixed length = %d\n", fixed_length));
9594 if (fixed_length < 0)
9595 {
9596 errorcode = (fixed_length == -2)? ERR36 :
9597 (fixed_length == -4)? ERR70 : ERR25;
9598 break;
9599 }
9600 if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
9601 PUT(cc, 1, fixed_length);
9602 }
9603 cc += 1 + LINK_SIZE;
9604 }
9605 }
9606
9607/* Failed to compile, or error while post-processing */
9608
9609if (errorcode != 0)
9610 {
9611 (PUBL(free))(re);
9612 PCRE_EARLY_ERROR_RETURN:
9613 *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
9614 PCRE_EARLY_ERROR_RETURN2:
9615 *errorptr = find_error_text(errorcode);
9616 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
9617 return NULL;
9618 }
9619
9620/* If the anchored option was not passed, set the flag if we can determine that
9621the pattern is anchored by virtue of ^ characters or \A or anything else, such
9622as starting with non-atomic .* when DOTALL is set and there are no occurrences
9623of *PRUNE or *SKIP.
9624
9625Otherwise, if we know what the first byte has to be, save it, because that
9626speeds up unanchored matches no end. If not, see if we can set the
9627PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
9628start with ^. and also when all branches start with non-atomic .* for
9629non-DOTALL matches when *PRUNE and SKIP are not present. */
9630
9631if ((re->options & PCRE_ANCHORED) == 0)
9632 {
9633 if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED;
9634 else
9635 {
9636 if (firstcharflags < 0)
9637 firstchar = find_firstassertedchar(codestart, &firstcharflags, FALSE);
9638 if (firstcharflags >= 0) /* Remove caseless flag for non-caseable chars */
9639 {
9640#if defined COMPILE_PCRE8
9641 re->first_char = firstchar & 0xff;
9642#elif defined COMPILE_PCRE16
9643 re->first_char = firstchar & 0xffff;
9644#elif defined COMPILE_PCRE32
9645 re->first_char = firstchar;
9646#endif
9647 if ((firstcharflags & REQ_CASELESS) != 0)
9648 {
9649#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9650 /* We ignore non-ASCII first chars in 8 bit mode. */
9651 if (utf)
9652 {
9653 if (re->first_char < 128)
9654 {
9655 if (cd->fcc[re->first_char] != re->first_char)
9656 re->flags |= PCRE_FCH_CASELESS;
9657 }
9658 else if (UCD_OTHERCASE(re->first_char) != re->first_char)
9659 re->flags |= PCRE_FCH_CASELESS;
9660 }
9661 else
9662#endif
9663 if (MAX_255(re->first_char)
9664 && cd->fcc[re->first_char] != re->first_char)
9665 re->flags |= PCRE_FCH_CASELESS;
9666 }
9667
9668 re->flags |= PCRE_FIRSTSET;
9669 }
9670
9671 else if (is_startline(codestart, 0, cd, 0, FALSE)) re->flags |= PCRE_STARTLINE;
9672 }
9673 }
9674
9675/* For an anchored pattern, we use the "required byte" only if it follows a
9676variable length item in the regex. Remove the caseless flag for non-caseable
9677bytes. */
9678
9679if (reqcharflags >= 0 &&
9680 ((re->options & PCRE_ANCHORED) == 0 || (reqcharflags & REQ_VARY) != 0))
9681 {
9682#if defined COMPILE_PCRE8
9683 re->req_char = reqchar & 0xff;
9684#elif defined COMPILE_PCRE16
9685 re->req_char = reqchar & 0xffff;
9686#elif defined COMPILE_PCRE32
9687 re->req_char = reqchar;
9688#endif
9689 if ((reqcharflags & REQ_CASELESS) != 0)
9690 {
9691#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9692 /* We ignore non-ASCII first chars in 8 bit mode. */
9693 if (utf)
9694 {
9695 if (re->req_char < 128)
9696 {
9697 if (cd->fcc[re->req_char] != re->req_char)
9698 re->flags |= PCRE_RCH_CASELESS;
9699 }
9700 else if (UCD_OTHERCASE(re->req_char) != re->req_char)
9701 re->flags |= PCRE_RCH_CASELESS;
9702 }
9703 else
9704#endif
9705 if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
9706 re->flags |= PCRE_RCH_CASELESS;
9707 }
9708
9709 re->flags |= PCRE_REQCHSET;
9710 }
9711
9712/* Print out the compiled data if debugging is enabled. This is never the
9713case when building a production library. */
9714
9715#ifdef PCRE_DEBUG
9716printf("Length = %d top_bracket = %d top_backref = %d\n",
9717 length, re->top_bracket, re->top_backref);
9718
9719printf("Options=%08x\n", re->options);
9720
9721if ((re->flags & PCRE_FIRSTSET) != 0)
9722 {
9723 pcre_uchar ch = re->first_char;
9724 const char *caseless =
9725 ((re->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)";
9726 if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless);
9727 else printf("First char = \\x%02x%s\n", ch, caseless);
9728 }
9729
9730if ((re->flags & PCRE_REQCHSET) != 0)
9731 {
9732 pcre_uchar ch = re->req_char;
9733 const char *caseless =
9734 ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)";
9735 if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless);
9736 else printf("Req char = \\x%02x%s\n", ch, caseless);
9737 }
9738
9739#if defined COMPILE_PCRE8
9740pcre_printint((pcre *)re, stdout, TRUE);
9741#elif defined COMPILE_PCRE16
9742pcre16_printint((pcre *)re, stdout, TRUE);
9743#elif defined COMPILE_PCRE32
9744pcre32_printint((pcre *)re, stdout, TRUE);
9745#endif
9746
9747/* This check is done here in the debugging case so that the code that
9748was compiled can be seen. */
9749
9750if (code - codestart > length)
9751 {
9752 (PUBL(free))(re);
9753 *errorptr = find_error_text(ERR23);
9754 *erroroffset = ptr - (pcre_uchar *)pattern;
9755 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
9756 return NULL;
9757 }
9758#endif /* PCRE_DEBUG */
9759
9760/* Check for a pattern than can match an empty string, so that this information
9761can be provided to applications. */
9762
9763do
9764 {
9765 if (could_be_empty_branch(codestart, code, utf, cd, NULL))
9766 {
9767 re->flags |= PCRE_MATCH_EMPTY;
9768 break;
9769 }
9770 codestart += GET(codestart, 1);
9771 }
9772while (*codestart == OP_ALT);
9773
9774#if defined COMPILE_PCRE8
9775return (pcre *)re;
9776#elif defined COMPILE_PCRE16
9777return (pcre16 *)re;
9778#elif defined COMPILE_PCRE32
9779return (pcre32 *)re;
9780#endif
9781}
9782
9783/* End of pcre_compile.c */
9784
9785