1 | /************************************************* |
2 | * Perl-Compatible Regular Expressions * |
3 | *************************************************/ |
4 | |
5 | |
6 | /* PCRE is a library of functions to support regular expressions whose syntax |
7 | and semantics are as close as possible to those of the Perl 5 language. |
8 | |
9 | Written by Philip Hazel |
10 | Copyright (c) 1997-2016 University of Cambridge |
11 | |
12 | ----------------------------------------------------------------------------- |
13 | Redistribution and use in source and binary forms, with or without |
14 | modification, are permitted provided that the following conditions are met: |
15 | |
16 | * Redistributions of source code must retain the above copyright notice, |
17 | this list of conditions and the following disclaimer. |
18 | |
19 | * Redistributions in binary form must reproduce the above copyright |
20 | notice, this list of conditions and the following disclaimer in the |
21 | documentation and/or other materials provided with the distribution. |
22 | |
23 | * Neither the name of the University of Cambridge nor the names of its |
24 | contributors may be used to endorse or promote products derived from |
25 | this software without specific prior written permission. |
26 | |
27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | POSSIBILITY OF SUCH DAMAGE. |
38 | ----------------------------------------------------------------------------- |
39 | */ |
40 | |
41 | /* This header contains definitions that are shared between the different |
42 | modules, but which are not relevant to the exported API. This includes some |
43 | functions whose names all begin with "_pcre_", "_pcre16_" or "_pcre32_" |
44 | depending on the PRIV macro. */ |
45 | |
46 | #ifndef PCRE_INTERNAL_H |
47 | #define PCRE_INTERNAL_H |
48 | |
/* Debug switch: change the 0 below to 1 to define PCRE_DEBUG and obtain
debugging output on stdout. */

#if 0
#define PCRE_DEBUG
#endif

/* The 8-bit library is the default: COMPILE_PCRE8 is defined here unless the
build has explicitly requested the 16-bit or the 32-bit library. */

#if !(defined(COMPILE_PCRE16) || defined(COMPILE_PCRE32))
#define COMPILE_PCRE8
#endif
60 | |
/* Normalise the UTF feature macros. Not everybody uses "configure", so the
implications that script would enforce are repeated here:

  . SUPPORT_UCP requires SUPPORT_UTF;
  . SUPPORT_UTF8 (the older name, kept for compatibility with existing code)
    implies SUPPORT_UTF. */

#if !defined(SUPPORT_UTF) && (defined(SUPPORT_UCP) || defined(SUPPORT_UTF8))
#define SUPPORT_UTF 1
#endif

/* FIXME: SUPPORT_UTF8 should eventually disappear from the code. Until it
does, it is defined whenever SUPPORT_UTF is defined. */

#if defined(SUPPORT_UTF) && !defined(SUPPORT_UTF8)
#define SUPPORT_UTF8 1
#endif

/* EBCDIC and UTF-8/16/32 cannot both be supported at the same time. The
"configure" script prevents that combination from being selected, but because
not everybody uses "configure" it is rejected here as well. */

#if defined(EBCDIC) && defined(SUPPORT_UTF)
#error The use of both EBCDIC and SUPPORT_UTF is not supported.
#endif
88 | |
/* Debug output is funnelled through a macro so that call sites need no #ifdef
of their own (there were, at least at the time of writing, compilers that did
not like indented pre-processor statements). The single argument is a
complete, parenthesised printf() argument list, for example

  DPRINTF(("count=%d\n", n));

When PCRE_DEBUG is not defined the macro expands to nothing, so its argument
is never evaluated.

The Mac Debugging.h header also defines a macro called DPRINTF, so any earlier
definition is removed first to be absolutely sure we get this version. */

#undef DPRINTF
#ifndef PCRE_DEBUG
#define DPRINTF(p) /* Nothing */
#else
#define DPRINTF(p) printf p
#endif
103 | |
104 | |
105 | /* Standard C headers plus the external interface definition. The only time |
106 | setjmp and stdarg are used is when NO_RECURSE is set. */ |
107 | |
108 | #include <ctype.h> |
109 | #include <limits.h> |
110 | #include <stddef.h> |
111 | #include <stdio.h> |
112 | #include <stdlib.h> |
113 | #include <string.h> |
114 | |
115 | /* Valgrind (memcheck) support */ |
116 | |
117 | #ifdef SUPPORT_VALGRIND |
118 | #include <valgrind/memcheck.h> |
119 | #endif |
120 | |
/* Decoration of exported symbols. Three macros are used:

  PCRE_EXP_DECL       for declarations
  PCRE_EXP_DEFN       for definitions of exported functions
  PCRE_EXP_DATA_DEFN  for definitions of exported variables

When compiling a DLL for Windows, exported symbols need __declspec(dllexport)
(see http://msdn2.microsoft.com/en-us/library/y4h7bcy6(VS.80).aspx): without
"extern" it gives a definition, with "extern" a declaration. A static Windows
build (PCRE_STATIC) uses plain "extern" for declarations and nothing at all
for definitions. In Windows the two DEFN macros are always the same.

In non-Windows environments, "extern" in front of a variable definition leads
to compiler warnings, which is why functions and variables have separate DEFN
macros: PCRE_EXP_DATA_DEFN is empty, while PCRE_EXP_DEFN repeats
PCRE_EXP_DECL. Both are set only if they are not already defined, so that
uncommon, special-purpose environments can put other things in front of
exported symbols.

The whole section is skipped when PCRE_EXP_DECL is already defined. pcre.h
(included further down) defines only PCRE_EXP_DECL, which is all that an
application needs; pcretest is an application that includes this file in order
to peek at internals, and it includes pcre.h first to get an
application's-eye view. Otherwise the settings here take precedence over the
one in pcre.h. */

#ifndef PCRE_EXP_DECL
#  if defined(_WIN32)
#    ifdef PCRE_STATIC
#      define PCRE_EXP_DECL       extern
#      define PCRE_EXP_DEFN
#      define PCRE_EXP_DATA_DEFN
#    else
#      define PCRE_EXP_DECL       extern __declspec(dllexport)
#      define PCRE_EXP_DEFN       __declspec(dllexport)
#      define PCRE_EXP_DATA_DEFN  __declspec(dllexport)
#    endif
#  else
#    if defined(__cplusplus)
#      define PCRE_EXP_DECL       extern "C"
#    else
#      define PCRE_EXP_DECL       extern
#    endif
#    if !defined(PCRE_EXP_DEFN)
#      define PCRE_EXP_DEFN       PCRE_EXP_DECL
#    endif
#    if !defined(PCRE_EXP_DATA_DEFN)
#      define PCRE_EXP_DATA_DEFN
#    endif
#  endif
#endif
173 | |
/* With the MSVC compiler it is sometimes necessary to put a "calling
convention" in front of exported function names (this is secondhand
information), for example

  void __cdecl function(....)

To make this easy, every exported function has PCRE_CALL_CONVENTION just
before its name. It is rarely needed, so when the build has not set it, it is
defined here as empty and has no effect. */

#if !defined(PCRE_CALL_CONVENTION)
#define PCRE_CALL_CONVENTION
#endif
187 | |
/* Unsigned 8-bit and signed/unsigned 16- and 32-bit integer types. They
cannot be determined outside the compilation (for instance by running a
program from "configure") because PCRE is often cross-compiled for use on
other systems. Instead, the choice is made from the <limits.h> maxima, which
are available to the preprocessor in standard C environments. Compilation
stops with an error if no suitable 16-bit or 32-bit type is found. */

typedef unsigned char pcre_uint8;

#if (USHRT_MAX == 65535)
typedef unsigned short int pcre_uint16;
typedef short int pcre_int16;
#define PCRE_UINT16_MAX USHRT_MAX
#define PCRE_INT16_MAX SHRT_MAX
#elif (UINT_MAX == 65535)
typedef unsigned int pcre_uint16;
typedef int pcre_int16;
#define PCRE_UINT16_MAX UINT_MAX
#define PCRE_INT16_MAX INT_MAX
#else
#error Cannot determine a type for 16-bit integers
#endif

#if (UINT_MAX == 4294967295U)
typedef unsigned int pcre_uint32;
typedef int pcre_int32;
#define PCRE_UINT32_MAX UINT_MAX
#define PCRE_INT32_MAX INT_MAX
#elif (ULONG_MAX == 4294967295UL)
typedef unsigned long int pcre_uint32;
typedef long int pcre_int32;
#define PCRE_UINT32_MAX ULONG_MAX
#define PCRE_INT32_MAX LONG_MAX
#else
#error Cannot determine a type for 32-bit integers
#endif
223 | |
/* The integer overflow checks in pcre_compile() need to handle large
integers. INT64_OR_DOUBLE names the type used for that: int64_t when a 64-bit
integer type is known to be available, otherwise double (which of course
requires floating point arithmetic).

stdint.h is included when available, as it may define INT64_MAX; systems
without stdint.h (e.g. Solaris) may have inttypes.h instead. The macro int64_t
may be set by "configure". */

#if HAVE_STDINT_H
#include <stdint.h>
#elif defined(__SUNPRO_C) || HAVE_INTTYPES_H
#include <inttypes.h>
#endif

#if !defined(INT64_MAX) && !defined(int64_t)
#define INT64_OR_DOUBLE double
#else
#define INT64_OR_DOUBLE int64_t
#endif
243 | |
244 | /* All character handling must be done as unsigned characters. Otherwise there |
245 | are problems with top-bit-set characters and functions such as isspace(). |
246 | However, we leave the interface to the outside world as char * or short *, |
247 | because that should make things easier for callers. This character type is |
248 | called pcre_uchar. |
249 | |
250 | The IN_UCHARS macro multiply its argument with the byte size of the current |
251 | pcre_uchar type. Useful for memcpy and such operations, whose require the |
252 | byte size of their input/output buffers. |
253 | |
254 | The MAX_255 macro checks whether its pcre_uchar input is less than 256. |
255 | |
256 | The TABLE_GET macro is designed for accessing elements of tables whose contain |
257 | exactly 256 items. When the character is able to contain more than 256 |
258 | items, some check is needed before accessing these tables. |
259 | */ |
260 | |
261 | #if defined COMPILE_PCRE8 |
262 | |
263 | typedef unsigned char pcre_uchar; |
264 | #define IN_UCHARS(x) (x) |
265 | #define MAX_255(c) 1 |
266 | #define TABLE_GET(c, table, default) ((table)[c]) |
267 | |
268 | #elif defined COMPILE_PCRE16 |
269 | |
270 | #if USHRT_MAX != 65535 |
271 | /* This is a warning message. Change PCRE_UCHAR16 to a 16 bit data type in |
272 | pcre.h(.in) and disable (comment out) this message. */ |
273 | #error Warning: PCRE_UCHAR16 is not a 16 bit data type. |
274 | #endif |
275 | |
276 | typedef pcre_uint16 pcre_uchar; |
277 | #define UCHAR_SHIFT (1) |
278 | #define IN_UCHARS(x) ((x) * 2) |
279 | #define MAX_255(c) ((c) <= 255u) |
280 | #define TABLE_GET(c, table, default) (MAX_255(c)? ((table)[c]):(default)) |
281 | |
282 | #elif defined COMPILE_PCRE32 |
283 | |
284 | typedef pcre_uint32 pcre_uchar; |
285 | #define UCHAR_SHIFT (2) |
286 | #define IN_UCHARS(x) ((x) * 4) |
287 | #define MAX_255(c) ((c) <= 255u) |
288 | #define TABLE_GET(c, table, default) (MAX_255(c)? ((table)[c]):(default)) |
289 | |
290 | #else |
291 | #error Unsupported compiling mode |
292 | #endif /* COMPILE_PCRE[8|16|32] */ |
293 | |
/* An unsigned value that no character can ever have: UTF-8 characters only go
up to 0x7fffffff (and Unicode itself does not go beyond 0x0010ffff). */

#define NOTACHAR 0xFFFFFFFF

/* PCRE supports several different kinds of newline (CR, LF, CRLF, "any" and
"anycrlf" at present). These are the values of the nltype field that the
IS_NEWLINE and WAS_NEWLINE macros test. */

#define NLTYPE_FIXED    0     /* Newline is a fixed length string */
#define NLTYPE_ANY      1     /* Newline is any Unicode line ending */
#define NLTYPE_ANYCRLF  2     /* Newline is CR, LF, or CRLF */
309 | |
/* Newline tests. Both macros rely on names supplied by the module that uses
them: NLBLOCK is the data block holding the newline parameters (nltype, nllen
and nl[] are read from it), PSSTART and PSEND are the names of its
start-of-string and end-of-string fields, and a variable called utf must be in
scope. The argument p is evaluated more than once, so it must not have side
effects.

For a fixed newline (NLTYPE_FIXED) the one or two units in NLBLOCK->nl are
compared directly. For the other newline types the work is handed to
PRIV(is_newline) / PRIV(was_newline), which are given the address of
NLBLOCK->nllen (presumably to record the length of the newline that was
found - confirm against those functions). */

/* This macro checks for a newline at the given position */

#define IS_NEWLINE(p) \
  ((NLBLOCK->nltype != NLTYPE_FIXED)? \
    ((p) < NLBLOCK->PSEND && \
     PRIV(is_newline)((p), NLBLOCK->nltype, NLBLOCK->PSEND, \
       &(NLBLOCK->nllen), utf)) \
    : \
    ((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \
     UCHAR21TEST(p) == NLBLOCK->nl[0] && \
     (NLBLOCK->nllen == 1 || UCHAR21TEST((p) + 1) == NLBLOCK->nl[1]) \
    ) \
  )

/* This macro checks for a newline immediately preceding the given position */

#define WAS_NEWLINE(p) \
  ((NLBLOCK->nltype != NLTYPE_FIXED)? \
    ((p) > NLBLOCK->PSSTART && \
     PRIV(was_newline)((p), NLBLOCK->nltype, NLBLOCK->PSSTART, \
       &(NLBLOCK->nllen), utf)) \
    : \
    ((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \
     UCHAR21TEST((p) - NLBLOCK->nllen) == NLBLOCK->nl[0] && \
     (NLBLOCK->nllen == 1 || \
       UCHAR21TEST((p) - NLBLOCK->nllen + 1) == NLBLOCK->nl[1]) \
    ) \
  )
337 | |
/* Type of the subject pointer. When PCRE is compiled as a C++ library, the
subject pointer can be replaced with a custom type (CUSTOM_SUBJECT_PTR). This
makes it possible, for example, for pcre_exec() to process subject strings
that are discontinuous, by using a smart pointer class. Because of the way
pcre_exec() backtracks, it must always be possible to inspect all of the
subject string. In the normal case two macros are required, for
sign-unspecified and unsigned char pointers; the former is used for the
external interface and appears in pcre.h, which is why its name must begin
with PCRE_. */

#ifndef CUSTOM_SUBJECT_PTR
#define PCRE_PUCHAR const pcre_uchar *
#else
#define PCRE_PUCHAR CUSTOM_SUBJECT_PTR
#endif
352 | |
353 | /* Include the public PCRE header and the definitions of UCP character property |
354 | values. */ |
355 | |
356 | #include "pcre.h" |
357 | #include "ucp.h" |
358 | |
#if defined(COMPILE_PCRE32)
/* Compile-time check that the public PCRE_UCHAR32 is a 32-bit type: the array
size is -1, which is a compilation error, unless sizeof(PCRE_UCHAR32) is 4.
NOTE(review): the typedef name starts with a double underscore, which C
reserves for the implementation; it is left unchanged here. */
typedef int __assert_pcre_uchar32_size[(sizeof(PCRE_UCHAR32) == 4) ? 1 : -1];
#endif
363 | |
/* For use with the Virtual Pascal compiler, PCRE must be compiled with
-DVPCOMPAT on the command line; the string and memory functions listed below
are then redirected to names with a leading underscore. */

#ifdef VPCOMPAT
#define strlen(s)         _strlen(s)
#define strncmp(s1,s2,m)  _strncmp(s1,s2,m)
#define memcmp(s,c,n)     _memcmp(s,c,n)
#define memcpy(d,s,n)     _memcpy(d,s,n)
#define memmove(d,s,n)    _memmove(d,s,n)
#define memset(s,c,n)     _memset(s,c,n)
#else  /* VPCOMPAT */

/* SunOS4 and some other systems lack memmove() but have bcopy(). When
HAVE_MEMMOVE is not defined, memmove() becomes a macro: it maps onto bcopy()
if HAVE_BCOPY is defined, and otherwise onto the emulating function below
(there are some non-Unix environments that have neither). */

#ifndef HAVE_MEMMOVE
#undef  memmove           /* some systems may have a macro */
#ifdef HAVE_BCOPY
#define memmove(a, b, c) bcopy(b, a, c)
#else  /* HAVE_BCOPY */

/* Copies n bytes from s to d, where the two regions may overlap, and returns
d. When the destination lies above the source the bytes are copied from the
last one down to the first, so that source bytes are read before they are
overwritten; otherwise they are copied in ascending order. */

static void *
pcre_memmove(void *d, const void *s, size_t n)
{
unsigned char *to = (unsigned char *)d;
const unsigned char *from = (const unsigned char *)s;
size_t left;

if (to > from)
  {
  to += n;
  from += n;
  for (left = n; left > 0; left--) *(--to) = *(--from);
  }
else
  {
  for (left = n; left > 0; left--) *to++ = *from++;
  }

return d;
}
#define memmove(a, b, c) pcre_memmove(a, b, c)
#endif   /* not HAVE_BCOPY */
#endif   /* not HAVE_MEMMOVE */
#endif   /* not VPCOMPAT */
410 | |
411 | |
412 | /* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored |
413 | in big-endian order) by default. These are used, for example, to link from the |
414 | start of a subpattern to its alternatives and its end. The use of 2 bytes per |
415 | offset limits the size of the compiled regex to around 64K, which is big enough |
416 | for almost everybody. However, I received a request for an even bigger limit. |
417 | For this reason, and also to make the code easier to maintain, the storing and |
418 | loading of offsets from the byte string is now handled by the macros that are |
419 | defined here. |
420 | |
421 | The macros are controlled by the value of LINK_SIZE. This defaults to 2 in |
422 | the config.h file, but can be overridden by using -D on the command line. This |
423 | is automated on Unix systems via the "configure" command. */ |
424 | |
425 | #if defined COMPILE_PCRE8 |
426 | |
427 | #if LINK_SIZE == 2 |
428 | |
429 | #define PUT(a,n,d) \ |
430 | (a[n] = (d) >> 8), \ |
431 | (a[(n)+1] = (d) & 255) |
432 | |
433 | #define GET(a,n) \ |
434 | (((a)[n] << 8) | (a)[(n)+1]) |
435 | |
436 | #define MAX_PATTERN_SIZE (1 << 16) |
437 | |
438 | |
439 | #elif LINK_SIZE == 3 |
440 | |
441 | #define PUT(a,n,d) \ |
442 | (a[n] = (d) >> 16), \ |
443 | (a[(n)+1] = (d) >> 8), \ |
444 | (a[(n)+2] = (d) & 255) |
445 | |
446 | #define GET(a,n) \ |
447 | (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2]) |
448 | |
449 | #define MAX_PATTERN_SIZE (1 << 24) |
450 | |
451 | |
452 | #elif LINK_SIZE == 4 |
453 | |
454 | #define PUT(a,n,d) \ |
455 | (a[n] = (d) >> 24), \ |
456 | (a[(n)+1] = (d) >> 16), \ |
457 | (a[(n)+2] = (d) >> 8), \ |
458 | (a[(n)+3] = (d) & 255) |
459 | |
460 | #define GET(a,n) \ |
461 | (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3]) |
462 | |
463 | /* Keep it positive */ |
464 | #define MAX_PATTERN_SIZE (1 << 30) |
465 | |
466 | #else |
467 | #error LINK_SIZE must be either 2, 3, or 4 |
468 | #endif |
469 | |
470 | #elif defined COMPILE_PCRE16 |
471 | |
472 | #if LINK_SIZE == 2 |
473 | |
474 | /* Redefine LINK_SIZE as a multiple of sizeof(pcre_uchar) */ |
475 | #undef LINK_SIZE |
476 | #define LINK_SIZE 1 |
477 | |
478 | #define PUT(a,n,d) \ |
479 | (a[n] = (d)) |
480 | |
481 | #define GET(a,n) \ |
482 | (a[n]) |
483 | |
484 | #define MAX_PATTERN_SIZE (1 << 16) |
485 | |
486 | #elif LINK_SIZE == 3 || LINK_SIZE == 4 |
487 | |
488 | /* Redefine LINK_SIZE as a multiple of sizeof(pcre_uchar) */ |
489 | #undef LINK_SIZE |
490 | #define LINK_SIZE 2 |
491 | |
492 | #define PUT(a,n,d) \ |
493 | (a[n] = (d) >> 16), \ |
494 | (a[(n)+1] = (d) & 65535) |
495 | |
496 | #define GET(a,n) \ |
497 | (((a)[n] << 16) | (a)[(n)+1]) |
498 | |
499 | /* Keep it positive */ |
500 | #define MAX_PATTERN_SIZE (1 << 30) |
501 | |
502 | #else |
503 | #error LINK_SIZE must be either 2, 3, or 4 |
504 | #endif |
505 | |
506 | #elif defined COMPILE_PCRE32 |
507 | |
508 | /* Only supported LINK_SIZE is 4 */ |
509 | /* Redefine LINK_SIZE as a multiple of sizeof(pcre_uchar) */ |
510 | #undef LINK_SIZE |
511 | #define LINK_SIZE 1 |
512 | |
513 | #define PUT(a,n,d) \ |
514 | (a[n] = (d)) |
515 | |
516 | #define GET(a,n) \ |
517 | (a[n]) |
518 | |
519 | /* Keep it positive */ |
520 | #define MAX_PATTERN_SIZE (1 << 30) |
521 | |
522 | #else |
523 | #error Unsupported compiling mode |
524 | #endif /* COMPILE_PCRE[8|16|32] */ |
525 | |
526 | /* Convenience macro defined in terms of the others */ |
527 | |
528 | #define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE |
529 | |
530 | |
531 | /* PCRE uses some other 2-byte quantities that do not change when the size of |
532 | offsets changes. There are used for repeat counts and for other things such as |
533 | capturing parenthesis numbers in back references. */ |
534 | |
535 | #if defined COMPILE_PCRE8 |
536 | |
537 | #define IMM2_SIZE 2 |
538 | |
539 | #define PUT2(a,n,d) \ |
540 | a[n] = (d) >> 8; \ |
541 | a[(n)+1] = (d) & 255 |
542 | |
543 | /* For reasons that I do not understand, the expression in this GET2 macro is |
544 | treated by gcc as a signed expression, even when a is declared as unsigned. It |
545 | seems that any kind of arithmetic results in a signed value. */ |
546 | |
547 | #define GET2(a,n) \ |
548 | (unsigned int)(((a)[n] << 8) | (a)[(n)+1]) |
549 | |
550 | #elif defined COMPILE_PCRE16 |
551 | |
552 | #define IMM2_SIZE 1 |
553 | |
554 | #define PUT2(a,n,d) \ |
555 | a[n] = d |
556 | |
557 | #define GET2(a,n) \ |
558 | a[n] |
559 | |
560 | #elif defined COMPILE_PCRE32 |
561 | |
562 | #define IMM2_SIZE 1 |
563 | |
564 | #define PUT2(a,n,d) \ |
565 | a[n] = d |
566 | |
567 | #define GET2(a,n) \ |
568 | a[n] |
569 | |
570 | #else |
571 | #error Unsupported compiling mode |
572 | #endif /* COMPILE_PCRE[8|16|32] */ |
573 | |
574 | #define PUT2INC(a,n,d) PUT2(a,n,d), a += IMM2_SIZE |
575 | |
/* Upper limit on the length of a MARK name. At present it is the largest
value that fits in one data unit, capped at 16 bits: 255 in the 8-bit library
and 65535 in the 16-bit and 32-bit libraries. It may be changed in future to
be a fixed number of bytes or to depend on LINK_SIZE. */

#if !defined(COMPILE_PCRE16) && !defined(COMPILE_PCRE32)
#define MAX_MARK ((1u << 8) - 1)
#else
#define MAX_MARK ((1u << 16) - 1)
#endif
584 | |
/* Hooks for a proposed future "UTF-21" mode, in which only the lowest 21 bits
of a 32-bit character are interpreted as UTF and the remaining 11 high-order
bits are available to the application for other uses. In preparation for that
mode, data items are loaded through macros whose names start with UCHAR21,
which would mask the item to 21 bits in the special mode. In all other modes,
including the normal 32-bit library, they have the simple definitions below:
a plain load, and for the INC variants a load followed by a post-increment of
the pointer. When the new mode is implemented, it is expected that the
definitions will be varied using #ifdef in the library that supports it. */

#define UCHAR21(eptr)        ((eptr)[0])
#define UCHAR21TEST(eptr)    ((eptr)[0])
#define UCHAR21INC(eptr)     (*((eptr)++))
#define UCHAR21INCTEST(eptr) (*((eptr)++))
599 | |
600 | /* When UTF encoding is being used, a character is no longer just a single |
601 | byte in 8-bit mode or a single short in 16-bit mode. The macros for character |
602 | handling generate simple sequences when used in the basic mode, and more |
603 | complicated ones for UTF characters. GETCHARLENTEST and other macros are not |
604 | used when UTF is not supported. To make sure they can never even appear when |
605 | UTF support is omitted, we don't even define them. */ |
606 | |
607 | #ifndef SUPPORT_UTF |
608 | |
/* Without UTF support every character is exactly one data unit, so the
character-fetching macros are simple loads: c receives the unit at eptr, and
the INC variants then advance eptr by one. GETCHARLEN never touches len. Each
expansion ends with its own semicolon. The parameters are parenthesised so
that arbitrary expressions can be passed.

The macros listed only in comments are deliberately left undefined, so that
they cannot even appear in code compiled without UTF support. */

/* #define MAX_VALUE_FOR_SINGLE_CHAR */
/* #define HAS_EXTRALEN(c) */
/* #define GET_EXTRALEN(c) */
/* #define NOT_FIRSTCHAR(c) */
#define GETCHAR(c, eptr) (c) = *(eptr);
#define GETCHARTEST(c, eptr) (c) = *(eptr);
#define GETCHARINC(c, eptr) (c) = *(eptr)++;
#define GETCHARINCTEST(c, eptr) (c) = *(eptr)++;
#define GETCHARLEN(c, eptr, len) (c) = *(eptr);
/* #define GETCHARLENTEST(c, eptr, len) */
/* #define BACKCHAR(eptr) */
/* #define FORWARDCHAR(eptr) */
/* #define ACROSSCHAR(condition, eptr, action) */
622 | |
623 | #else /* SUPPORT_UTF */ |
624 | |
/* Tests whether the code point needs extra characters to decode: true when
the byte c is 0xc0 or above. */

#define HASUTF8EXTRALEN(c) ((c) >= 0xc0)
628 | |
/* Base macro to pick up the remaining bytes of a UTF-8 character, not
advancing the pointer. On entry c holds the first byte of the character (the
byte at eptr[0]; callers invoke the macro only when it is 0xc0 or above) and
on exit c holds the decoded value. The bits 0x20, 0x10, 0x08 and 0x04 of the
first byte select a sequence of 2, 3, 4 or 5 bytes; anything else is decoded
as 6 bytes. The following bytes are masked with 0x3f and not otherwise
checked. c must be a modifiable lvalue; both arguments are evaluated several
times. The expansion is a braced block. */

#define GETUTF8(c, eptr) \
    { \
    if (((c) & 0x20) == 0) \
      (c) = (((c) & 0x1f) << 6) | ((eptr)[1] & 0x3f); \
    else if (((c) & 0x10) == 0) \
      (c) = (((c) & 0x0f) << 12) | (((eptr)[1] & 0x3f) << 6) | \
          ((eptr)[2] & 0x3f); \
    else if (((c) & 0x08) == 0) \
      (c) = (((c) & 0x07) << 18) | (((eptr)[1] & 0x3f) << 12) | \
          (((eptr)[2] & 0x3f) << 6) | ((eptr)[3] & 0x3f); \
    else if (((c) & 0x04) == 0) \
      (c) = (((c) & 0x03) << 24) | (((eptr)[1] & 0x3f) << 18) | \
          (((eptr)[2] & 0x3f) << 12) | (((eptr)[3] & 0x3f) << 6) | \
          ((eptr)[4] & 0x3f); \
    else \
      (c) = (((c) & 0x01) << 30) | (((eptr)[1] & 0x3f) << 24) | \
          (((eptr)[2] & 0x3f) << 18) | (((eptr)[3] & 0x3f) << 12) | \
          (((eptr)[4] & 0x3f) << 6) | ((eptr)[5] & 0x3f); \
    }
650 | |
/* Base macro to pick up the remaining bytes of a UTF-8 character, advancing
the pointer. On entry c holds the first byte of the character and eptr already
points at the byte after it; on exit c holds the decoded value and eptr points
just past the last byte of the character. The sequence length is chosen from
the first byte exactly as in GETUTF8. c and eptr must be modifiable lvalues;
both are evaluated several times. The expansion is a braced block. */

#define GETUTF8INC(c, eptr) \
    { \
    if (((c) & 0x20) == 0) \
      (c) = (((c) & 0x1f) << 6) | (*(eptr)++ & 0x3f); \
    else if (((c) & 0x10) == 0) \
      { \
      (c) = (((c) & 0x0f) << 12) | ((*(eptr) & 0x3f) << 6) | \
          ((eptr)[1] & 0x3f); \
      (eptr) += 2; \
      } \
    else if (((c) & 0x08) == 0) \
      { \
      (c) = (((c) & 0x07) << 18) | ((*(eptr) & 0x3f) << 12) | \
          (((eptr)[1] & 0x3f) << 6) | ((eptr)[2] & 0x3f); \
      (eptr) += 3; \
      } \
    else if (((c) & 0x04) == 0) \
      { \
      (c) = (((c) & 0x03) << 24) | ((*(eptr) & 0x3f) << 18) | \
          (((eptr)[1] & 0x3f) << 12) | (((eptr)[2] & 0x3f) << 6) | \
          ((eptr)[3] & 0x3f); \
      (eptr) += 4; \
      } \
    else \
      { \
      (c) = (((c) & 0x01) << 30) | ((*(eptr) & 0x3f) << 24) | \
          (((eptr)[1] & 0x3f) << 18) | (((eptr)[2] & 0x3f) << 12) | \
          (((eptr)[3] & 0x3f) << 6) | ((eptr)[4] & 0x3f); \
      (eptr) += 5; \
      } \
    }
684 | |
685 | #if defined COMPILE_PCRE8 |
686 | |
/* These macros were originally written in the form of loops that used data
from the tables whose names start with PRIV(utf8_table). They were rewritten by
a user so as not to use loops, because in some environments this gives a
significant performance advantage, and it seems never to do any harm. */

/* Tells the biggest code point which can be encoded as a single character. */

#define MAX_VALUE_FOR_SINGLE_CHAR 127

/* Tests whether the code point needs extra characters to decode. */

#define HAS_EXTRALEN(c) ((c) >= 0xc0)

/* Returns with the additional number of characters if HAS_EXTRALEN(c) is TRUE.
Otherwise it has an undefined behaviour. The value is read from
PRIV(utf8_table4), indexed by the low six bits of c. */

#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3f])

/* Returns TRUE, if the given character is not the first character
of a UTF sequence. */

#define NOT_FIRSTCHAR(c) (((c) & 0xc0) == 0x80)
709 | |
/* Character fetching for the 8-bit library with UTF support. Every macro
first loads the byte at eptr into c and, when that byte is 0xc0 or above,
completes the character with GETUTF8 or GETUTF8INC. The TEST variants do so
only when the variable utf (which must be in scope) is true.

Each macro expands to two statements, the second ending with its own
semicolon, so it must be used as a statement inside a braced block, never as
the unbraced body of an "if" or a loop. c and eptr are evaluated more than
once; c must be a modifiable lvalue, and so must eptr for the INC variants. */

/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. */

#define GETCHAR(c, eptr) \
  (c) = *(eptr); \
  if ((c) >= 0xc0) GETUTF8(c, eptr);

/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
pointer. */

#define GETCHARTEST(c, eptr) \
  (c) = *(eptr); \
  if (utf && (c) >= 0xc0) GETUTF8(c, eptr);

/* Get the next UTF-8 character, advancing the pointer. This is called when we
know we are in UTF-8 mode. */

#define GETCHARINC(c, eptr) \
  (c) = *(eptr)++; \
  if ((c) >= 0xc0) GETUTF8INC(c, eptr);

/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-8 mode. */

#define GETCHARINCTEST(c, eptr) \
  (c) = *(eptr)++; \
  if (utf && (c) >= 0xc0) GETUTF8INC(c, eptr);
737 | |
/* Base macro to pick up the remaining bytes of a UTF-8 character, not
advancing the pointer, incrementing the length. On entry c holds the first
byte of the character at eptr; on exit c holds the decoded value and len has
been increased by the number of additional bytes (1 to 5). The sequence length
is chosen from the bits 0x20, 0x10, 0x08 and 0x04 of the first byte. c and len
must be modifiable lvalues; c and eptr are evaluated several times. The
expansion is a braced block. */

#define GETUTF8LEN(c, eptr, len) \
    { \
    if (((c) & 0x20) == 0) \
      { \
      (c) = (((c) & 0x1f) << 6) | ((eptr)[1] & 0x3f); \
      (len)++; \
      } \
    else if (((c) & 0x10) == 0) \
      { \
      (c) = (((c) & 0x0f) << 12) | (((eptr)[1] & 0x3f) << 6) | \
          ((eptr)[2] & 0x3f); \
      (len) += 2; \
      } \
    else if (((c) & 0x08) == 0) \
      { \
      (c) = (((c) & 0x07) << 18) | (((eptr)[1] & 0x3f) << 12) | \
          (((eptr)[2] & 0x3f) << 6) | ((eptr)[3] & 0x3f); \
      (len) += 3; \
      } \
    else if (((c) & 0x04) == 0) \
      { \
      (c) = (((c) & 0x03) << 24) | (((eptr)[1] & 0x3f) << 18) | \
          (((eptr)[2] & 0x3f) << 12) | (((eptr)[3] & 0x3f) << 6) | \
          ((eptr)[4] & 0x3f); \
      (len) += 4; \
      } \
    else \
      { \
      (c) = (((c) & 0x01) << 30) | (((eptr)[1] & 0x3f) << 24) | \
          (((eptr)[2] & 0x3f) << 18) | (((eptr)[3] & 0x3f) << 12) | \
          (((eptr)[4] & 0x3f) << 6) | ((eptr)[5] & 0x3f); \
      (len) += 5; \
      } \
    }

/* Get the next UTF-8 character, not advancing the pointer, incrementing length
if there are extra bytes. This is called when we know we are in UTF-8 mode.
Like GETCHARLENTEST below, it expands to two statements, the second ending
with its own semicolon, so it must be used as a statement inside a braced
block. */

#define GETCHARLEN(c, eptr, len) \
  (c) = *(eptr); \
  if ((c) >= 0xc0) GETUTF8LEN(c, eptr, len);

/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
pointer, incrementing length if there are extra bytes. This is called when we
do not know if we are in UTF-8 mode; the variable utf must be in scope. */

#define GETCHARLENTEST(c, eptr, len) \
  (c) = *(eptr); \
  if (utf && (c) >= 0xc0) GETUTF8LEN(c, eptr, len);
789 | |
/* If the pointer is not at the start of a character, move it back until
it is: eptr is decremented while the byte it points at has the form 10xxxxxx.
This is called only in UTF-8 mode - we don't put a test within the macro
because almost all calls are already within a block of UTF-8 only code. There
is no bounds check. eptr must be a modifiable lvalue. */

#define BACKCHAR(eptr) while((*(eptr) & 0xc0) == 0x80) (eptr)--

/* Same as above, just in the other direction. */
#define FORWARDCHAR(eptr) while((*(eptr) & 0xc0) == 0x80) (eptr)++

/* Same as above, but it allows a fully customizable form: action is repeated
while condition holds and the second argument has the form 10xxxxxx. Note that
the second argument is used as a value and is not dereferenced by the macro,
so a call passes the byte itself (for example *eptr) rather than the
pointer. */
#define ACROSSCHAR(condition, eptr, action) \
  while((condition) && ((eptr) & 0xc0) == 0x80) action
802 | |
803 | #elif defined COMPILE_PCRE16 |
804 | |
/* Largest code point that is encoded as a single 16-bit unit. */

#define MAX_VALUE_FOR_SINGLE_CHAR 65535

/* True when the unit c needs a further unit to be decoded, that is, when its
top six bits are those of 0xd800. */

#define HAS_EXTRALEN(c) (0xd800 == ((c) & 0xfc00))

/* Number of additional units that follow a unit for which HAS_EXTRALEN(c) is
true. The argument is not examined, so the value is meaningless for any other
unit. */

#define GET_EXTRALEN(c) 1

/* True when the unit c is not the first unit of a UTF sequence, that is, when
its top six bits are those of 0xdc00. */

#define NOT_FIRSTCHAR(c) (0xdc00 == ((c) & 0xfc00))
822 | |
/* Base macro to pick up the low surrogate of a UTF-16 character, not
advancing the pointer.

  c     lvalue holding the high surrogate on entry, the combined code point
        on exit
  eptr  pointer to the high surrogate; the unit at index 1 is read

The pointer argument is parenthesized before it is indexed so that an
expression such as "p + 1" is treated as a single operand. */

#define GETUTF16(c, eptr) \
  { c = (((c & 0x3ff) << 10) | ((eptr)[1] & 0x3ff)) + 0x10000; }
828 | |
/* Get the next UTF-16 character, not advancing the pointer. This is called when
we know we are in UTF-16 mode.

  c     lvalue that receives the character
  eptr  pointer to the first code unit; it is not modified

When the first unit has the high-surrogate bit pattern, GETUTF16 combines it
with the following unit. The pointer argument is parenthesized wherever it is
used so that an expression such as "p + 1" is treated as a single operand. The
expansion supplies its own terminating semicolon. */

#define GETCHAR(c, eptr) \
  c = *(eptr); \
  if ((c & 0xfc00) == 0xd800) GETUTF16(c, (eptr));
835 | |
/* Get the next UTF-16 character, testing for UTF-16 mode, and not advancing the
pointer.

  c     lvalue that receives the character
  eptr  pointer to the first code unit; it is not modified

A variable named "utf" must be in scope at the point of use; when it is zero
the first code unit is returned unchanged. The pointer argument is
parenthesized wherever it is used so that an expression such as "p + 1" is
treated as a single operand. The expansion supplies its own terminating
semicolon. */

#define GETCHARTEST(c, eptr) \
  c = *(eptr); \
  if (utf && (c & 0xfc00) == 0xd800) GETUTF16(c, (eptr));
842 | |
/* Base macro to pick up the low surrogate of a UTF-16 character, advancing
the pointer.

  c     lvalue holding the high surrogate on entry, the combined code point
        on exit
  eptr  pointer lvalue addressing the low surrogate; it is advanced by one

The pointer argument is parenthesized so that an lvalue expression such as
"*pp" is dereferenced and incremented as a whole. */

#define GETUTF16INC(c, eptr) \
  { c = (((c & 0x3ff) << 10) | (*(eptr)++ & 0x3ff)) + 0x10000; }
848 | |
/* Get the next UTF-16 character, advancing the pointer. This is called when we
know we are in UTF-16 mode.

  c     lvalue that receives the character
  eptr  pointer lvalue; it is advanced past every code unit consumed

The pointer argument is parenthesized wherever it is used so that an lvalue
expression such as "*pp" is dereferenced and incremented as a whole. The
expansion supplies its own terminating semicolon. */

#define GETCHARINC(c, eptr) \
  c = *(eptr)++; \
  if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, (eptr));
855 | |
/* Get the next character, testing for UTF-16 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-16 mode.

  c     lvalue that receives the character
  eptr  pointer lvalue; it is advanced past every code unit consumed

A variable named "utf" must be in scope at the point of use; when it is zero
exactly one code unit is consumed and returned unchanged. The pointer argument
is parenthesized wherever it is used so that an lvalue expression such as "*pp"
is dereferenced and incremented as a whole. The expansion supplies its own
terminating semicolon. */

#define GETCHARINCTEST(c, eptr) \
  c = *(eptr)++; \
  if (utf && (c & 0xfc00) == 0xd800) GETUTF16INC(c, (eptr));
862 | |
/* Base macro to pick up the low surrogate of a UTF-16 character, not
advancing the pointer, incrementing the length.

  c     lvalue holding the high surrogate on entry, the combined code point
        on exit
  eptr  pointer to the high surrogate; the unit at index 1 is read
  len   lvalue that is incremented by one

The pointer argument is parenthesized before it is indexed so that an
expression such as "p + 1" is treated as a single operand. */

#define GETUTF16LEN(c, eptr, len) \
  { c = (((c & 0x3ff) << 10) | ((eptr)[1] & 0x3ff)) + 0x10000; len++; }
868 | |
/* Get the next UTF-16 character, not advancing the pointer, incrementing
length if there is a low surrogate. This is called when we know we are in
UTF-16 mode.

  c     lvalue that receives the character
  eptr  pointer to the first code unit; it is not modified
  len   lvalue handed on to GETUTF16LEN when a surrogate pair is decoded

The pointer argument is parenthesized wherever it is used so that an expression
such as "p + 1" is treated as a single operand. The expansion supplies its own
terminating semicolon. */

#define GETCHARLEN(c, eptr, len) \
  c = *(eptr); \
  if ((c & 0xfc00) == 0xd800) GETUTF16LEN(c, (eptr), len);
876 | |
/* Get the next UTF-16 character, testing for UTF-16 mode, not advancing the
pointer, incrementing length if there is a low surrogate. This is called when
we do not know if we are in UTF-16 mode.

  c     lvalue that receives the character
  eptr  pointer to the first code unit; it is not modified
  len   lvalue handed on to GETUTF16LEN when a surrogate pair is decoded

A variable named "utf" must be in scope at the point of use; when it is zero
the first code unit is returned unchanged. The pointer argument is
parenthesized wherever it is used so that an expression such as "p + 1" is
treated as a single operand. The expansion supplies its own terminating
semicolon. */

#define GETCHARLENTEST(c, eptr, len) \
  c = *(eptr); \
  if (utf && (c & 0xfc00) == 0xd800) GETUTF16LEN(c, (eptr), len);
884 | |
/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-16 mode - we don't put a test within the
macro because almost all calls are already within a block of UTF-16 only
code.

The pointer is stepped back by one when it addresses a code unit with the
low-surrogate bit pattern. The test is wrapped in do { } while (0) so that the
macro behaves as one statement: a following "else" can no longer attach to the
macro's own "if". The argument is parenthesized so that an lvalue expression
such as "*pp" is handled as a whole. The invocation must be followed by a
semicolon. */

#define BACKCHAR(eptr) \
  do { if ((*(eptr) & 0xfc00) == 0xdc00) (eptr)--; } while (0)
891 | |
/* If the pointer is not at the start of a character, move it forward until it
is: the pointer is advanced by one when it addresses a code unit with the
low-surrogate bit pattern. The test is wrapped in do { } while (0) so that a
following "else" cannot attach to the macro's own "if", and the argument is
parenthesized so that an lvalue expression such as "*pp" is handled as a whole.
The invocation must be followed by a semicolon. */
#define FORWARDCHAR(eptr) \
  do { if ((*(eptr) & 0xfc00) == 0xdc00) (eptr)++; } while (0)
894 | |
/* Fully customizable stepping over a UTF-16 low surrogate. When "condition"
holds and "eptr" has the low-surrogate bit pattern, "action" is executed once.
Note that "eptr" is used as a value here (it is masked, not dereferenced).

The test is wrapped in do { } while (0) so that the macro behaves as one
statement and a following "else" cannot attach to its "if". Consequently
"action" must not rely on "break" or "continue" reaching an enclosing loop, and
the invocation must be followed by a semicolon. */
#define ACROSSCHAR(condition, eptr, action) \
  do { if ((condition) && ((eptr) & 0xfc00) == 0xdc00) { action; } } while (0)
898 | |
899 | #elif defined COMPILE_PCRE32 |
900 | |
/* Every UTF-32 character occupies exactly one pcre_uchar unit, so the
classification macros reduce to constants in the 32-bit library: no unit ever
needs extra units, and no unit is ever a non-first unit. */
#define MAX_VALUE_FOR_SINGLE_CHAR (0x10ffffu)
#define HAS_EXTRALEN(unit) (0)
#define GET_EXTRALEN(unit) (0)
#define NOT_FIRSTCHAR(unit) (0)
907 | |
/* Character fetch macros for the 32-bit library. A UTF-32 character is always
a single unit, so none of these needs to test for UTF mode or to decode
anything, and the length argument of the LEN variants is never touched. Each
expansion supplies its own terminating semicolon. */

/* Fetch the character at ptr without advancing (UTF-32 mode known). */

#define GETCHAR(ch, ptr) ch = *(ptr);

/* Fetch the character at ptr without advancing (UTF-32 mode not known). */

#define GETCHARTEST(ch, ptr) ch = *(ptr);

/* Fetch the character at ptr and advance ptr by one (UTF-32 mode known). */

#define GETCHARINC(ch, ptr) ch = *((ptr)++);

/* Fetch the character at ptr and advance ptr by one (UTF-32 mode not
known). */

#define GETCHARINCTEST(ch, ptr) ch = *((ptr)++);

/* Fetch the character at ptr without advancing and without changing the
length, since every UTF-32 character has length 1 (UTF-32 mode known). */

#define GETCHARLEN(ch, ptr, len) GETCHAR(ch, ptr)

/* Fetch the character at ptr without advancing and without changing the
length, since every UTF-32 character has length 1 (UTF-32 mode not known). */

#define GETCHARLENTEST(ch, ptr, len) GETCHARTEST(ch, ptr)
945 | |
/* Pointer re-synchronization macros for the 32-bit library. In the other
libraries these move a pointer to the start of a character; here every UTF-32
character fits into one pcre_uchar, so a pointer is always at the start of a
character and all three macros are no-ops. Their arguments are not
evaluated. */

#define BACKCHAR(ptr) do { } while (0)

#define FORWARDCHAR(ptr) do { } while (0)

#define ACROSSCHAR(condition, unit, action) do { } while (0)
959 | |
960 | #else |
961 | #error Unsupported compiling mode |
962 | #endif /* COMPILE_PCRE[8|16|32] */ |
963 | |
964 | #endif /* SUPPORT_UTF */ |
965 | |
966 | /* Tests for Unicode horizontal and vertical whitespace characters must check a |
967 | number of different values. Using a switch statement for this generates the |
968 | fastest code (no loop, no memory access), and there are several places in the |
969 | interpreter code where this happens. In order to ensure that all the case lists |
970 | remain in step, we use macros so that there is only one place where the lists |
971 | are defined. |
972 | |
973 | These values are also required as lists in pcre_compile.c when processing \h, |
974 | \H, \v and \V in a character class. The lists are defined in pcre_tables.c, but |
975 | macros that define the values are here so that all the definitions are |
976 | together. The lists must be in ascending character order, terminated by |
977 | NOTACHAR (which is 0xffffffff). |
978 | |
979 | Any changes should ensure that the various macros are kept in step with each |
980 | other. NOTE: The values also appear in pcre_jit_compile.c. */ |
981 | |
982 | /* ------ ASCII/Unicode environments ------ */ |
983 | |
984 | #ifndef EBCDIC |
985 | |
/* Horizontal whitespace. The case macros deliberately omit the colon after
their final label so that the user writes "HSPACE_CASES:" in a switch. The
list and the case macros must name exactly the same code points. */

#define HSPACE_BYTE_CASES \
  case CHAR_HT: \
  case CHAR_SPACE: \
  case CHAR_NBSP

#define HSPACE_MULTIBYTE_CASES \
  case 0x1680:  /* OGHAM SPACE MARK */ \
  case 0x180e:  /* MONGOLIAN VOWEL SEPARATOR */ \
  case 0x2000:  /* EN QUAD */ \
  case 0x2001:  /* EM QUAD */ \
  case 0x2002:  /* EN SPACE */ \
  case 0x2003:  /* EM SPACE */ \
  case 0x2004:  /* THREE-PER-EM SPACE */ \
  case 0x2005:  /* FOUR-PER-EM SPACE */ \
  case 0x2006:  /* SIX-PER-EM SPACE */ \
  case 0x2007:  /* FIGURE SPACE */ \
  case 0x2008:  /* PUNCTUATION SPACE */ \
  case 0x2009:  /* THIN SPACE */ \
  case 0x200a:  /* HAIR SPACE */ \
  case 0x202f:  /* NARROW NO-BREAK SPACE */ \
  case 0x205f:  /* MEDIUM MATHEMATICAL SPACE */ \
  case 0x3000   /* IDEOGRAPHIC SPACE */

#define HSPACE_CASES \
  HSPACE_BYTE_CASES: \
  HSPACE_MULTIBYTE_CASES

/* The same code points as an initializer list, in ascending order and
terminated by NOTACHAR. */

#define HSPACE_LIST \
  CHAR_HT, CHAR_SPACE, CHAR_NBSP, \
  0x1680, 0x180e, \
  0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, \
  0x2006, 0x2007, 0x2008, 0x2009, 0x200a, \
  0x202f, 0x205f, 0x3000, \
  NOTACHAR
1018 | |
/* Vertical whitespace. The case macros deliberately omit the colon after
their final label so that the user writes "VSPACE_CASES:" in a switch. The
list and the case macros must name exactly the same code points. */

#define VSPACE_BYTE_CASES \
  case CHAR_LF: \
  case CHAR_VT: \
  case CHAR_FF: \
  case CHAR_CR: \
  case CHAR_NEL

#define VSPACE_MULTIBYTE_CASES \
  case 0x2028:  /* LINE SEPARATOR */ \
  case 0x2029   /* PARAGRAPH SEPARATOR */

#define VSPACE_CASES \
  VSPACE_BYTE_CASES: \
  VSPACE_MULTIBYTE_CASES

/* The same code points as an initializer list, terminated by NOTACHAR. */

#define VSPACE_LIST \
  CHAR_LF, CHAR_VT, CHAR_FF, CHAR_CR, CHAR_NEL, \
  0x2028, 0x2029, \
  NOTACHAR
1036 | |
1037 | /* ------ EBCDIC environments ------ */ |
1038 | |
1039 | #else |
/* In an EBCDIC environment only single-byte whitespace is recognized, so the
complete case macros are the same as the byte case macros. */

#define HSPACE_BYTE_CASES \
  case CHAR_HT: \
  case CHAR_SPACE: \
  case CHAR_NBSP

#define HSPACE_CASES HSPACE_BYTE_CASES

#define HSPACE_LIST CHAR_HT, CHAR_SPACE, CHAR_NBSP, NOTACHAR

#define VSPACE_BYTE_CASES \
  case CHAR_LF: \
  case CHAR_VT: \
  case CHAR_FF: \
  case CHAR_CR: \
  case CHAR_NEL

#define VSPACE_CASES VSPACE_BYTE_CASES

/* The relative order of LF and NEL in the list depends on which of the two
code points the build has selected for the newline character. */

#ifndef EBCDIC_NL25
#define VSPACE_LIST \
  CHAR_VT, CHAR_FF, CHAR_CR, CHAR_LF, CHAR_NEL, NOTACHAR
#else
#define VSPACE_LIST \
  CHAR_VT, CHAR_FF, CHAR_CR, CHAR_NEL, CHAR_LF, NOTACHAR
#endif
1065 | #endif /* EBCDIC */ |
1066 | |
1067 | /* ------ End of whitespace macros ------ */ |
1068 | |
1069 | |
1070 | |
1071 | /* Private flags containing information about the compiled regex. They used to |
1072 | live at the top end of the options word, but that got almost full, so they were |
1073 | moved to a 16-bit flags word - which got almost full, so now they are in a |
1074 | 32-bit flags word. From release 8.00, PCRE_NOPARTIAL is unused, as the |
1075 | restrictions on partial matching have been lifted. It remains for backwards |
1076 | compatibility. */ |
1077 | |
/* Library width the pattern was compiled for. */
#define PCRE_MODE8         0x00000001  /* compiled in 8 bit mode */
#define PCRE_MODE16        0x00000002  /* compiled in 16 bit mode */
#define PCRE_MODE32        0x00000004  /* compiled in 32 bit mode */

/* Facts recorded about the compiled pattern. The value 0x00000008 is not
assigned here. */
#define PCRE_FIRSTSET      0x00000010  /* first_char is set */
#define PCRE_FCH_CASELESS  0x00000020  /* caseless first char */
#define PCRE_REQCHSET      0x00000040  /* req_byte is set */
#define PCRE_RCH_CASELESS  0x00000080  /* caseless requested char */
#define PCRE_STARTLINE     0x00000100  /* start after \n for multiline */
#define PCRE_NOPARTIAL     0x00000200  /* can't use partial with this regex */
#define PCRE_JCHANGED      0x00000400  /* j option used in regex */
#define PCRE_HASCRORLF     0x00000800  /* explicit \r or \n in pattern */
#define PCRE_HASTHEN       0x00001000  /* pattern contains (*THEN) */
#define PCRE_MLSET         0x00002000  /* match limit set by regex */
#define PCRE_RLSET         0x00004000  /* recursion limit set by regex */
#define PCRE_MATCH_EMPTY   0x00008000  /* pattern can match empty string */
1093 | |
/* PCRE_MODE is the mode flag of the library currently being compiled. It is
left undefined when none of the COMPILE_PCREx macros is defined. */
#if defined(COMPILE_PCRE8)
#  define PCRE_MODE PCRE_MODE8
#elif defined(COMPILE_PCRE16)
#  define PCRE_MODE PCRE_MODE16
#elif defined(COMPILE_PCRE32)
#  define PCRE_MODE PCRE_MODE32
#endif

/* Mask that selects all three mode flags. */
#define PCRE_MODE_MASK (PCRE_MODE8 | PCRE_MODE16 | PCRE_MODE32)
1102 | |
/* Flag bits used in the "extra" block that pcre_study() produces. */

#define PCRE_STUDY_MAPPED  0x0001  /* a map of starting chars exists */
#define PCRE_STUDY_MINLEN  0x0002  /* a minimum length field exists */
1107 | |
/* Masks of the public option bits that are accepted by each entry point:
compiling, matching with the standard and the DFA matcher, studying, and
matching via JIT. PCRE_NEWLINE_BITS gathers the newline convention bits and is
part of several of the other masks. */

#define PCRE_NEWLINE_BITS \
  (PCRE_NEWLINE_CR | PCRE_NEWLINE_LF | PCRE_NEWLINE_ANY | PCRE_NEWLINE_ANYCRLF)

#define PUBLIC_COMPILE_OPTIONS \
  (PCRE_CASELESS | PCRE_EXTENDED | PCRE_ANCHORED | PCRE_MULTILINE | \
   PCRE_DOTALL | PCRE_DOLLAR_ENDONLY | PCRE_EXTRA | PCRE_UNGREEDY | \
   PCRE_UTF8 | PCRE_NO_AUTO_CAPTURE | PCRE_NO_AUTO_POSSESS | \
   PCRE_NO_UTF8_CHECK | PCRE_AUTO_CALLOUT | PCRE_FIRSTLINE | \
   PCRE_DUPNAMES | PCRE_NEWLINE_BITS | PCRE_BSR_ANYCRLF | \
   PCRE_BSR_UNICODE | PCRE_JAVASCRIPT_COMPAT | PCRE_UCP | \
   PCRE_NO_START_OPTIMIZE | PCRE_NEVER_UTF)

#define PUBLIC_EXEC_OPTIONS \
  (PCRE_ANCHORED | PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | \
   PCRE_NOTEMPTY_ATSTART | PCRE_NO_UTF8_CHECK | PCRE_PARTIAL_HARD | \
   PCRE_PARTIAL_SOFT | PCRE_NEWLINE_BITS | PCRE_BSR_ANYCRLF | \
   PCRE_BSR_UNICODE | PCRE_NO_START_OPTIMIZE)

#define PUBLIC_DFA_EXEC_OPTIONS \
  (PCRE_ANCHORED | PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | \
   PCRE_NOTEMPTY_ATSTART | PCRE_NO_UTF8_CHECK | PCRE_PARTIAL_HARD | \
   PCRE_PARTIAL_SOFT | PCRE_DFA_SHORTEST | PCRE_DFA_RESTART | \
   PCRE_NEWLINE_BITS | PCRE_BSR_ANYCRLF | PCRE_BSR_UNICODE | \
   PCRE_NO_START_OPTIMIZE)

#define PUBLIC_STUDY_OPTIONS \
  (PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE | \
   PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE | PCRE_STUDY_EXTRA_NEEDED)

#define PUBLIC_JIT_EXEC_OPTIONS \
  (PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | \
   PCRE_NOTEMPTY_ATSTART | PCRE_PARTIAL_SOFT | PCRE_PARTIAL_HARD)
1140 | |
/* Magic number that provides a small check against being handed junk, and the
same value with its four bytes in the opposite order, which is used to detect
a loaded regular expression of different endianness. */

#define MAGIC_NUMBER           0x50435245UL  /* 'PCRE' */
#define REVERSED_MAGIC_NUMBER  0x45524350UL  /* 'ERCP' */

/* The maximum remaining length of subject we are prepared to search for a
req_byte match. */

#define REQ_BYTE_MAX           1000
1154 | |
/* Miscellaneous definitions. The #ifndef tests are to pacify compiler warnings
in environments where these macros are defined elsewhere. Unfortunately, there
is no way to do the same for the typedef.

FALSE and TRUE are guarded independently, so that an environment which supplies
only one of the two still ends up with both defined, and neither is ever
redefined. */

typedef int BOOL;

#ifndef FALSE
#define FALSE 0
#endif

#ifndef TRUE
#define TRUE 1
#endif
1165 | |
1166 | /* If PCRE is to support UTF-8 on EBCDIC platforms, we cannot use normal |
1167 | character constants like '*' because the compiler would emit their EBCDIC code, |
1168 | which is different from their ASCII/UTF-8 code. Instead we define macros for |
1169 | the characters so that they always use the ASCII/UTF-8 code when UTF-8 support |
1170 | is enabled. When UTF-8 support is not enabled, the definitions use character |
1171 | literals. Both character and string versions of each character are needed, and |
1172 | there are some longer strings as well. |
1173 | |
1174 | This means that, on EBCDIC platforms, the PCRE library can handle either |
1175 | EBCDIC, or UTF-8, but not both. To support both in the same compiled library |
1176 | would need different lookups depending on whether PCRE_UTF8 was set or not. |
1177 | This would make it impossible to use characters in switch/case statements, |
1178 | which would reduce performance. For a theoretical use (which nobody has asked |
1179 | for) in a minority area (EBCDIC platforms), this is not sensible. Any |
1180 | application that did need both could compile two versions of the library, using |
1181 | macros to give the functions distinct names. */ |
1182 | |
1183 | #ifndef SUPPORT_UTF |
1184 | |
1185 | /* UTF-8 support is not enabled; use the platform-dependent character literals |
1186 | so that PCRE works in both ASCII and EBCDIC environments, but only in non-UTF |
1187 | mode. Newline characters are problematic in EBCDIC. Though it has CR and LF |
1188 | characters, a common practice has been to use its NL (0x15) character as the |
1189 | line terminator in C-like processing environments. However, sometimes the LF |
1190 | (0x25) character is used instead, according to this Unicode document: |
1191 | |
1192 | http://unicode.org/standard/reports/tr13/tr13-5.html |
1193 | |
1194 | PCRE defaults EBCDIC NL to 0x15, but has a build-time option to select 0x25 |
1195 | instead. Whichever is *not* chosen is defined as NEL. |
1196 | |
1197 | In both ASCII and EBCDIC environments, CHAR_NL and CHAR_LF are synonyms for the |
1198 | same code point. */ |
1199 | |
#ifdef EBCDIC

/* EBCDIC: NL is 0x15 unless EBCDIC_NL25 selects 0x25; NEL gets whichever of
the two values NL does not use. */

#ifdef EBCDIC_NL25
#define CHAR_NL    '\x25'
#define CHAR_NEL   '\x15'
#define STR_NL     "\x25"
#define STR_NEL    "\x15"
#else
#define CHAR_NL    '\x15'
#define CHAR_NEL   '\x25'
#define STR_NL     "\x15"
#define STR_NEL    "\x25"
#endif

/* LF is a synonym for NL. */

#define CHAR_LF    CHAR_NL
#define STR_LF     STR_NL

#define CHAR_ESC   '\047'
#define CHAR_DEL   '\007'
#define CHAR_NBSP  '\x41'
#define STR_ESC    "\047"
#define STR_DEL    "\007"

#else  /* Not EBCDIC */

/* In ASCII/Unicode, linefeed is '\n' and we equate this to NL for
compatibility. NEL is the Unicode newline character; the casts make sure that
NEL and NBSP are positive values. */

#define CHAR_LF    '\n'
#define CHAR_NL    CHAR_LF
#define CHAR_NEL   ((unsigned char)'\x85')
#define CHAR_ESC   '\033'
#define CHAR_DEL   '\177'
#define CHAR_NBSP  ((unsigned char)'\xa0')

#define STR_LF     "\n"
#define STR_NL     STR_LF
#define STR_NEL    "\x85"
#define STR_ESC    "\033"
#define STR_DEL    "\177"

#endif  /* EBCDIC */
1243 | |
1244 | /* The remaining definitions work in both environments. */ |
1245 | |
/* Character constants written as platform character literals, so that each
takes the code of the compiler's execution character set. */

/* Control characters. */

#define CHAR_NULL                 '\0'
#define CHAR_HT                   '\t'
#define CHAR_VT                   '\v'
#define CHAR_FF                   '\f'
#define CHAR_CR                   '\r'
#define CHAR_BS                   '\b'
#define CHAR_BEL                  '\a'

/* Space and punctuation. */

#define CHAR_SPACE                ' '
#define CHAR_EXCLAMATION_MARK     '!'
#define CHAR_QUOTATION_MARK       '"'
#define CHAR_NUMBER_SIGN          '#'
#define CHAR_DOLLAR_SIGN          '$'
#define CHAR_PERCENT_SIGN         '%'
#define CHAR_AMPERSAND            '&'
#define CHAR_APOSTROPHE           '\''
#define CHAR_LEFT_PARENTHESIS     '('
#define CHAR_RIGHT_PARENTHESIS    ')'
#define CHAR_ASTERISK             '*'
#define CHAR_PLUS                 '+'
#define CHAR_COMMA                ','
#define CHAR_MINUS                '-'
#define CHAR_DOT                  '.'
#define CHAR_SLASH                '/'

/* Digits. */

#define CHAR_0                    '0'
#define CHAR_1                    '1'
#define CHAR_2                    '2'
#define CHAR_3                    '3'
#define CHAR_4                    '4'
#define CHAR_5                    '5'
#define CHAR_6                    '6'
#define CHAR_7                    '7'
#define CHAR_8                    '8'
#define CHAR_9                    '9'

/* More punctuation. */

#define CHAR_COLON                ':'
#define CHAR_SEMICOLON            ';'
#define CHAR_LESS_THAN_SIGN       '<'
#define CHAR_EQUALS_SIGN          '='
#define CHAR_GREATER_THAN_SIGN    '>'
#define CHAR_QUESTION_MARK        '?'
#define CHAR_COMMERCIAL_AT        '@'

/* Upper case letters. */

#define CHAR_A                    'A'
#define CHAR_B                    'B'
#define CHAR_C                    'C'
#define CHAR_D                    'D'
#define CHAR_E                    'E'
#define CHAR_F                    'F'
#define CHAR_G                    'G'
#define CHAR_H                    'H'
#define CHAR_I                    'I'
#define CHAR_J                    'J'
#define CHAR_K                    'K'
#define CHAR_L                    'L'
#define CHAR_M                    'M'
#define CHAR_N                    'N'
#define CHAR_O                    'O'
#define CHAR_P                    'P'
#define CHAR_Q                    'Q'
#define CHAR_R                    'R'
#define CHAR_S                    'S'
#define CHAR_T                    'T'
#define CHAR_U                    'U'
#define CHAR_V                    'V'
#define CHAR_W                    'W'
#define CHAR_X                    'X'
#define CHAR_Y                    'Y'
#define CHAR_Z                    'Z'

/* More punctuation. */

#define CHAR_LEFT_SQUARE_BRACKET  '['
#define CHAR_BACKSLASH            '\\'
#define CHAR_RIGHT_SQUARE_BRACKET ']'
#define CHAR_CIRCUMFLEX_ACCENT    '^'
#define CHAR_UNDERSCORE           '_'
#define CHAR_GRAVE_ACCENT         '`'

/* Lower case letters. */

#define CHAR_a                    'a'
#define CHAR_b                    'b'
#define CHAR_c                    'c'
#define CHAR_d                    'd'
#define CHAR_e                    'e'
#define CHAR_f                    'f'
#define CHAR_g                    'g'
#define CHAR_h                    'h'
#define CHAR_i                    'i'
#define CHAR_j                    'j'
#define CHAR_k                    'k'
#define CHAR_l                    'l'
#define CHAR_m                    'm'
#define CHAR_n                    'n'
#define CHAR_o                    'o'
#define CHAR_p                    'p'
#define CHAR_q                    'q'
#define CHAR_r                    'r'
#define CHAR_s                    's'
#define CHAR_t                    't'
#define CHAR_u                    'u'
#define CHAR_v                    'v'
#define CHAR_w                    'w'
#define CHAR_x                    'x'
#define CHAR_y                    'y'
#define CHAR_z                    'z'

/* Remaining punctuation. */

#define CHAR_LEFT_CURLY_BRACKET   '{'
#define CHAR_VERTICAL_LINE        '|'
#define CHAR_RIGHT_CURLY_BRACKET  '}'
#define CHAR_TILDE                '~'
1349 | |
/* One-character string literals that correspond to the character constants,
again using the compiler's execution character set. */

/* Control characters. */

#define STR_HT                   "\t"
#define STR_VT                   "\v"
#define STR_FF                   "\f"
#define STR_CR                   "\r"
#define STR_BS                   "\b"
#define STR_BEL                  "\a"

/* Space and punctuation. */

#define STR_SPACE                " "
#define STR_EXCLAMATION_MARK     "!"
#define STR_QUOTATION_MARK       "\""
#define STR_NUMBER_SIGN          "#"
#define STR_DOLLAR_SIGN          "$"
#define STR_PERCENT_SIGN         "%"
#define STR_AMPERSAND            "&"
#define STR_APOSTROPHE           "'"
#define STR_LEFT_PARENTHESIS     "("
#define STR_RIGHT_PARENTHESIS    ")"
#define STR_ASTERISK             "*"
#define STR_PLUS                 "+"
#define STR_COMMA                ","
#define STR_MINUS                "-"
#define STR_DOT                  "."
#define STR_SLASH                "/"

/* Digits. */

#define STR_0                    "0"
#define STR_1                    "1"
#define STR_2                    "2"
#define STR_3                    "3"
#define STR_4                    "4"
#define STR_5                    "5"
#define STR_6                    "6"
#define STR_7                    "7"
#define STR_8                    "8"
#define STR_9                    "9"

/* More punctuation. */

#define STR_COLON                ":"
#define STR_SEMICOLON            ";"
#define STR_LESS_THAN_SIGN       "<"
#define STR_EQUALS_SIGN          "="
#define STR_GREATER_THAN_SIGN    ">"
#define STR_QUESTION_MARK        "?"
#define STR_COMMERCIAL_AT        "@"

/* Upper case letters. */

#define STR_A                    "A"
#define STR_B                    "B"
#define STR_C                    "C"
#define STR_D                    "D"
#define STR_E                    "E"
#define STR_F                    "F"
#define STR_G                    "G"
#define STR_H                    "H"
#define STR_I                    "I"
#define STR_J                    "J"
#define STR_K                    "K"
#define STR_L                    "L"
#define STR_M                    "M"
#define STR_N                    "N"
#define STR_O                    "O"
#define STR_P                    "P"
#define STR_Q                    "Q"
#define STR_R                    "R"
#define STR_S                    "S"
#define STR_T                    "T"
#define STR_U                    "U"
#define STR_V                    "V"
#define STR_W                    "W"
#define STR_X                    "X"
#define STR_Y                    "Y"
#define STR_Z                    "Z"

/* More punctuation. */

#define STR_LEFT_SQUARE_BRACKET  "["
#define STR_BACKSLASH            "\\"
#define STR_RIGHT_SQUARE_BRACKET "]"
#define STR_CIRCUMFLEX_ACCENT    "^"
#define STR_UNDERSCORE           "_"
#define STR_GRAVE_ACCENT         "`"

/* Lower case letters. */

#define STR_a                    "a"
#define STR_b                    "b"
#define STR_c                    "c"
#define STR_d                    "d"
#define STR_e                    "e"
#define STR_f                    "f"
#define STR_g                    "g"
#define STR_h                    "h"
#define STR_i                    "i"
#define STR_j                    "j"
#define STR_k                    "k"
#define STR_l                    "l"
#define STR_m                    "m"
#define STR_n                    "n"
#define STR_o                    "o"
#define STR_p                    "p"
#define STR_q                    "q"
#define STR_r                    "r"
#define STR_s                    "s"
#define STR_t                    "t"
#define STR_u                    "u"
#define STR_v                    "v"
#define STR_w                    "w"
#define STR_x                    "x"
#define STR_y                    "y"
#define STR_z                    "z"

/* Remaining punctuation. */

#define STR_LEFT_CURLY_BRACKET   "{"
#define STR_VERTICAL_LINE        "|"
#define STR_RIGHT_CURLY_BRACKET  "}"
#define STR_TILDE                "~"
1452 | |
/* Verb names. A name that ends in 0 carries an explicit NUL character in
addition to the terminator that the literal gets anyway. */

#define STRING_ACCEPT0  "ACCEPT\0"
#define STRING_COMMIT0  "COMMIT\0"
#define STRING_F0       "F\0"
#define STRING_FAIL0    "FAIL\0"
#define STRING_MARK0    "MARK\0"
#define STRING_PRUNE0   "PRUNE\0"
#define STRING_SKIP0    "SKIP\0"
#define STRING_THEN     "THEN"

/* Character class names, following the same convention. */

#define STRING_alpha0   "alpha\0"
#define STRING_lower0   "lower\0"
#define STRING_upper0   "upper\0"
#define STRING_alnum0   "alnum\0"
#define STRING_ascii0   "ascii\0"
#define STRING_blank0   "blank\0"
#define STRING_cntrl0   "cntrl\0"
#define STRING_digit0   "digit\0"
#define STRING_graph0   "graph\0"
#define STRING_print0   "print\0"
#define STRING_punct0   "punct\0"
#define STRING_space0   "space\0"
#define STRING_word0    "word\0"
#define STRING_xdigit   "xdigit"
1476 | |
/* Miscellaneous pattern keywords. */

#define STRING_DEFINE                    "DEFINE"
#define STRING_WEIRD_STARTWORD           "[:<:]]"
#define STRING_WEIRD_ENDWORD             "[:>:]]"

/* Option names, each including the closing parenthesis or the equals sign
that follows the name. */

#define STRING_CR_RIGHTPAR               "CR)"
#define STRING_LF_RIGHTPAR               "LF)"
#define STRING_CRLF_RIGHTPAR             "CRLF)"
#define STRING_ANY_RIGHTPAR              "ANY)"
#define STRING_ANYCRLF_RIGHTPAR          "ANYCRLF)"
#define STRING_BSR_ANYCRLF_RIGHTPAR      "BSR_ANYCRLF)"
#define STRING_BSR_UNICODE_RIGHTPAR      "BSR_UNICODE)"
#define STRING_UTF8_RIGHTPAR             "UTF8)"
#define STRING_UTF16_RIGHTPAR            "UTF16)"
#define STRING_UTF32_RIGHTPAR            "UTF32)"
#define STRING_UTF_RIGHTPAR              "UTF)"
#define STRING_UCP_RIGHTPAR              "UCP)"
#define STRING_NO_AUTO_POSSESS_RIGHTPAR  "NO_AUTO_POSSESS)"
#define STRING_NO_START_OPT_RIGHTPAR     "NO_START_OPT)"
#define STRING_LIMIT_MATCH_EQ            "LIMIT_MATCH="
#define STRING_LIMIT_RECURSION_EQ        "LIMIT_RECURSION="
1497 | |
1498 | #else /* SUPPORT_UTF */ |
1499 | |
/* UTF-8 support is enabled; always use UTF-8 (=ASCII) character codes. This
works in both modes on non-EBCDIC platforms, and on EBCDIC platforms in UTF-8
mode only. */
1503 | |
/* Character constants written as octal escapes, so that each has the value
given here whatever the compiler's execution character set is. */

/* Control characters. */

#define CHAR_HT                   '\011'
#define CHAR_VT                   '\013'
#define CHAR_FF                   '\014'
#define CHAR_CR                   '\015'
#define CHAR_LF                   '\012'
#define CHAR_NL                   CHAR_LF
#define CHAR_NEL                  ((unsigned char)'\x85')
#define CHAR_BS                   '\010'
#define CHAR_BEL                  '\007'
#define CHAR_ESC                  '\033'
#define CHAR_DEL                  '\177'

#define CHAR_NULL                 '\0'

/* Space and punctuation. */

#define CHAR_SPACE                '\040'
#define CHAR_EXCLAMATION_MARK     '\041'
#define CHAR_QUOTATION_MARK       '\042'
#define CHAR_NUMBER_SIGN          '\043'
#define CHAR_DOLLAR_SIGN          '\044'
#define CHAR_PERCENT_SIGN         '\045'
#define CHAR_AMPERSAND            '\046'
#define CHAR_APOSTROPHE           '\047'
#define CHAR_LEFT_PARENTHESIS     '\050'
#define CHAR_RIGHT_PARENTHESIS    '\051'
#define CHAR_ASTERISK             '\052'
#define CHAR_PLUS                 '\053'
#define CHAR_COMMA                '\054'
#define CHAR_MINUS                '\055'
#define CHAR_DOT                  '\056'
#define CHAR_SLASH                '\057'

/* Digits. */

#define CHAR_0                    '\060'
#define CHAR_1                    '\061'
#define CHAR_2                    '\062'
#define CHAR_3                    '\063'
#define CHAR_4                    '\064'
#define CHAR_5                    '\065'
#define CHAR_6                    '\066'
#define CHAR_7                    '\067'
#define CHAR_8                    '\070'
#define CHAR_9                    '\071'

/* More punctuation. */

#define CHAR_COLON                '\072'
#define CHAR_SEMICOLON            '\073'
#define CHAR_LESS_THAN_SIGN       '\074'
#define CHAR_EQUALS_SIGN          '\075'
#define CHAR_GREATER_THAN_SIGN    '\076'
#define CHAR_QUESTION_MARK        '\077'
#define CHAR_COMMERCIAL_AT        '\100'

/* Upper case letters. */

#define CHAR_A                    '\101'
#define CHAR_B                    '\102'
#define CHAR_C                    '\103'
#define CHAR_D                    '\104'
#define CHAR_E                    '\105'
#define CHAR_F                    '\106'
#define CHAR_G                    '\107'
#define CHAR_H                    '\110'
#define CHAR_I                    '\111'
#define CHAR_J                    '\112'
#define CHAR_K                    '\113'
#define CHAR_L                    '\114'
#define CHAR_M                    '\115'
#define CHAR_N                    '\116'
#define CHAR_O                    '\117'
#define CHAR_P                    '\120'
#define CHAR_Q                    '\121'
#define CHAR_R                    '\122'
#define CHAR_S                    '\123'
#define CHAR_T                    '\124'
#define CHAR_U                    '\125'
#define CHAR_V                    '\126'
#define CHAR_W                    '\127'
#define CHAR_X                    '\130'
#define CHAR_Y                    '\131'
#define CHAR_Z                    '\132'

/* More punctuation. */

#define CHAR_LEFT_SQUARE_BRACKET  '\133'
#define CHAR_BACKSLASH            '\134'
#define CHAR_RIGHT_SQUARE_BRACKET '\135'
#define CHAR_CIRCUMFLEX_ACCENT    '\136'
#define CHAR_UNDERSCORE           '\137'
#define CHAR_GRAVE_ACCENT         '\140'

/* Lower case letters. */

#define CHAR_a                    '\141'
#define CHAR_b                    '\142'
#define CHAR_c                    '\143'
#define CHAR_d                    '\144'
#define CHAR_e                    '\145'
#define CHAR_f                    '\146'
#define CHAR_g                    '\147'
#define CHAR_h                    '\150'
#define CHAR_i                    '\151'
#define CHAR_j                    '\152'
#define CHAR_k                    '\153'
#define CHAR_l                    '\154'
#define CHAR_m                    '\155'
#define CHAR_n                    '\156'
#define CHAR_o                    '\157'
#define CHAR_p                    '\160'
#define CHAR_q                    '\161'
#define CHAR_r                    '\162'
#define CHAR_s                    '\163'
#define CHAR_t                    '\164'
#define CHAR_u                    '\165'
#define CHAR_v                    '\166'
#define CHAR_w                    '\167'
#define CHAR_x                    '\170'
#define CHAR_y                    '\171'
#define CHAR_z                    '\172'

/* Remaining punctuation, and the no-break space, which is cast so that its
value is positive. */

#define CHAR_LEFT_CURLY_BRACKET   '\173'
#define CHAR_VERTICAL_LINE        '\174'
#define CHAR_RIGHT_CURLY_BRACKET  '\175'
#define CHAR_TILDE                '\176'
#define CHAR_NBSP                 ((unsigned char)'\xa0')
1613 | |
/* One-character string literals that correspond to the character constants,
written as octal escapes so that they do not depend on the compiler's
execution character set. */

/* Control characters. */

#define STR_HT                   "\011"
#define STR_VT                   "\013"
#define STR_FF                   "\014"
#define STR_CR                   "\015"
#define STR_NL                   "\012"
#define STR_BS                   "\010"
#define STR_BEL                  "\007"
#define STR_ESC                  "\033"
#define STR_DEL                  "\177"

/* Space and punctuation. */

#define STR_SPACE                "\040"
#define STR_EXCLAMATION_MARK     "\041"
#define STR_QUOTATION_MARK       "\042"
#define STR_NUMBER_SIGN          "\043"
#define STR_DOLLAR_SIGN          "\044"
#define STR_PERCENT_SIGN         "\045"
#define STR_AMPERSAND            "\046"
#define STR_APOSTROPHE           "\047"
#define STR_LEFT_PARENTHESIS     "\050"
#define STR_RIGHT_PARENTHESIS    "\051"
#define STR_ASTERISK             "\052"
#define STR_PLUS                 "\053"
#define STR_COMMA                "\054"
#define STR_MINUS                "\055"
#define STR_DOT                  "\056"
#define STR_SLASH                "\057"

/* Digits. */

#define STR_0                    "\060"
#define STR_1                    "\061"
#define STR_2                    "\062"
#define STR_3                    "\063"
#define STR_4                    "\064"
#define STR_5                    "\065"
#define STR_6                    "\066"
#define STR_7                    "\067"
#define STR_8                    "\070"
#define STR_9                    "\071"

/* More punctuation. */

#define STR_COLON                "\072"
#define STR_SEMICOLON            "\073"
#define STR_LESS_THAN_SIGN       "\074"
#define STR_EQUALS_SIGN          "\075"
#define STR_GREATER_THAN_SIGN    "\076"
#define STR_QUESTION_MARK        "\077"
#define STR_COMMERCIAL_AT        "\100"

/* Upper case letters. */

#define STR_A                    "\101"
#define STR_B                    "\102"
#define STR_C                    "\103"
#define STR_D                    "\104"
#define STR_E                    "\105"
#define STR_F                    "\106"
#define STR_G                    "\107"
#define STR_H                    "\110"
#define STR_I                    "\111"
#define STR_J                    "\112"
#define STR_K                    "\113"
#define STR_L                    "\114"
#define STR_M                    "\115"
#define STR_N                    "\116"
#define STR_O                    "\117"
#define STR_P                    "\120"
#define STR_Q                    "\121"
#define STR_R                    "\122"
#define STR_S                    "\123"
#define STR_T                    "\124"
#define STR_U                    "\125"
#define STR_V                    "\126"
#define STR_W                    "\127"
#define STR_X                    "\130"
#define STR_Y                    "\131"
#define STR_Z                    "\132"

/* More punctuation. */

#define STR_LEFT_SQUARE_BRACKET  "\133"
#define STR_BACKSLASH            "\134"
#define STR_RIGHT_SQUARE_BRACKET "\135"
#define STR_CIRCUMFLEX_ACCENT    "\136"
#define STR_UNDERSCORE           "\137"
#define STR_GRAVE_ACCENT         "\140"

/* Lower case letters. */

#define STR_a                    "\141"
#define STR_b                    "\142"
#define STR_c                    "\143"
#define STR_d                    "\144"
#define STR_e                    "\145"
#define STR_f                    "\146"
#define STR_g                    "\147"
#define STR_h                    "\150"
#define STR_i                    "\151"
#define STR_j                    "\152"
#define STR_k                    "\153"
#define STR_l                    "\154"
#define STR_m                    "\155"
#define STR_n                    "\156"
#define STR_o                    "\157"
#define STR_p                    "\160"
#define STR_q                    "\161"
#define STR_r                    "\162"
#define STR_s                    "\163"
#define STR_t                    "\164"
#define STR_u                    "\165"
#define STR_v                    "\166"
#define STR_w                    "\167"
#define STR_x                    "\170"
#define STR_y                    "\171"
#define STR_z                    "\172"

/* Remaining punctuation. */

#define STR_LEFT_CURLY_BRACKET   "\173"
#define STR_VERTICAL_LINE        "\174"
#define STR_RIGHT_CURLY_BRACKET  "\175"
#define STR_TILDE                "\176"
1719 | |
1720 | #define STRING_ACCEPT0 STR_A STR_C STR_C STR_E STR_P STR_T "\0" /* "0" suffix: an explicit NUL byte follows the name */ |
1721 | #define STRING_COMMIT0 STR_C STR_O STR_M STR_M STR_I STR_T "\0" |
1722 | #define STRING_F0 STR_F "\0" |
1723 | #define STRING_FAIL0 STR_F STR_A STR_I STR_L "\0" |
1724 | #define STRING_MARK0 STR_M STR_A STR_R STR_K "\0" |
1725 | #define STRING_PRUNE0 STR_P STR_R STR_U STR_N STR_E "\0" |
1726 | #define STRING_SKIP0 STR_S STR_K STR_I STR_P "\0" |
1727 | #define STRING_THEN STR_T STR_H STR_E STR_N /* No explicit NUL here; presumably the last name when these are concatenated - confirm at the use site */ |
1728 | |
1729 | #define STRING_alpha0 STR_a STR_l STR_p STR_h STR_a "\0" /* "0" suffix: an explicit NUL byte follows the name */ |
1730 | #define STRING_lower0 STR_l STR_o STR_w STR_e STR_r "\0" |
1731 | #define STRING_upper0 STR_u STR_p STR_p STR_e STR_r "\0" |
1732 | #define STRING_alnum0 STR_a STR_l STR_n STR_u STR_m "\0" |
1733 | #define STRING_ascii0 STR_a STR_s STR_c STR_i STR_i "\0" |
1734 | #define STRING_blank0 STR_b STR_l STR_a STR_n STR_k "\0" |
1735 | #define STRING_cntrl0 STR_c STR_n STR_t STR_r STR_l "\0" |
1736 | #define STRING_digit0 STR_d STR_i STR_g STR_i STR_t "\0" |
1737 | #define STRING_graph0 STR_g STR_r STR_a STR_p STR_h "\0" |
1738 | #define STRING_print0 STR_p STR_r STR_i STR_n STR_t "\0" |
1739 | #define STRING_punct0 STR_p STR_u STR_n STR_c STR_t "\0" |
1740 | #define STRING_space0 STR_s STR_p STR_a STR_c STR_e "\0" |
1741 | #define STRING_word0 STR_w STR_o STR_r STR_d "\0" |
1742 | #define STRING_xdigit STR_x STR_d STR_i STR_g STR_i STR_t /* No explicit NUL here; presumably the last name when these are concatenated - confirm at the use site */ |
1743 | |
1744 | #define STRING_DEFINE STR_D STR_E STR_F STR_I STR_N STR_E /* Spells "DEFINE" */ |
1745 | #define STRING_WEIRD_STARTWORD STR_LEFT_SQUARE_BRACKET STR_COLON STR_LESS_THAN_SIGN STR_COLON STR_RIGHT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET /* Spells "[:<:]]": one opening bracket, two closing */ |
1746 | #define STRING_WEIRD_ENDWORD STR_LEFT_SQUARE_BRACKET STR_COLON STR_GREATER_THAN_SIGN STR_COLON STR_RIGHT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET /* Spells "[:>:]]": one opening bracket, two closing */ |
1747 | |
1748 | #define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS /* Each _RIGHTPAR string is a name followed by ")" */ |
1749 | #define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS |
1750 | #define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS |
1751 | #define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS |
1752 | #define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS |
1753 | #define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS |
1754 | #define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS |
1755 | #define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS |
1756 | #define STRING_UTF16_RIGHTPAR STR_U STR_T STR_F STR_1 STR_6 STR_RIGHT_PARENTHESIS |
1757 | #define STRING_UTF32_RIGHTPAR STR_U STR_T STR_F STR_3 STR_2 STR_RIGHT_PARENTHESIS |
1758 | #define STRING_UTF_RIGHTPAR STR_U STR_T STR_F STR_RIGHT_PARENTHESIS |
1759 | #define STRING_UCP_RIGHTPAR STR_U STR_C STR_P STR_RIGHT_PARENTHESIS |
1760 | #define STRING_NO_AUTO_POSSESS_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_A STR_U STR_T STR_O STR_UNDERSCORE STR_P STR_O STR_S STR_S STR_E STR_S STR_S STR_RIGHT_PARENTHESIS |
1761 | #define STRING_NO_START_OPT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS |
1762 | #define STRING_LIMIT_MATCH_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_M STR_A STR_T STR_C STR_H STR_EQUALS_SIGN /* The _EQ strings end in "=" instead of ")" */ |
1763 | #define STRING_LIMIT_RECURSION_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_R STR_E STR_C STR_U STR_R STR_S STR_I STR_O STR_N STR_EQUALS_SIGN |
1764 | |
1765 | #endif /* SUPPORT_UTF */ |
1766 | |
1767 | /* Escape items that are just an encoding of a particular data value. */ |
1768 | |
1769 | #ifndef ESC_a |
1770 | #define ESC_a CHAR_BEL /* Default only: any earlier definition of ESC_a is kept */ |
1771 | #endif |
1772 | |
1773 | #ifndef ESC_e |
1774 | #define ESC_e CHAR_ESC /* Default only, as for ESC_a */ |
1775 | #endif |
1776 | |
1777 | #ifndef ESC_f |
1778 | #define ESC_f CHAR_FF /* Default only, as for ESC_a */ |
1779 | #endif |
1780 | |
1781 | #ifndef ESC_n |
1782 | #define ESC_n CHAR_LF /* Default only, as for ESC_a */ |
1783 | #endif |
1784 | |
1785 | #ifndef ESC_r |
1786 | #define ESC_r CHAR_CR /* Default only, as for ESC_a */ |
1787 | #endif |
1788 | |
1789 | /* We can't officially use ESC_t because it is a POSIX reserved identifier |
1790 | (presumably because of all the others like size_t), so the name ESC_tee is used. */ |
1791 | |
1792 | #ifndef ESC_tee |
1793 | #define ESC_tee CHAR_HT /* Default only, as for ESC_a */ |
1794 | #endif |
1795 | |
1796 | /* Codes for different types of Unicode property */ |
1797 | |
1798 | #define PT_ANY 0 /* Any property - matches all chars */ |
1799 | #define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */ |
1800 | #define PT_GC 2 /* Specified general characteristic (e.g. L) */ |
1801 | #define PT_PC 3 /* Specified particular characteristic (e.g. Lu) */ |
1802 | #define PT_SC 4 /* Script (e.g. Han) */ |
1803 | #define PT_ALNUM 5 /* Alphanumeric - the union of L and N */ |
1804 | #define PT_SPACE 6 /* Perl space - Z plus 9,10,12,13 */ |
1805 | #define PT_PXSPACE 7 /* POSIX space - Z plus 9,10,11,12,13 */ |
1806 | #define PT_WORD 8 /* Word - L plus N plus underscore */ |
1807 | #define PT_CLIST 9 /* Pseudo-property: match character list */ |
1808 | #define PT_UCNC 10 /* Universal Character nameable character */ |
1809 | #define PT_TABSIZE 11 /* Size of square table for autopossessify tests; covers codes 0-10 above */ |
1810 | |
1811 | /* The following special properties are used only in XCLASS items, when POSIX |
1812 | classes are specified and PCRE_UCP is set - in other words, for Unicode |
1813 | handling of these classes. They are not available via the \p or \P escapes like |
1814 | those in the above list, and so they do not take part in the autopossessifying |
1815 | table. */ |
1816 | |
1817 | #define PT_PXGRAPH 11 /* [:graph:] - characters that mark the paper; numbering continues after PT_UCNC, so this equals PT_TABSIZE */ |
1818 | #define PT_PXPRINT 12 /* [:print:] - [:graph:] plus non-control spaces */ |
1819 | #define PT_PXPUNCT 13 /* [:punct:] - punctuation characters */ |
1820 | |
1821 | /* Flag bits and data types for the extended class (OP_XCLASS) for classes that |
1822 | contain characters with values greater than 255. */ |
1823 | |
1824 | #define XCL_NOT 0x01 /* Flag: this is a negative class */ |
1825 | #define XCL_MAP 0x02 /* Flag: a 32-byte map is present */ |
1826 | #define XCL_HASPROP 0x04 /* Flag: property checks are present */ |
1827 | |
1828 | #define XCL_END 0 /* Item type: marks end of individual items */ |
1829 | #define XCL_SINGLE 1 /* Item type: single item (one multibyte char) follows */ |
1830 | #define XCL_RANGE 2 /* Item type: a range (two multibyte chars) follows */ |
1831 | #define XCL_PROP 3 /* Item type: Unicode property (2-byte property code follows) */ |
1832 | #define XCL_NOTPROP 4 /* Item type: Unicode inverted property (ditto) */ |
1833 | |
1834 | /* These are escaped items that aren't just an encoding of a particular data |
1835 | value such as \n. They must have non-zero values, as check_escape() returns 0 |
1836 | for a data character. Also, they must appear in the same order as in the |
1837 | opcode definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it |
1838 | corresponds to "." in DOTALL mode rather than an escape sequence. It is also |
1839 | used for [^] in JavaScript compatibility mode, and for \C in non-utf mode. In |
1840 | non-DOTALL mode, "." behaves like \N. |
1841 | |
1842 | The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc. |
1843 | when PCRE_UCP is set and replacement of \d etc by \p sequences is required. |
1844 | They must be contiguous, and remain in order so that the replacements can be |
1845 | looked up from a table. |
1846 | |
1847 | Negative numbers are used to encode a backreference (\1, \2, \3, etc.) in |
1848 | check_escape(). There are two tests in the code for an escape |
1849 | greater than ESC_b and less than ESC_Z to detect the types that may be |
1850 | repeated. These are the types that consume characters. If any new escapes are |
1851 | put in between that don't consume a character, that code will have to change. |
1852 | */ |
1853 | |
1854 | enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, /* Values start at 1 because 0 means a data character */ |
1855 | ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H, /* ESC_dum (13) is the dummy that lines up with OP_ALLANY */ |
1856 | ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, /* ESC_z (24) is the last value that must line up with an opcode (OP_EOD) */ |
1857 | ESC_E, ESC_Q, ESC_g, ESC_k, |
1858 | ESC_DU, ESC_du, ESC_SU, ESC_su, ESC_WU, ESC_wu }; /* Used when PCRE_UCP is set; must stay contiguous and in this order */ |
1859 | |
1860 | |
1861 | /********************** Opcode definitions ******************/ |
1862 | |
1863 | /****** NOTE NOTE NOTE ****** |
1864 | |
1865 | Starting from 1 (i.e. after OP_END), the values up to OP_EOD must correspond in |
1866 | order to the list of escapes immediately above. Furthermore, values up to |
1867 | OP_DOLLM must not be changed without adjusting the table called autoposstab in |
1868 | pcre_compile.c |
1869 | |
1870 | Whenever this list is updated, the two macro definitions that follow must be |
1871 | updated to match. The possessification table called "opcode_possessify" in |
1872 | pcre_compile.c must also be updated, and also the tables called "coptable" |
1873 | and "poptable" in pcre_dfa_exec.c. |
1874 | |
1875 | ****** NOTE NOTE NOTE ******/ |
1876 | |
1877 | |
1878 | /* The values between FIRST_AUTOTAB_OP and LAST_AUTOTAB_RIGHT_OP, inclusive, |
1879 | are used in a table for deciding whether a repeated character type can be |
1880 | auto-possessified. */ |
1881 | |
1882 | #define FIRST_AUTOTAB_OP OP_NOT_DIGIT /* \D */ |
1883 | #define LAST_AUTOTAB_LEFT_OP OP_EXTUNI /* \X */ |
1884 | #define LAST_AUTOTAB_RIGHT_OP OP_DOLLM /* End of line - multiline */ |
1885 | |
1886 | enum { |
1887 | OP_END, /* 0 End of pattern */ |
1888 | |
1889 | /* Values corresponding to backslashed metacharacters */ |
1890 | |
1891 | OP_SOD, /* 1 Start of data: \A */ |
1892 | OP_SOM, /* 2 Start of match (subject + offset): \G */ |
1893 | OP_SET_SOM, /* 3 Set start of match (\K) */ |
1894 | OP_NOT_WORD_BOUNDARY, /* 4 \B */ |
1895 | OP_WORD_BOUNDARY, /* 5 \b */ |
1896 | OP_NOT_DIGIT, /* 6 \D */ |
1897 | OP_DIGIT, /* 7 \d */ |
1898 | OP_NOT_WHITESPACE, /* 8 \S */ |
1899 | OP_WHITESPACE, /* 9 \s */ |
1900 | OP_NOT_WORDCHAR, /* 10 \W */ |
1901 | OP_WORDCHAR, /* 11 \w */ |
1902 | |
1903 | OP_ANY, /* 12 Match any character except newline (\N) */ |
1904 | OP_ALLANY, /* 13 Match any character */ |
1905 | OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */ |
1906 | OP_NOTPROP, /* 15 \P (not Unicode property) */ |
1907 | OP_PROP, /* 16 \p (Unicode property) */ |
1908 | OP_ANYNL, /* 17 \R (any newline sequence) */ |
1909 | OP_NOT_HSPACE, /* 18 \H (not horizontal whitespace) */ |
1910 | OP_HSPACE, /* 19 \h (horizontal whitespace) */ |
1911 | OP_NOT_VSPACE, /* 20 \V (not vertical whitespace) */ |
1912 | OP_VSPACE, /* 21 \v (vertical whitespace) */ |
1913 | OP_EXTUNI, /* 22 \X (extended Unicode sequence) */ |
1914 | OP_EODN, /* 23 End of data or \n at end of data (\Z) */ |
1915 | OP_EOD, /* 24 End of data (\z) */ |
1916 | |
1917 | /* Line end assertions */ |
1918 | |
1919 | OP_DOLL, /* 25 End of line - not multiline */ |
1920 | OP_DOLLM, /* 26 End of line - multiline */ |
1921 | OP_CIRC, /* 27 Start of line - not multiline */ |
1922 | OP_CIRCM, /* 28 Start of line - multiline */ |
1923 | |
1924 | /* Single characters; caseful must precede the caseless ones */ |
1925 | |
1926 | OP_CHAR, /* 29 Match one character, casefully */ |
1927 | OP_CHARI, /* 30 Match one character, caselessly */ |
1928 | OP_NOT, /* 31 Match one character, not the given one, casefully */ |
1929 | OP_NOTI, /* 32 Match one character, not the given one, caselessly */ |
1930 | |
1931 | /* The following sets of 13 opcodes must always be kept in step because |
1932 | the offset from the first one is used to generate the others. */ |
1933 | |
1934 | /* Repeated characters; caseful must precede the caseless ones */ |
1935 | |
1936 | OP_STAR, /* 33 The maximizing and minimizing versions of */ |
1937 | OP_MINSTAR, /* 34 these six opcodes must come in pairs, with */ |
1938 | OP_PLUS, /* 35 the minimizing one second. */ |
1939 | OP_MINPLUS, /* 36 */ |
1940 | OP_QUERY, /* 37 */ |
1941 | OP_MINQUERY, /* 38 */ |
1942 | |
1943 | OP_UPTO, /* 39 From 0 to n matches of one character, caseful */ |
1944 | OP_MINUPTO, /* 40 */ |
1945 | OP_EXACT, /* 41 Exactly n matches */ |
1946 | |
1947 | OP_POSSTAR, /* 42 Possessified star, caseful */ |
1948 | OP_POSPLUS, /* 43 Possessified plus, caseful */ |
1949 | OP_POSQUERY, /* 44 Possessified query, caseful */ |
1950 | OP_POSUPTO, /* 45 Possessified upto, caseful */ |
1951 | |
1952 | /* Repeated characters; caseless must follow the caseful ones */ |
1953 | |
1954 | OP_STARI, /* 46 */ |
1955 | OP_MINSTARI, /* 47 */ |
1956 | OP_PLUSI, /* 48 */ |
1957 | OP_MINPLUSI, /* 49 */ |
1958 | OP_QUERYI, /* 50 */ |
1959 | OP_MINQUERYI, /* 51 */ |
1960 | |
1961 | OP_UPTOI, /* 52 From 0 to n matches of one character, caseless */ |
1962 | OP_MINUPTOI, /* 53 */ |
1963 | OP_EXACTI, /* 54 */ |
1964 | |
1965 | OP_POSSTARI, /* 55 Possessified star, caseless */ |
1966 | OP_POSPLUSI, /* 56 Possessified plus, caseless */ |
1967 | OP_POSQUERYI, /* 57 Possessified query, caseless */ |
1968 | OP_POSUPTOI, /* 58 Possessified upto, caseless */ |
1969 | |
1970 | /* The negated ones must follow the non-negated ones, and match them */ |
1971 | /* Negated repeated character, caseful; must precede the caseless ones */ |
1972 | |
1973 | OP_NOTSTAR, /* 59 The maximizing and minimizing versions of */ |
1974 | OP_NOTMINSTAR, /* 60 these six opcodes must come in pairs, with */ |
1975 | OP_NOTPLUS, /* 61 the minimizing one second. They must be in */ |
1976 | OP_NOTMINPLUS, /* 62 exactly the same order as those above. */ |
1977 | OP_NOTQUERY, /* 63 */ |
1978 | OP_NOTMINQUERY, /* 64 */ |
1979 | |
1980 | OP_NOTUPTO, /* 65 From 0 to n matches, caseful */ |
1981 | OP_NOTMINUPTO, /* 66 */ |
1982 | OP_NOTEXACT, /* 67 Exactly n matches */ |
1983 | |
1984 | OP_NOTPOSSTAR, /* 68 Possessified versions, caseful */ |
1985 | OP_NOTPOSPLUS, /* 69 */ |
1986 | OP_NOTPOSQUERY, /* 70 */ |
1987 | OP_NOTPOSUPTO, /* 71 */ |
1988 | |
1989 | /* Negated repeated character, caseless; must follow the caseful ones */ |
1990 | |
1991 | OP_NOTSTARI, /* 72 */ |
1992 | OP_NOTMINSTARI, /* 73 */ |
1993 | OP_NOTPLUSI, /* 74 */ |
1994 | OP_NOTMINPLUSI, /* 75 */ |
1995 | OP_NOTQUERYI, /* 76 */ |
1996 | OP_NOTMINQUERYI, /* 77 */ |
1997 | |
1998 | OP_NOTUPTOI, /* 78 From 0 to n matches, caseless */ |
1999 | OP_NOTMINUPTOI, /* 79 */ |
2000 | OP_NOTEXACTI, /* 80 Exactly n matches */ |
2001 | |
2002 | OP_NOTPOSSTARI, /* 81 Possessified versions, caseless */ |
2003 | OP_NOTPOSPLUSI, /* 82 */ |
2004 | OP_NOTPOSQUERYI, /* 83 */ |
2005 | OP_NOTPOSUPTOI, /* 84 */ |
2006 | |
2007 | /* Character types */ |
2008 | |
2009 | OP_TYPESTAR, /* 85 The maximizing and minimizing versions of */ |
2010 | OP_TYPEMINSTAR, /* 86 these six opcodes must come in pairs, with */ |
2011 | OP_TYPEPLUS, /* 87 the minimizing one second. These codes must */ |
2012 | OP_TYPEMINPLUS, /* 88 be in exactly the same order as those above. */ |
2013 | OP_TYPEQUERY, /* 89 */ |
2014 | OP_TYPEMINQUERY, /* 90 */ |
2015 | |
2016 | OP_TYPEUPTO, /* 91 From 0 to n matches */ |
2017 | OP_TYPEMINUPTO, /* 92 */ |
2018 | OP_TYPEEXACT, /* 93 Exactly n matches */ |
2019 | |
2020 | OP_TYPEPOSSTAR, /* 94 Possessified versions */ |
2021 | OP_TYPEPOSPLUS, /* 95 */ |
2022 | OP_TYPEPOSQUERY, /* 96 */ |
2023 | OP_TYPEPOSUPTO, /* 97 */ |
2024 | |
2025 | /* These are used for character classes and back references; only the |
2026 | first six are the same as the sets above. */ |
2027 | |
2028 | OP_CRSTAR, /* 98 The maximizing and minimizing versions of */ |
2029 | OP_CRMINSTAR, /* 99 all these opcodes must come in pairs, with */ |
2030 | OP_CRPLUS, /* 100 the minimizing one second. These codes must */ |
2031 | OP_CRMINPLUS, /* 101 be in exactly the same order as those above. */ |
2032 | OP_CRQUERY, /* 102 */ |
2033 | OP_CRMINQUERY, /* 103 */ |
2034 | |
2035 | OP_CRRANGE, /* 104 These are different to the three sets above. */ |
2036 | OP_CRMINRANGE, /* 105 */ |
2037 | |
2038 | OP_CRPOSSTAR, /* 106 Possessified versions */ |
2039 | OP_CRPOSPLUS, /* 107 */ |
2040 | OP_CRPOSQUERY, /* 108 */ |
2041 | OP_CRPOSRANGE, /* 109 */ |
2042 | |
2043 | /* End of quantifier opcodes */ |
2044 | |
2045 | OP_CLASS, /* 110 Match a character class, chars < 256 only */ |
2046 | OP_NCLASS, /* 111 Same, but the bitmap was created from a negative |
2047 | class - the difference is relevant only when a |
2048 | character > 255 is encountered. */ |
2049 | OP_XCLASS, /* 112 Extended class for handling > 255 chars within the |
2050 | class. This does both positive and negative. */ |
2051 | OP_REF, /* 113 Match a back reference, casefully */ |
2052 | OP_REFI, /* 114 Match a back reference, caselessly */ |
2053 | OP_DNREF, /* 115 Match a duplicate name backref, casefully */ |
2054 | OP_DNREFI, /* 116 Match a duplicate name backref, caselessly */ |
2055 | OP_RECURSE, /* 117 Match a numbered subpattern (possibly recursive) */ |
2056 | OP_CALLOUT, /* 118 Call out to external function if provided */ |
2057 | |
2058 | OP_ALT, /* 119 Start of alternation */ |
2059 | OP_KET, /* 120 End of group that doesn't have an unbounded repeat */ |
2060 | OP_KETRMAX, /* 121 These two must remain together and in this */ |
2061 | OP_KETRMIN, /* 122 order. They are for groups that repeat for ever. */ |
2062 | OP_KETRPOS, /* 123 Possessive unlimited repeat. */ |
2063 | |
2064 | /* The assertions must come before BRA, CBRA, ONCE, and COND, and the four |
2065 | asserts must remain in order. */ |
2066 | |
2067 | OP_REVERSE, /* 124 Move pointer back - used in lookbehind assertions */ |
2068 | OP_ASSERT, /* 125 Positive lookahead */ |
2069 | OP_ASSERT_NOT, /* 126 Negative lookahead */ |
2070 | OP_ASSERTBACK, /* 127 Positive lookbehind */ |
2071 | OP_ASSERTBACK_NOT, /* 128 Negative lookbehind */ |
2072 | |
2073 | /* ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come immediately |
2074 | after the assertions, with ONCE first, as there's a test for >= ONCE for a |
2075 | subpattern that isn't an assertion. The POS versions must immediately follow |
2076 | the non-POS versions in each case. */ |
2077 | |
2078 | OP_ONCE, /* 129 Atomic group, contains captures */ |
2079 | OP_ONCE_NC, /* 130 Atomic group containing no captures */ |
2080 | OP_BRA, /* 131 Start of non-capturing bracket */ |
2081 | OP_BRAPOS, /* 132 Ditto, with unlimited, possessive repeat */ |
2082 | OP_CBRA, /* 133 Start of capturing bracket */ |
2083 | OP_CBRAPOS, /* 134 Ditto, with unlimited, possessive repeat */ |
2084 | OP_COND, /* 135 Conditional group */ |
2085 | |
2086 | /* These five must follow the previous five (OP_BRA to OP_COND), in the same |
2087 | order. There's a check for >= SBRA to distinguish the two sets. */ |
2088 | |
2089 | OP_SBRA, /* 136 Start of non-capturing bracket, check empty */ |
2090 | OP_SBRAPOS, /* 137 Ditto, with unlimited, possessive repeat */ |
2091 | OP_SCBRA, /* 138 Start of capturing bracket, check empty */ |
2092 | OP_SCBRAPOS, /* 139 Ditto, with unlimited, possessive repeat */ |
2093 | OP_SCOND, /* 140 Conditional group, check empty */ |
2094 | |
2095 | /* The next two pairs must (respectively) be kept together. */ |
2096 | |
2097 | OP_CREF, /* 141 Used to hold a capture number as condition */ |
2098 | OP_DNCREF, /* 142 Used to point to duplicate names as a condition */ |
2099 | OP_RREF, /* 143 Used to hold a recursion number as condition */ |
2100 | OP_DNRREF, /* 144 Used to point to duplicate names as a condition */ |
2101 | OP_DEF, /* 145 The DEFINE condition */ |
2102 | |
2103 | OP_BRAZERO, /* 146 These two must remain together and in this */ |
2104 | OP_BRAMINZERO, /* 147 order. */ |
2105 | OP_BRAPOSZERO, /* 148 */ |
2106 | |
2107 | /* These are backtracking control verbs */ |
2108 | |
2109 | OP_MARK, /* 149 always has an argument */ |
2110 | OP_PRUNE, /* 150 */ |
2111 | OP_PRUNE_ARG, /* 151 same, but with argument */ |
2112 | OP_SKIP, /* 152 */ |
2113 | OP_SKIP_ARG, /* 153 same, but with argument */ |
2114 | OP_THEN, /* 154 */ |
2115 | OP_THEN_ARG, /* 155 same, but with argument */ |
2116 | OP_COMMIT, /* 156 */ |
2117 | |
2118 | /* These are forced failure and success verbs */ |
2119 | |
2120 | OP_FAIL, /* 157 */ |
2121 | OP_ACCEPT, /* 158 */ |
2122 | OP_ASSERT_ACCEPT, /* 159 Used inside assertions */ |
2123 | OP_CLOSE, /* 160 Used before OP_ACCEPT to close open captures */ |
2124 | |
2125 | /* This is used to skip a subpattern with a {0} quantifier */ |
2126 | |
2127 | OP_SKIPZERO, /* 161 */ |
2128 | |
2129 | /* This is not an opcode, but is used to check that tables indexed by opcode |
2130 | are the correct length, in order to catch updating errors - there have been |
2131 | some in the past. Its value is the number of opcodes above (162). */ |
2132 | |
2133 | OP_TABLE_LENGTH |
2134 | }; |
2135 | |
2136 | /* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro |
2137 | definitions that follow must also be updated to match. There are also tables |
2138 | called "opcode_possessify" in pcre_compile.c and "coptable" and "poptable" in |
2139 | pcre_dfa_exec.c that must be updated. */ |
2140 | |
2141 | |
2142 | /* This macro defines textual names for all the opcodes. These are used only |
2143 | for debugging, and some of them are only partial names. The macro is referenced |
2144 | only in pcre_printint.c, which fills out the full names in many cases (and in |
2145 | some cases doesn't actually use these names at all). */ |
2146 | |
2147 | #define OP_NAME_LIST \ |
2148 | "End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d", \ |
2149 | "\\S", "\\s", "\\W", "\\w", "Any", "AllAny", "Anybyte", \ |
2150 | "notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v", \ |
2151 | "extuni", "\\Z", "\\z", \ |
2152 | "$", "$", "^", "^", "char", "chari", "not", "noti", \ |
2153 | "*", "*?", "+", "+?", "?", "??", /* OP_STAR to OP_POSUPTO (13 names) */ \ |
2154 | "{", "{", "{", \ |
2155 | "*+","++", "?+", "{", \ |
2156 | "*", "*?", "+", "+?", "?", "??", /* OP_STARI to OP_POSUPTOI */ \ |
2157 | "{", "{", "{", \ |
2158 | "*+","++", "?+", "{", \ |
2159 | "*", "*?", "+", "+?", "?", "??", /* OP_NOTSTAR to OP_NOTPOSUPTO */ \ |
2160 | "{", "{", "{", \ |
2161 | "*+","++", "?+", "{", \ |
2162 | "*", "*?", "+", "+?", "?", "??", /* OP_NOTSTARI to OP_NOTPOSUPTOI */ \ |
2163 | "{", "{", "{", \ |
2164 | "*+","++", "?+", "{", \ |
2165 | "*", "*?", "+", "+?", "?", "??", "{", "{", "{", /* OP_TYPESTAR to OP_TYPEPOSUPTO */ \ |
2166 | "*+","++", "?+", "{", \ |
2167 | "*", "*?", "+", "+?", "?", "??", "{", "{", /* OP_CRSTAR to OP_CRPOSRANGE (12 names) */ \ |
2168 | "*+","++", "?+", "{", \ |
2169 | "class", "nclass", "xclass", "Ref", "Refi", "DnRef", "DnRefi", \ |
2170 | "Recurse", "Callout", \ |
2171 | "Alt", "Ket", "KetRmax", "KetRmin", "KetRpos", \ |
2172 | "Reverse", "Assert", "Assert not", "AssertB", "AssertB not", \ |
2173 | "Once", "Once_NC", \ |
2174 | "Bra", "BraPos", "CBra", "CBraPos", \ |
2175 | "Cond", \ |
2176 | "SBra", "SBraPos", "SCBra", "SCBraPos", \ |
2177 | "SCond", \ |
2178 | "Cond ref", "Cond dnref", "Cond rec", "Cond dnrec", "Cond def", \ |
2179 | "Brazero", "Braminzero", "Braposzero", \ |
2180 | "*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \ |
2181 | "*THEN", "*THEN", "*COMMIT", "*FAIL", \ |
2182 | "*ACCEPT", "*ASSERT_ACCEPT", \ |
2183 | "Close", "Skip zero" |
2184 | |
2185 | |
2186 | /* This macro defines the length of fixed length operations in the compiled |
2187 | regex. The lengths are used when searching for specific things, and also in the |
2188 | debugging printing of a compiled regex. We use a macro so that it can be |
2189 | defined close to the definitions of the opcodes themselves. |
2190 | |
2191 | As things have been extended, some of these are no longer fixed lengths, but are |
2192 | minima instead. For example, the length of a single-character repeat may vary |
2193 | in UTF-8 mode. The code that uses this table must know about such things. */ |
2194 | |
2195 | #define OP_LENGTHS \ |
2196 | 1, /* End */ \ |
2197 | 1, 1, 1, 1, 1, /* \A, \G, \K, \B, \b */ \ |
2198 | 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ \ |
2199 | 1, 1, 1, /* Any, AllAny, Anybyte */ \ |
2200 | 3, 3, /* \P, \p */ \ |
2201 | 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ \ |
2202 | 1, /* \X */ \ |
2203 | 1, 1, 1, 1, 1, 1, /* \Z, \z, $, $M, ^, ^M */ \ |
2204 | 2, /* Char - the minimum length */ \ |
2205 | 2, /* Chari - the minimum length */ \ |
2206 | 2, /* not */ \ |
2207 | 2, /* noti */ \ |
2208 | /* Positive single-char repeats - these are minima in UTF-8 mode */ \ |
2209 | 2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? */ \ |
2210 | 2+IMM2_SIZE, 2+IMM2_SIZE, /* upto, minupto */ \ |
2211 | 2+IMM2_SIZE, /* exact */ \ |
2212 | 2, 2, 2, 2+IMM2_SIZE, /* *+, ++, ?+, upto+ */ \ |
2213 | 2, 2, 2, 2, 2, 2, /* *I, *?I, +I, +?I, ?I, ??I */ \ |
2214 | 2+IMM2_SIZE, 2+IMM2_SIZE, /* upto I, minupto I */ \ |
2215 | 2+IMM2_SIZE, /* exact I */ \ |
2216 | 2, 2, 2, 2+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */ \ |
2217 | /* Negative single-char repeats - only for chars < 256 */ \ |
2218 | 2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \ |
2219 | 2+IMM2_SIZE, 2+IMM2_SIZE, /* NOT upto, minupto */ \ |
2220 | 2+IMM2_SIZE, /* NOT exact */ \ |
2221 | 2, 2, 2, 2+IMM2_SIZE, /* Possessive NOT *, +, ?, upto */ \ |
2222 | 2, 2, 2, 2, 2, 2, /* NOT *I, *?I, +I, +?I, ?I, ??I */ \ |
2223 | 2+IMM2_SIZE, 2+IMM2_SIZE, /* NOT upto I, minupto I */ \ |
2224 | 2+IMM2_SIZE, /* NOT exact I */ \ |
2225 | 2, 2, 2, 2+IMM2_SIZE, /* Possessive NOT *I, +I, ?I, upto I */ \ |
2226 | /* Positive type repeats */ \ |
2227 | 2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \ |
2228 | 2+IMM2_SIZE, 2+IMM2_SIZE, /* Type upto, minupto */ \ |
2229 | 2+IMM2_SIZE, /* Type exact */ \ |
2230 | 2, 2, 2, 2+IMM2_SIZE, /* Possessive *+, ++, ?+, upto+ */ \ |
2231 | /* Character class & ref repeats */ \ |
2232 | 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \ |
2233 | 1+2*IMM2_SIZE, 1+2*IMM2_SIZE, /* CRRANGE, CRMINRANGE */ \ |
2234 | 1, 1, 1, 1+2*IMM2_SIZE, /* Possessive *+, ++, ?+, CRPOSRANGE */ \ |
2235 | 1+(32/sizeof(pcre_uchar)), /* CLASS */ \ |
2236 | 1+(32/sizeof(pcre_uchar)), /* NCLASS */ \ |
2237 | 0, /* XCLASS - variable length */ \ |
2238 | 1+IMM2_SIZE, /* REF */ \ |
2239 | 1+IMM2_SIZE, /* REFI */ \ |
2240 | 1+2*IMM2_SIZE, /* DNREF */ \ |
2241 | 1+2*IMM2_SIZE, /* DNREFI */ \ |
2242 | 1+LINK_SIZE, /* RECURSE */ \ |
2243 | 2+2*LINK_SIZE, /* CALLOUT */ \ |
2244 | 1+LINK_SIZE, /* Alt */ \ |
2245 | 1+LINK_SIZE, /* Ket */ \ |
2246 | 1+LINK_SIZE, /* KetRmax */ \ |
2247 | 1+LINK_SIZE, /* KetRmin */ \ |
2248 | 1+LINK_SIZE, /* KetRpos */ \ |
2249 | 1+LINK_SIZE, /* Reverse */ \ |
2250 | 1+LINK_SIZE, /* Assert */ \ |
2251 | 1+LINK_SIZE, /* Assert not */ \ |
2252 | 1+LINK_SIZE, /* Assert behind */ \ |
2253 | 1+LINK_SIZE, /* Assert behind not */ \ |
2254 | 1+LINK_SIZE, /* ONCE */ \ |
2255 | 1+LINK_SIZE, /* ONCE_NC */ \ |
2256 | 1+LINK_SIZE, /* BRA */ \ |
2257 | 1+LINK_SIZE, /* BRAPOS */ \ |
2258 | 1+LINK_SIZE+IMM2_SIZE, /* CBRA */ \ |
2259 | 1+LINK_SIZE+IMM2_SIZE, /* CBRAPOS */ \ |
2260 | 1+LINK_SIZE, /* COND */ \ |
2261 | 1+LINK_SIZE, /* SBRA */ \ |
2262 | 1+LINK_SIZE, /* SBRAPOS */ \ |
2263 | 1+LINK_SIZE+IMM2_SIZE, /* SCBRA */ \ |
2264 | 1+LINK_SIZE+IMM2_SIZE, /* SCBRAPOS */ \ |
2265 | 1+LINK_SIZE, /* SCOND */ \ |
2266 | 1+IMM2_SIZE, 1+2*IMM2_SIZE, /* CREF, DNCREF */ \ |
2267 | 1+IMM2_SIZE, 1+2*IMM2_SIZE, /* RREF, DNRREF */ \ |
2268 | 1, /* DEF */ \ |
2269 | 1, 1, 1, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ \ |
2270 | 3, 1, 3, /* MARK, PRUNE, PRUNE_ARG */ \ |
2271 | 1, 3, /* SKIP, SKIP_ARG */ \ |
2272 | 1, 3, /* THEN, THEN_ARG */ \ |
2273 | 1, 1, 1, 1, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */ \ |
2274 | 1+IMM2_SIZE, 1 /* CLOSE, SKIPZERO */ |
2275 | |
2276 | /* A magic value for OP_RREF to indicate the "any recursion" condition. */ |
2277 | |
2278 | #define RREF_ANY 0xffff /* 65535, the largest value that fits in 16 bits */ |
2279 | |
2280 | /* Compile time error code numbers. They are given names so that they can more |
2281 | easily be tracked. When a new number is added, the table called eint in |
2282 | pcreposix.c must be updated. */ |
2283 | |
2284 | enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, /* Values start at 0, so ERRn has the value n */ |
2285 | ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, |
2286 | ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, |
2287 | ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, |
2288 | ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, |
2289 | ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, |
2290 | ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, |
2291 | ERR70, ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, |
2292 | ERR80, ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERRCOUNT }; /* ERRCOUNT is 88, the number of error codes */ |
2293 | |
2294 | /* JIT compiling modes. The function list is indexed by them. */ |
2295 | |
2296 | enum { JIT_COMPILE, JIT_PARTIAL_SOFT_COMPILE, JIT_PARTIAL_HARD_COMPILE, /* Values 0, 1, 2 */ |
2297 | JIT_NUMBER_OF_COMPILE_MODES }; /* Value 3: the number of modes listed before it */ |
2298 | |
2299 | /* The real format of the start of the pcre block; the index of names and the |
2300 | code vector run on as long as necessary after the end. We store an explicit |
2301 | offset to the name table so that if a regex is compiled on one host, saved, and |
2302 | then run on another where the size of pointers is different, all might still |
2303 | be well. |
2304 | |
2305 | The size of the structure must be a multiple of 8 bytes. For the case of |
2306 | compiled-on-4 and run-on-8, we include an extra pointer that is always NULL so |
2307 | that there is an even number of pointers, whose total size is therefore a |
2308 | multiple of 8 bytes. |
2309 | |
2310 | It is necessary to fork the struct for the 32 bit library, since it needs to |
2311 | use pcre_uint32 for first_char and req_char. We can't put an ifdef inside the |
2312 | typedef because pcretest needs access to the struct of the 8-, 16- and 32-bit |
2313 | variants. |
2314 | |
2315 | *** WARNING *** |
2316 | When new fields are added to these structures, remember to adjust the code in |
2317 | pcre_byte_order.c that is concerned with swapping the byte order of the fields |
2318 | when a compiled regex is reloaded on a host with different endianness. |
2319 | *** WARNING *** |
2320 | There is also similar byte-flipping code in pcretest.c, which is used for |
2321 | testing the byte-flipping features. It must also be kept in step. |
2322 | *** WARNING *** |
2323 | */ |
2324 | |
/* Header of a compiled pattern as used by the 8-bit and 16-bit libraries.
first_char and req_char are 16 bits wide in this variant. The fixed-width
members occupy 6 x 4 + 12 x 2 = 48 bytes and are followed by two pointers, so
the total stays a multiple of 8 for both 4-byte and 8-byte pointers. The member
order and widths define the layout of a saved pattern: do not reorder them. */

typedef struct real_pcre8_or_16 {
  pcre_uint32 magic_number;
  pcre_uint32 size;               /* Total that was malloced */
  pcre_uint32 options;            /* Public options */
  pcre_uint32 flags;              /* Private flags */
  pcre_uint32 limit_match;        /* Limit set from regex */
  pcre_uint32 limit_recursion;    /* Limit set from regex */
  pcre_uint16 first_char;         /* Starting character */
  pcre_uint16 req_char;           /* This character must be seen */
  pcre_uint16 max_lookbehind;     /* Longest lookbehind (characters) */
  pcre_uint16 top_bracket;        /* Highest numbered group */
  pcre_uint16 top_backref;        /* Highest numbered back reference */
  pcre_uint16 name_table_offset;  /* Offset to name table that follows */
  pcre_uint16 name_entry_size;    /* Size of any name items */
  pcre_uint16 name_count;         /* Number of name items */
  pcre_uint16 ref_count;          /* Reference count */
  pcre_uint16 dummy1;             /* To ensure size is a multiple of 8 */
  pcre_uint16 dummy2;             /* To ensure size is a multiple of 8 */
  pcre_uint16 dummy3;             /* To ensure size is a multiple of 8 */
  const pcre_uint8 *tables;       /* Pointer to tables or NULL for std */
  void             *nullpad;      /* NULL padding (keeps the pointer count even) */
} real_pcre8_or_16;
2347 | |
/* The 8-bit and 16-bit libraries share one structure under two names. */
typedef struct real_pcre8_or_16 real_pcre;
typedef struct real_pcre8_or_16 real_pcre16;
2350 | |
/* Header of a compiled pattern as used by the 32-bit library. It differs from
real_pcre8_or_16 in that first_char and req_char are 32 bits wide, so only one
16-bit padding member is needed: 8 x 4 + 8 x 2 = 48 bytes of fixed-width
members, followed by two pointers. The member order and widths define the
layout of a saved pattern: do not reorder them. */

typedef struct real_pcre32 {
  pcre_uint32 magic_number;
  pcre_uint32 size;               /* Total that was malloced */
  pcre_uint32 options;            /* Public options */
  pcre_uint32 flags;              /* Private flags */
  pcre_uint32 limit_match;        /* Limit set from regex */
  pcre_uint32 limit_recursion;    /* Limit set from regex */
  pcre_uint32 first_char;         /* Starting character */
  pcre_uint32 req_char;           /* This character must be seen */
  pcre_uint16 max_lookbehind;     /* Longest lookbehind (characters) */
  pcre_uint16 top_bracket;        /* Highest numbered group */
  pcre_uint16 top_backref;        /* Highest numbered back reference */
  pcre_uint16 name_table_offset;  /* Offset to name table that follows */
  pcre_uint16 name_entry_size;    /* Size of any name items */
  pcre_uint16 name_count;         /* Number of name items */
  pcre_uint16 ref_count;          /* Reference count */
  pcre_uint16 dummy;              /* To ensure size is a multiple of 8 */
  const pcre_uint8 *tables;       /* Pointer to tables or NULL for std */
  void             *nullpad;      /* NULL padding (keeps the pointer count even) */
} real_pcre32;
2371 | |
/* Select the compiled-pattern structure that matches the library width being
built. There is no fallback branch here: if none of COMPILE_PCRE8,
COMPILE_PCRE16 or COMPILE_PCRE32 is defined, REAL_PCRE stays undefined. */
#if defined COMPILE_PCRE8
#define REAL_PCRE real_pcre
#elif defined COMPILE_PCRE16
#define REAL_PCRE real_pcre16
#elif defined COMPILE_PCRE32
#define REAL_PCRE real_pcre32
#endif
2379 | |
/* Compile-time assertion that sizeof(REAL_PCRE) is divisible by 8. When it is
not, the array size below evaluates to -1, which is invalid and stops the
compilation. Only the structure selected by REAL_PCRE for this build is
checked. */
typedef int __assert_real_pcre_size_divisible_8[(sizeof(REAL_PCRE) % 8) == 0 ? 1 : -1];
2382 | |
2383 | /* Needed in pcretest to access some fields in the real_pcre* structures |
2384 | * directly. They're unified for 8/16/32 bits since the structs only differ |
2385 | * after these fields; if that ever changes, need to fork those defines into |
2386 | * 8/16 and 32 bit versions. */ |
/* Field accessors taking a pointer to a compiled pattern of any pointer type.
The argument is parenthesized before the cast so that a compound expression
(for example "base + offset") is converted to REAL_PCRE* as a whole, instead of
having the cast bind to its first operand only. Each macro yields an lvalue. */
#define REAL_PCRE_MAGIC(re)     (((REAL_PCRE*)(re))->magic_number)
#define REAL_PCRE_SIZE(re)      (((REAL_PCRE*)(re))->size)
#define REAL_PCRE_OPTIONS(re)   (((REAL_PCRE*)(re))->options)
#define REAL_PCRE_FLAGS(re)     (((REAL_PCRE*)(re))->flags)
2391 | |
/* The format of the block used to store data from pcre_study(). The same
remark about extending this structure applies (see the WARNING above the
real_pcre structures). */
2394 | |
/* Fixed-size record holding the results of studying a pattern. start_bits is
32 bytes, i.e. one bit for each of 256 possible values. */
typedef struct pcre_study_data {
  pcre_uint32 size;               /* Total that was malloced */
  pcre_uint32 flags;              /* Private flags */
  pcre_uint8 start_bits[32];      /* Starting char bits */
  pcre_uint32 minlength;          /* Minimum subject length */
} pcre_study_data;
2401 | |
2402 | /* Structure for building a chain of open capturing subpatterns during |
2403 | compiling, so that instructions to close them can be compiled when (*ACCEPT) is |
2404 | encountered. This is also used to identify subpatterns that contain recursive |
2405 | back references to themselves, so that they can be made atomic. */ |
2406 | |
/* One node of a singly linked chain of currently open capturing groups. */
typedef struct open_capitem {
  struct open_capitem *next;    /* Chain link */
  pcre_uint16 number;           /* Capture number */
  pcre_uint16 flag;             /* Set TRUE if recursive back ref */
} open_capitem;
2412 | |
2413 | /* Structure for building a list of named groups during the first pass of |
2414 | compiling. */ |
2415 | |
/* One entry in the list of named groups. The name is not copied: the entry
points into the pattern and carries an explicit length. */
typedef struct named_group {
  const pcre_uchar  *name;          /* Points to the name in the pattern */
  int                length;        /* Length of the name */
  pcre_uint32        number;        /* Group number */
} named_group;
2421 | |
2422 | /* Structure for passing "static" information around between the functions |
2423 | doing the compiling, so that they are thread-safe. */ |
2424 | |
/* Per-compilation state. Holding it in a structure that is passed between the
compiling functions, rather than in static variables, is what keeps them
thread-safe. */
typedef struct compile_data {
  const pcre_uint8 *lcc;            /* Points to lower casing table */
  const pcre_uint8 *fcc;            /* Points to case-flipping table */
  const pcre_uint8 *cbits;          /* Points to character type table */
  const pcre_uint8 *ctypes;         /* Points to table of type maps */
  const pcre_uchar *start_workspace;/* The start of working space */
  const pcre_uchar *start_code;     /* The start of the compiled code */
  const pcre_uchar *start_pattern;  /* The start of the pattern */
  const pcre_uchar *end_pattern;    /* The end of the pattern */
  pcre_uchar *hwm;                  /* High watermark of workspace */
  open_capitem *open_caps;          /* Chain of open capture items */
  named_group *named_groups;        /* Points to vector in pre-compile */
  pcre_uchar *name_table;           /* The name/number table */
  int  names_found;                 /* Number of entries so far */
  int  name_entry_size;             /* Size of each entry */
  int  named_group_list_size;       /* Number of entries in the list */
  int  workspace_size;              /* Size of workspace */
  unsigned int bracount;            /* Count of capturing parens as we compile */
  int  final_bracount;              /* Saved value after first pass */
  int  max_lookbehind;              /* Maximum lookbehind (characters) */
  int  top_backref;                 /* Maximum back reference */
  unsigned int backref_map;         /* Bitmap of low back refs */
  unsigned int namedrefcount;       /* Number of backreferences by name */
  int  parens_depth;                /* Depth of nested parentheses */
  int  assert_depth;                /* Depth of nested assertions */
  pcre_uint32 external_options;     /* External (initial) options */
  pcre_uint32 external_flags;       /* External flag bits to be set */
  int  req_varyopt;                 /* "After variable item" flag for reqbyte */
  BOOL had_accept;                  /* (*ACCEPT) encountered */
  BOOL had_pruneorskip;             /* (*PRUNE) or (*SKIP) encountered */
  BOOL check_lookbehind;            /* Lookbehinds need later checking */
  BOOL dupnames;                    /* Duplicate names exist */
  BOOL dupgroups;                   /* Duplicate groups exist: (?| found */
  BOOL iscondassert;                /* Next assert is a condition */
  int  nltype;                      /* Newline type */
  int  nllen;                       /* Newline string length */
  pcre_uchar nl[4];                 /* Newline string when fixed length */
} compile_data;
2463 | |
2464 | /* Structure for maintaining a chain of pointers to the currently incomplete |
2465 | branches, for testing for left recursion while compiling. */ |
2466 | |
/* One link in the chain of currently incomplete branches. */
typedef struct branch_chain {
  struct branch_chain *outer;     /* Next link; presumably the enclosing level - confirm */
  pcre_uchar *current_branch;     /* The incomplete branch this link refers to */
} branch_chain;
2471 | |
2472 | /* Structure for mutual recursion detection. */ |
2473 | |
/* One link in a backward-linked list used for mutual recursion detection. */
typedef struct recurse_check {
  struct recurse_check *prev;     /* Previous link in the list */
  const pcre_uchar *group;        /* Presumably the group being checked - confirm */
} recurse_check;
2478 | |
2479 | /* Structure for items in a linked list that represents an explicit recursive |
2480 | call within the pattern; used by pcre_exec(). */ |
2481 | |
/* One record per active explicit recursive call; records link back to the
previous one through prevrec. */
typedef struct recursion_info {
  struct recursion_info *prevrec; /* Previous recursion record (or NULL) */
  unsigned int group_num;         /* Number of group that was called */
  int *offset_save;               /* Pointer to start of saved offsets */
  int saved_max;                  /* Number of saved offsets */
  int saved_capture_last;         /* Last capture number */
  PCRE_PUCHAR subject_position;   /* Position at start of recursion */
} recursion_info;
2490 | |
2491 | /* A similar structure for pcre_dfa_exec(). */ |
2492 | |
/* Reduced counterpart of recursion_info: it has no saved-offset or capture
members, and group_num is a signed int here. Member meanings are presumably the
same as for the like-named members of recursion_info - confirm. */
typedef struct dfa_recursion_info {
  struct dfa_recursion_info *prevrec;   /* Previous record in the list */
  int group_num;                        /* Group number for this record */
  PCRE_PUCHAR subject_position;         /* Subject position for this record */
} dfa_recursion_info;
2498 | |
2499 | /* Structure for building a chain of data for holding the values of the subject |
2500 | pointer at the start of each subpattern, so as to detect when an empty string |
2501 | has been matched by a subpattern - to break infinite loops; used by |
2502 | pcre_exec(). */ |
2503 | |
/* One link in the chain of saved subject pointers. */
typedef struct eptrblock {
  struct eptrblock *epb_prev;     /* Previous link in the chain */
  PCRE_PUCHAR epb_saved_eptr;     /* Saved value of the subject pointer */
} eptrblock;
2508 | |
2509 | |
2510 | /* Structure for passing "static" information around between the functions |
2511 | doing traditional NFA matching, so that they are thread-safe. */ |
2512 | |
/* Per-match state. Holding it in a structure that is passed between the
matching functions, rather than in static variables, is what keeps them
thread-safe. The final member exists only when NO_RECURSE is defined, so the
size of this structure depends on the build configuration. */
typedef struct match_data {
  unsigned long int match_call_count;      /* As it says */
  unsigned long int match_limit;           /* As it says */
  unsigned long int match_limit_recursion; /* As it says */
  int   *offset_vector;           /* Offset vector */
  int    offset_end;              /* One past the end */
  int    offset_max;              /* The maximum usable for return data */
  int    nltype;                  /* Newline type */
  int    nllen;                   /* Newline string length */
  int    name_count;              /* Number of names in name table */
  int    name_entry_size;         /* Size of entry in names table */
  unsigned int skip_arg_count;    /* For counting SKIP_ARGs */
  unsigned int ignore_skip_arg;   /* For re-run when SKIP arg name not found */
  pcre_uchar *name_table;         /* Table of names */
  pcre_uchar nl[4];               /* Newline string when fixed */
  const  pcre_uint8 *lcc;         /* Points to lower casing table */
  const  pcre_uint8 *fcc;         /* Points to case-flipping table */
  const  pcre_uint8 *ctypes;      /* Points to table of type maps */
  BOOL   notbol;                  /* NOTBOL flag */
  BOOL   noteol;                  /* NOTEOL flag */
  BOOL   utf;                     /* UTF flag (UTF-8, UTF-16 or UTF-32, by library width) */
  BOOL   jscript_compat;          /* JAVASCRIPT_COMPAT flag */
  BOOL   use_ucp;                 /* PCRE_UCP flag */
  BOOL   endonly;                 /* Dollar not before final \n */
  BOOL   notempty;                /* Empty string match not wanted */
  BOOL   notempty_atstart;        /* Empty string match at start not wanted */
  BOOL   hitend;                  /* Hit the end of the subject at some point */
  BOOL   bsr_anycrlf;             /* \R is just any CRLF, not full Unicode */
  BOOL   hasthen;                 /* Pattern contains (*THEN) */
  const  pcre_uchar *start_code;  /* For use when recursing */
  PCRE_PUCHAR start_subject;      /* Start of the subject string */
  PCRE_PUCHAR end_subject;        /* End of the subject string */
  PCRE_PUCHAR start_match_ptr;    /* Start of matched string */
  PCRE_PUCHAR end_match_ptr;      /* Subject position at end match */
  PCRE_PUCHAR start_used_ptr;     /* Earliest consulted character */
  int    partial;                 /* PARTIAL options */
  int    end_offset_top;          /* Highwater mark at end of match */
  pcre_int32 capture_last;        /* Most recent capture number + overflow flag */
  int    start_offset;            /* The start offset value */
  int    match_function_type;     /* Set for certain special calls of MATCH() */
  eptrblock *eptrchain;           /* Chain of eptrblocks for tail recursions */
  int    eptrn;                   /* Next free eptrblock */
  recursion_info *recursive;      /* Linked list of recursion data */
  void  *callout_data;            /* To pass back to callouts */
  const  pcre_uchar *mark;        /* Mark pointer to pass back on success */
  const  pcre_uchar *nomatch_mark;/* Mark pointer to pass back on failure */
  const  pcre_uchar *once_target; /* Where to back up to for atomic groups */
#ifdef NO_RECURSE
  void  *match_frames_base;       /* For remembering malloc'd frames */
#endif
} match_data;
2564 | |
2565 | /* A similar structure is used for the same purpose by the DFA matching |
2566 | functions. */ |
2567 | |
/* Per-match state for the DFA matching functions. */
typedef struct dfa_match_data {
  const pcre_uchar *start_code;     /* Start of the compiled pattern */
  const pcre_uchar *start_subject ; /* Start of the subject string */
  const pcre_uchar *end_subject;    /* End of subject string */
  const pcre_uchar *start_used_ptr; /* Earliest consulted character */
  const pcre_uint8 *tables;         /* Character tables */
  int   start_offset;               /* The start offset value */
  int   moptions;                   /* Match options */
  int   poptions;                   /* Pattern options */
  int   nltype;                     /* Newline type */
  int   nllen;                      /* Newline string length */
  pcre_uchar nl[4];                 /* Newline string when fixed */
  void *callout_data;               /* To pass back to callouts */
  dfa_recursion_info *recursive;    /* Linked list of recursion data */
} dfa_match_data;
2583 | |
2584 | /* Bit definitions for entries in the pcre_ctypes table. */ |
2585 | |
/* Each flag occupies one bit; bits 5 and 6 are not assigned here. */
#define ctype_space   (1 << 0)
#define ctype_letter  (1 << 1)
#define ctype_digit   (1 << 2)
#define ctype_xdigit  (1 << 3)
#define ctype_word    (1 << 4)   /* alphanumeric or '_' */
#define ctype_meta    (1 << 7)   /* regexp meta char or zero (end pattern) */
2592 | |
2593 | /* Offsets for the bitmap tables in pcre_cbits. Each table contains a set |
2594 | of bits for a class map. Some classes are built by combining these tables. */ |
2595 | |
/* Consecutive class maps are 32 bytes apart, so each offset is written as
"index * 32". */
#define cbit_space     (0 * 32)   /* [:space:] or \s */
#define cbit_xdigit    (1 * 32)   /* [:xdigit:] */
#define cbit_digit     (2 * 32)   /* [:digit:] or \d */
#define cbit_upper     (3 * 32)   /* [:upper:] */
#define cbit_lower     (4 * 32)   /* [:lower:] */
#define cbit_word      (5 * 32)   /* [:word:] or \w */
#define cbit_graph     (6 * 32)   /* [:graph:] */
#define cbit_print     (7 * 32)   /* [:print:] */
#define cbit_punct     (8 * 32)   /* [:punct:] */
#define cbit_cntrl     (9 * 32)   /* [:cntrl:] */
#define cbit_length   (10 * 32)   /* Length of the cbits table */
2607 | |
2608 | /* Offsets of the various tables from the base tables pointer, and |
2609 | total length. */ |
2610 | |
/* Each offset is the previous offset plus the length of the previous table:
256 bytes for each of the lcc, fcc and ctypes tables, cbit_length for the
cbits table. */
#define lcc_offset      0
#define fcc_offset      (lcc_offset + 256)
#define cbits_offset    (fcc_offset + 256)
#define ctypes_offset   (cbits_offset + cbit_length)
#define tables_length   (ctypes_offset + 256)
2616 | |
2617 | /* Internal function and data prefixes. */ |
2618 | |
/* PUBL(name) and PRIV(name) paste a width-specific prefix onto a symbol name:
pcre_ / _pcre_ for the 8-bit build, pcre16_ / _pcre16_ for the 16-bit build and
pcre32_ / _pcre32_ for the 32-bit build. Each definition is guarded by #ifndef,
so a definition made before this point is left in place. Building with none of
the COMPILE_PCREn macros defined is rejected with #error. */
#if defined COMPILE_PCRE8
#ifndef PUBL
#define PUBL(name) pcre_##name
#endif
#ifndef PRIV
#define PRIV(name) _pcre_##name
#endif
#elif defined COMPILE_PCRE16
#ifndef PUBL
#define PUBL(name) pcre16_##name
#endif
#ifndef PRIV
#define PRIV(name) _pcre16_##name
#endif
#elif defined COMPILE_PCRE32
#ifndef PUBL
#define PUBL(name) pcre32_##name
#endif
#ifndef PRIV
#define PRIV(name) _pcre32_##name
#endif
#else
#error Unsupported compiling mode
#endif /* COMPILE_PCRE[8|16|32] */
2643 | |
2644 | /* Layout of the UCP type table that translates property names into types and |
2645 | codes. Each entry used to point directly to a name, but to reduce the number of |
2646 | relocations in shared libraries, it now has an offset into a single string |
2647 | instead. */ |
2648 | |
/* One entry of the UCP type table: three 16-bit members and no pointers. */
typedef struct {
  pcre_uint16 name_offset;    /* Offset of the property name within a single string */
  pcre_uint16 type;           /* Property type */
  pcre_uint16 value;          /* Property code */
} ucp_type_table;
2654 | |
2655 | |
2656 | /* Internal shared data tables. These are tables that are used by more than one |
2657 | of the exported public functions. They have to be "external" in the C sense, |
2658 | but are not part of the PCRE public API. The data for these tables is in the |
2659 | pcre_tables.c module. */ |
2660 | |
/* UTF-8 tables; declared only for the 8-bit build. */
#ifdef COMPILE_PCRE8
extern const int            PRIV(utf8_table1)[];
extern const int            PRIV(utf8_table1_size);   /* presumably the entry count of utf8_table1 - confirm */
extern const int            PRIV(utf8_table2)[];
extern const int            PRIV(utf8_table3)[];
extern const pcre_uint8     PRIV(utf8_table4)[];
#endif /* COMPILE_PCRE8 */
2668 | |
/* Property-name table: utt holds ucp_type_table entries; utt_names is
presumably the single string their name_offset members index, and utt_size the
entry count - confirm. */
extern const char           PRIV(utt_names)[];
extern const ucp_type_table PRIV(utt)[];
extern const int            PRIV(utt_size);

/* Tables declared for all library widths. */
extern const pcre_uint8     PRIV(OP_lengths)[];
extern const pcre_uint8     PRIV(default_tables)[];

/* Lists of 32-bit values; by their names, horizontal and vertical space
characters - confirm. */
extern const pcre_uint32    PRIV(hspace_list)[];
extern const pcre_uint32    PRIV(vspace_list)[];
2678 | |
2679 | |
2680 | /* Internal shared functions. These are functions that are used by more than |
2681 | one of the exported public functions. They have to be "external" in the C |
2682 | sense, but are not part of the PCRE public API. */ |
2683 | |
2684 | /* String comparison functions. */ |
2685 | #if defined COMPILE_PCRE8 |
2686 | |
/* In the 8-bit library the string helpers map straight onto the <string.h>
functions. The casts go to "const char *" so that const-qualified arguments
keep their qualifier, and every macro argument is parenthesized so that a
compound expression is cast as a whole.

  STRCMP_UC_UC(str1, str2)        compare two pattern-character strings
  STRCMP_UC_C8(str1, str2)        compare a pattern-character string with a
                                  char string (str2 is passed uncast)
  STRNCMP_UC_UC(str1, str2, num)  as above, at most num characters
  STRNCMP_UC_C8(str1, str2, num)  as above, at most num characters
  STRLEN_UC(str)                  length of a pattern-character string

Results are exactly those of strcmp(), strncmp() and strlen(). */

#define STRCMP_UC_UC(str1, str2) \
  strcmp((const char *)(str1), (const char *)(str2))
#define STRCMP_UC_C8(str1, str2) \
  strcmp((const char *)(str1), (str2))
#define STRNCMP_UC_UC(str1, str2, num) \
  strncmp((const char *)(str1), (const char *)(str2), (num))
#define STRNCMP_UC_C8(str1, str2, num) \
  strncmp((const char *)(str1), (str2), (num))
#define STRLEN_UC(str) strlen((const char *)(str))
2696 | |
2697 | #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32 |
2698 | |
/* In the 16-bit and 32-bit libraries the string helpers are real functions,
declared here and reached through the same macro names as in the 8-bit case.
Lengths and counts are unsigned int in these declarations. */

extern int               PRIV(strcmp_uc_uc)(const pcre_uchar *,
                           const pcre_uchar *);
extern int               PRIV(strcmp_uc_c8)(const pcre_uchar *,
                           const char *);
extern int               PRIV(strncmp_uc_uc)(const pcre_uchar *,
                           const pcre_uchar *, unsigned int num);
extern int               PRIV(strncmp_uc_c8)(const pcre_uchar *,
                           const char *, unsigned int num);
extern unsigned int      PRIV(strlen_uc)(const pcre_uchar *str);

/* Macro front ends: arguments are forwarded unchanged to the functions. */
#define STRCMP_UC_UC(str1, str2) \
  PRIV(strcmp_uc_uc)((str1), (str2))
#define STRCMP_UC_C8(str1, str2) \
  PRIV(strcmp_uc_c8)((str1), (str2))
#define STRNCMP_UC_UC(str1, str2, num) \
  PRIV(strncmp_uc_uc)((str1), (str2), (num))
#define STRNCMP_UC_C8(str1, str2, num) \
  PRIV(strncmp_uc_c8)((str1), (str2), (num))
#define STRLEN_UC(str) PRIV(strlen_uc)(str)
2718 | |
2719 | #endif /* COMPILE_PCRE[8|16|32] */ |
2720 | |
/* The _TEST comparison macros. For the 8-bit and 16-bit libraries they are
plain aliases of STRCMP_UC_UC and STRCMP_UC_C8. For the 32-bit library they
choose between a _utf function and the ordinary one at run time.

NOTE: the 32-bit forms read an identifier named "utf" that is not a macro
parameter, so a variable of that name must be in scope wherever they are
used. */

#if defined COMPILE_PCRE8 || defined COMPILE_PCRE16

#define STRCMP_UC_UC_TEST(str1, str2) STRCMP_UC_UC(str1, str2)
#define STRCMP_UC_C8_TEST(str1, str2) STRCMP_UC_C8(str1, str2)

#elif defined COMPILE_PCRE32

extern int               PRIV(strcmp_uc_uc_utf)(const pcre_uchar *,
                           const pcre_uchar *);
extern int               PRIV(strcmp_uc_c8_utf)(const pcre_uchar *,
                           const char *);

#define STRCMP_UC_UC_TEST(str1, str2) \
  (utf ? PRIV(strcmp_uc_uc_utf)((str1), (str2)) : PRIV(strcmp_uc_uc)((str1), (str2)))
#define STRCMP_UC_C8_TEST(str1, str2) \
  (utf ? PRIV(strcmp_uc_c8_utf)((str1), (str2)) : PRIV(strcmp_uc_c8)((str1), (str2)))

#endif /* COMPILE_PCRE[8|16|32] */
2739 | |
/* Prototypes of the shared internal functions. Parameter names are omitted in
these declarations, so argument meanings must be taken from the definitions.
is_newline and was_newline have identical signatures. */
extern const pcre_uchar *PRIV(find_bracket)(const pcre_uchar *, BOOL, int);
extern BOOL              PRIV(is_newline)(PCRE_PUCHAR, int, PCRE_PUCHAR,
                           int *, BOOL);
extern unsigned int      PRIV(ord2utf)(pcre_uint32, pcre_uchar *);
extern int               PRIV(valid_utf)(PCRE_PUCHAR, int, int *);
extern BOOL              PRIV(was_newline)(PCRE_PUCHAR, int, PCRE_PUCHAR,
                           int *, BOOL);
extern BOOL              PRIV(xclass)(pcre_uint32, const pcre_uchar *, BOOL);
2748 | |
/* JIT entry points; declared only when SUPPORT_JIT is defined. The final int
argument of jit_compile is presumably one of the JIT_*_COMPILE mode values -
confirm against the definition. */
#ifdef SUPPORT_JIT
extern void              PRIV(jit_compile)(const REAL_PCRE *,
                           PUBL(extra) *, int);
extern int               PRIV(jit_exec)(const PUBL(extra) *,
                           const pcre_uchar *, int, int, int, int *, int);
extern void              PRIV(jit_free)(void *);
extern int               PRIV(jit_get_size)(void *);
extern const char*       PRIV(jit_get_target)(void);
#endif
2758 | |
2759 | /* Unicode character database (UCD) */ |
2760 | |
/* One record of Unicode character properties: four single-byte members
followed by one signed 32-bit member. other_case is a signed offset that is
added to the character value, not a character in its own right. */
typedef struct {
  pcre_uint8 script;     /* ucp_Arabic, etc. */
  pcre_uint8 chartype;   /* ucp_Cc, etc. (general categories) */
  pcre_uint8 gbprop;     /* ucp_gbControl, etc. (grapheme break property) */
  pcre_uint8 caseset;    /* offset to multichar other cases or zero */
  pcre_int32 other_case; /* offset to other case, or zero if none */
} ucd_record;
2768 | |
/* Unicode data tables. ucd_stage1 and ucd_stage2 are the two index levels used
to reach an entry of ucd_records; ucp_gentype is indexed by a record's
chartype. dummy_ucd_record is declared only for the 32-bit build and
ucp_typerange only when SUPPORT_JIT is defined. */
extern const pcre_uint32 PRIV(ucd_caseless_sets)[];
extern const ucd_record  PRIV(ucd_records)[];
extern const pcre_uint8  PRIV(ucd_stage1)[];
extern const pcre_uint16 PRIV(ucd_stage2)[];
extern const pcre_uint32 PRIV(ucp_gentype)[];
extern const pcre_uint32 PRIV(ucp_gbtable)[];
#ifdef COMPILE_PCRE32
extern const ucd_record  PRIV(dummy_ucd_record)[];
#endif
#ifdef SUPPORT_JIT
extern const int         PRIV(ucp_typerange)[];
#endif
2781 | |
2782 | #ifdef SUPPORT_UCP |
2783 | /* UCD access macros */ |
2784 | |
/* Two-stage lookup of the ucd_record for a character. ucd_stage1 is indexed by
ch / UCD_BLOCK_SIZE and yields a block number; ucd_stage2 is indexed by
block * UCD_BLOCK_SIZE + ch % UCD_BLOCK_SIZE and yields an index into
ucd_records. REAL_GET_UCD performs no range check on ch.

Every use of the macro argument is parenthesized so that a compound argument
(for example a conditional expression) is evaluated as a whole. The argument
is evaluated more than once, so it must have no side effects. */

#define UCD_BLOCK_SIZE 128
#define REAL_GET_UCD(ch) (PRIV(ucd_records) + \
        PRIV(ucd_stage2)[PRIV(ucd_stage1)[(int)(ch) / UCD_BLOCK_SIZE] * \
        UCD_BLOCK_SIZE + (int)(ch) % UCD_BLOCK_SIZE])

/* In the 32-bit library, values above 0x10ffff get the dummy record instead
of going through the table lookup. */
#ifdef COMPILE_PCRE32
#define GET_UCD(ch) (((ch) > 0x10ffff)? PRIV(dummy_ucd_record) : REAL_GET_UCD(ch))
#else
#define GET_UCD(ch) REAL_GET_UCD(ch)
#endif

/* Property accessors. UCD_CATEGORY maps the record's chartype through
ucp_gentype; UCD_OTHERCASE adds the record's signed other_case offset to the
character and yields a pcre_uint32. */
#define UCD_CHARTYPE(ch)    GET_UCD(ch)->chartype
#define UCD_SCRIPT(ch)      GET_UCD(ch)->script
#define UCD_CATEGORY(ch)    PRIV(ucp_gentype)[UCD_CHARTYPE(ch)]
#define UCD_GRAPHBREAK(ch)  GET_UCD(ch)->gbprop
#define UCD_CASESET(ch)     GET_UCD(ch)->caseset
#define UCD_OTHERCASE(ch)   ((pcre_uint32)((int)(ch) + (int)(GET_UCD(ch)->other_case)))
2802 | |
2803 | #endif /* SUPPORT_UCP */ |
2804 | |
2805 | #endif |
2806 | |
2807 | /* End of pcre_internal.h */ |
2808 | |