1 | /************************************************* |
2 | * Perl-Compatible Regular Expressions * |
3 | *************************************************/ |
4 | |
5 | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | and semantics are as close as possible to those of the Perl 5 language. |
7 | |
8 | Written by Philip Hazel |
9 | Original API code Copyright (c) 1997-2012 University of Cambridge |
10 | New API code Copyright (c) 2016-2022 University of Cambridge |
11 | |
12 | ----------------------------------------------------------------------------- |
13 | Redistribution and use in source and binary forms, with or without |
14 | modification, are permitted provided that the following conditions are met: |
15 | |
16 | * Redistributions of source code must retain the above copyright notice, |
17 | this list of conditions and the following disclaimer. |
18 | |
19 | * Redistributions in binary form must reproduce the above copyright |
20 | notice, this list of conditions and the following disclaimer in the |
21 | documentation and/or other materials provided with the distribution. |
22 | |
23 | * Neither the name of the University of Cambridge nor the names of its |
24 | contributors may be used to endorse or promote products derived from |
25 | this software without specific prior written permission. |
26 | |
27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | POSSIBILITY OF SUCH DAMAGE. |
38 | ----------------------------------------------------------------------------- |
39 | */ |
40 | |
41 | |
42 | /* This module contains mode-dependent macro and structure definitions. The |
43 | file is #included by pcre2_internal.h if PCRE2_CODE_UNIT_WIDTH is defined. |
44 | These mode-dependent items are kept in a separate file so that they can also be |
45 | #included multiple times for different code unit widths by pcre2test in order |
46 | to have access to the hidden structures at all supported widths. |
47 | |
48 | Some of the mode-dependent macros are required at different widths for |
49 | different parts of the pcre2test code (in particular, the included |
50 | pcre_printint.c file). We undefine them here so that they can be re-defined for |
51 | multiple inclusions. Not all of these are used in pcre2test, but it's easier |
52 | just to undefine them all. */ |
53 | |
/* Link offsets stored in compiled code, and the matching size limit. */
#undef GET
#undef PUT
#undef PUTINC
#undef MAX_PATTERN_SIZE

/* 16-bit immediate values. */
#undef GET2
#undef PUT2
#undef PUT2INC
#undef IMM2_SIZE

/* Range checks, MARK length limit and 256-entry table access. */
#undef CHMAX_255
#undef MAX_255
#undef MAX_MARK
#undef TABLE_GET

/* Conversions between byte counts and code unit counts. */
#undef BYTES2CU
#undef CU2BYTES

/* Properties of UTF code units. */
#undef MAX_UTF_SINGLE_CU
#undef HAS_EXTRALEN
#undef GET_EXTRALEN
#undef NOT_FIRSTCU

/* Fetching and depositing characters. */
#undef GETCHAR
#undef GETCHARTEST
#undef GETCHARINC
#undef GETCHARINCTEST
#undef GETCHARLEN
#undef GETCHARLENTEST
#undef PUTCHAR

/* Moving a pointer to a character boundary. */
#undef BACKCHAR
#undef FORWARDCHAR
#undef FORWARDCHARTEST
#undef ACROSSCHAR
83 | |
84 | |
85 | |
86 | /* -------------------------- MACROS ----------------------------- */ |
87 | |
88 | /* PCRE keeps offsets in its compiled code as at least 16-bit quantities |
89 | (always stored in big-endian order in 8-bit mode) by default. These are used, |
90 | for example, to link from the start of a subpattern to its alternatives and its |
91 | end. The use of 16 bits per offset limits the size of an 8-bit compiled regex |
92 | to around 64K, which is big enough for almost everybody. However, I received a |
93 | request for an even bigger limit. For this reason, and also to make the code |
94 | easier to maintain, the storing and loading of offsets from the compiled code |
95 | unit string is now handled by the macros that are defined here. |
96 | |
97 | The macros are controlled by the value of LINK_SIZE. This defaults to 2, but |
98 | values of 3 or 4 are also supported. */ |
99 | |
100 | /* ------------------- 8-bit support ------------------ */ |
101 | |
102 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
103 | |
104 | #if LINK_SIZE == 2 |
105 | #define PUT(a,n,d) \ |
106 | (a[n] = (PCRE2_UCHAR)((d) >> 8)), \ |
107 | (a[(n)+1] = (PCRE2_UCHAR)((d) & 255)) |
108 | #define GET(a,n) \ |
109 | (unsigned int)(((a)[n] << 8) | (a)[(n)+1]) |
110 | #define MAX_PATTERN_SIZE (1 << 16) |
111 | |
112 | #elif LINK_SIZE == 3 |
113 | #define PUT(a,n,d) \ |
114 | (a[n] = (PCRE2_UCHAR)((d) >> 16)), \ |
115 | (a[(n)+1] = (PCRE2_UCHAR)((d) >> 8)), \ |
116 | (a[(n)+2] = (PCRE2_UCHAR)((d) & 255)) |
117 | #define GET(a,n) \ |
118 | (unsigned int)(((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2]) |
119 | #define MAX_PATTERN_SIZE (1 << 24) |
120 | |
121 | #elif LINK_SIZE == 4 |
122 | #define PUT(a,n,d) \ |
123 | (a[n] = (PCRE2_UCHAR)((d) >> 24)), \ |
124 | (a[(n)+1] = (PCRE2_UCHAR)((d) >> 16)), \ |
125 | (a[(n)+2] = (PCRE2_UCHAR)((d) >> 8)), \ |
126 | (a[(n)+3] = (PCRE2_UCHAR)((d) & 255)) |
127 | #define GET(a,n) \ |
128 | (unsigned int)(((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3]) |
129 | #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */ |
130 | |
131 | #else |
132 | #error LINK_SIZE must be 2, 3, or 4 |
133 | #endif |
134 | |
135 | |
136 | /* ------------------- 16-bit support ------------------ */ |
137 | |
138 | #elif PCRE2_CODE_UNIT_WIDTH == 16 |
139 | |
140 | #if LINK_SIZE == 2 |
141 | #undef LINK_SIZE |
142 | #define LINK_SIZE 1 |
143 | #define PUT(a,n,d) \ |
144 | (a[n] = (PCRE2_UCHAR)(d)) |
145 | #define GET(a,n) \ |
146 | (a[n]) |
147 | #define MAX_PATTERN_SIZE (1 << 16) |
148 | |
149 | #elif LINK_SIZE == 3 || LINK_SIZE == 4 |
150 | #undef LINK_SIZE |
151 | #define LINK_SIZE 2 |
152 | #define PUT(a,n,d) \ |
153 | (a[n] = (PCRE2_UCHAR)((d) >> 16)), \ |
154 | (a[(n)+1] = (PCRE2_UCHAR)((d) & 65535)) |
155 | #define GET(a,n) \ |
156 | (unsigned int)(((a)[n] << 16) | (a)[(n)+1]) |
157 | #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */ |
158 | |
159 | #else |
160 | #error LINK_SIZE must be 2, 3, or 4 |
161 | #endif |
162 | |
163 | |
164 | /* ------------------- 32-bit support ------------------ */ |
165 | |
166 | #elif PCRE2_CODE_UNIT_WIDTH == 32 |
167 | #undef LINK_SIZE |
168 | #define LINK_SIZE 1 |
169 | #define PUT(a,n,d) \ |
170 | (a[n] = (d)) |
171 | #define GET(a,n) \ |
172 | (a[n]) |
173 | #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */ |
174 | |
175 | #else |
176 | #error Unsupported compiling mode |
177 | #endif |
178 | |
179 | |
180 | /* --------------- Other mode-specific macros ----------------- */ |
181 | |
/* PCRE uses some other (at least) 16-bit quantities that do not change when
the size of offsets changes. These are used for repeat counts and for other
things such as capturing parenthesis numbers in back references.

Define the number of code units required to hold a 16-bit count/offset, and
macros to load and store such a value. For reasons that I do not understand,
the expression in the 8-bit GET2 macro is treated by gcc as a signed
expression, even when a is declared as unsigned. It seems that any kind of
arithmetic results in a signed value. Hence the cast. */
191 | |
/* IMM2_SIZE is the number of code units that hold one 16-bit immediate value.
GET2(a,n) loads such a value starting at a[n] and PUT2(a,n,d) stores d there;
in 8-bit mode the value takes two units, most significant first. Every
expansion is one parenthesized expression and the arguments are parenthesized
where they are used, so a pointer expression such as "p + 1" can be passed
as a. */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define IMM2_SIZE 2
#define GET2(a,n) ((unsigned int)(((a)[n] << 8) | (a)[(n)+1]))
#define PUT2(a,n,d) ((a)[n] = (d) >> 8, (a)[(n)+1] = (d) & 255)

#else  /* Code units are 16 or 32 bits */
#define IMM2_SIZE 1
#define GET2(a,n) ((a)[n])
#define PUT2(a,n,d) ((a)[n] = (d))
#endif
202 | |
203 | /* Other macros that are different for 8-bit mode. The MAX_255 macro checks |
204 | whether its argument, which is assumed to be one code unit, is less than 256. |
205 | The CHMAX_255 macro does not assume one code unit. The maximum length of a MARK |
206 | name must fit in one code unit; currently it is set to 255 or 65535. The |
207 | TABLE_GET macro is used to access elements of tables containing exactly 256 |
208 | items. Its argument is a code unit. When code points can be greater than 255, a |
209 | check is needed before accessing these tables. */ |
210 | |
#if PCRE2_CODE_UNIT_WIDTH != 8

/* Code units are 16 or 32 bits. A unit can exceed 255, so both range checks
are real comparisons, and TABLE_GET yields the supplied default instead of
indexing the table when the check fails. */

#define SUPPORT_WIDE_CHARS
#define MAX_MARK ((1u << 16) - 1)
#define MAX_255(c) ((c) <= 255u)
#define CHMAX_255(c) ((c) <= 255u)
#define TABLE_GET(c, table, default) (MAX_255(c)? ((table)[c]):(default))

#else

/* Code units are 8 bits. MAX_255 is constant TRUE and TABLE_GET indexes the
table directly. CHMAX_255 is a real comparison only when Unicode support is
compiled in. */

#define MAX_MARK ((1u << 8) - 1)
#define MAX_255(c) TRUE
#define TABLE_GET(c, table, default) ((table)[c])
#ifdef SUPPORT_UNICODE
#define SUPPORT_WIDE_CHARS
#define CHMAX_255(c) ((c) <= 255u)
#else
#define CHMAX_255(c) TRUE
#endif  /* SUPPORT_UNICODE */

#endif  /* PCRE2_CODE_UNIT_WIDTH != 8 */
229 | |
230 | |
231 | /* ----------------- Character-handling macros ----------------- */ |
232 | |
233 | /* There is a proposed future special "UTF-21" mode, in which only the lowest |
234 | 21 bits of a 32-bit character are interpreted as UTF, with the remaining 11 |
235 | high-order bits available to the application for other uses. In preparation for |
236 | the future implementation of this mode, there are macros that load a data item |
237 | and, if in this special mode, mask it to 21 bits. These macros all have names |
238 | starting with UCHAR21. In all other modes, including the normal 32-bit |
239 | library, the macros all have the same simple definitions. When the new mode is |
240 | implemented, it is expected that these definitions will be varied appropriately |
241 | using #ifdef when compiling the library that supports the special mode. */ |
242 | |
/* At present all four are a plain dereference of the pointer argument; the
two INC forms also post-increment that pointer. */

#define UCHAR21(ptr)         (*(ptr))
#define UCHAR21TEST(ptr)     (*(ptr))
#define UCHAR21INC(ptr)      (*(ptr)++)
#define UCHAR21INCTEST(ptr)  (*(ptr)++)
247 | |
248 | /* When UTF encoding is being used, a character is no longer just a single |
249 | byte in 8-bit mode or a single short in 16-bit mode. The macros for character |
250 | handling generate simple sequences when used in the basic mode, and more |
251 | complicated ones for UTF characters. GETCHARLENTEST and other macros are not |
252 | used when UTF is not supported. To make sure they can never even appear when |
253 | UTF support is omitted, we don't even define them. */ |
254 | |
255 | #ifndef SUPPORT_UNICODE |
256 | |
/* Without Unicode support a character is exactly one code unit, so each fetch
macro is a plain load (with a post-increment for the INC forms) and PUTCHAR
stores one unit and yields 1. GETCHARLEN leaves its len argument alone.

The following are deliberately left undefined in this configuration:

  MAX_UTF_SINGLE_CU
  HAS_EXTRALEN(c)
  GET_EXTRALEN(c)
  NOT_FIRSTCU(c)
  GETCHARLENTEST(c, eptr, len)
  BACKCHAR(eptr)
  FORWARDCHAR(eptr)
  FORWARDCHARTEST(eptr,end)
  ACROSSCHAR(condition, eptr, action)
*/

#define GETCHAR(c, eptr) c = *eptr;
#define GETCHARTEST(c, eptr) c = *eptr;
#define GETCHARINC(c, eptr) c = *eptr++;
#define GETCHARINCTEST(c, eptr) c = *eptr++;
#define GETCHARLEN(c, eptr, len) c = *eptr;
#define PUTCHAR(c, p) (*p = c, 1)
272 | |
273 | #else /* SUPPORT_UNICODE */ |
274 | |
275 | /* ------------------- 8-bit support ------------------ */ |
276 | |
277 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
#define MAYBE_UTF_MULTI          /* UTF chars may use multiple code units */

/* Highest UTF code point that fits in a single code unit. */

#define MAX_UTF_SINGLE_CU 127

/* Code unit classification. HAS_EXTRALEN(c) tests whether the code point
needs extra code units to decode; it forwards to HASUTF8EXTRALEN, which is
defined elsewhere. GET_EXTRALEN(c) gives the number of additional code units,
taken from PRIV(utf8_table4); its result is undefined unless HAS_EXTRALEN(c) is
TRUE. NOT_FIRSTCU(c) is TRUE when c has the bit pattern 10xxxxxx, that is, when
c is not the first code unit of a UTF sequence. */

#define HAS_EXTRALEN(c) HASUTF8EXTRALEN(c)
#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3fu])
#define NOT_FIRSTCU(c) (((c) & 0xc0u) == 0x80u)

/* Character fetching. Each macro loads the code unit at eptr into c and, when
c is 0xc0 or above, passes control to the corresponding GETUTF8 helper (defined
elsewhere) to complete the character. The forms whose names end in TEST do
that only when the variable "utf", which must be in scope where the macro is
used, is true. Each expansion is two statements, so none of these may be the
whole body of an unbraced if, else or loop.

  GETCHAR         pointer not advanced; used when UTF-8 mode is known
  GETCHARTEST     pointer not advanced; tests utf
  GETCHARINC      pointer advanced; used when UTF-8 mode is known
  GETCHARINCTEST  pointer advanced; tests utf
  GETCHARLEN      pointer not advanced, len incremented if there are extra
                    bytes; used when UTF-8 mode is known
  GETCHARLENTEST  as GETCHARLEN, but tests utf
*/

#define GETCHAR(c, eptr) \
  c = *eptr; \
  if (c >= 0xc0u) GETUTF8(c, eptr);

#define GETCHARTEST(c, eptr) \
  c = *eptr; \
  if (utf && c >= 0xc0u) GETUTF8(c, eptr);

#define GETCHARINC(c, eptr) \
  c = *eptr++; \
  if (c >= 0xc0u) GETUTF8INC(c, eptr);

#define GETCHARINCTEST(c, eptr) \
  c = *eptr++; \
  if (utf && c >= 0xc0u) GETUTF8INC(c, eptr);

#define GETCHARLEN(c, eptr, len) \
  c = *eptr; \
  if (c >= 0xc0u) GETUTF8LEN(c, eptr, len);

#define GETCHARLENTEST(c, eptr, len) \
  c = *eptr; \
  if (utf && c >= 0xc0u) GETUTF8LEN(c, eptr, len);

/* Pointer adjustment. BACKCHAR steps the pointer backwards while it points at
a code unit that is not the first of a character. It is called only in UTF-8
mode; there is no test inside the macro because almost all calls are already
within a block of UTF-8 only code. FORWARDCHAR does the same in the forward
direction, and FORWARDCHARTEST additionally stops when the pointer reaches
end. ACROSSCHAR is the fully customizable form: while the caller's condition
holds and the pointer is at a non-first code unit, it performs the caller's
action. */

#define BACKCHAR(eptr) while((*eptr & 0xc0u) == 0x80u) eptr--
#define FORWARDCHAR(eptr) while((*eptr & 0xc0u) == 0x80u) eptr++
#define FORWARDCHARTEST(eptr,end) while(eptr < end && (*eptr & 0xc0u) == 0x80u) eptr++
#define ACROSSCHAR(condition, eptr, action) \
  while((condition) && ((*eptr) & 0xc0u) == 0x80u) action

/* Deposit a character into memory, returning the number of code units. When
utf is true and c is above MAX_UTF_SINGLE_CU the work is done by
PRIV(ord2utf); otherwise one code unit is stored. */

#define PUTCHAR(c, p) ((utf && c > MAX_UTF_SINGLE_CU)? \
  PRIV(ord2utf)(c,p) : (*p = c, 1))
359 | |
360 | |
361 | /* ------------------- 16-bit support ------------------ */ |
362 | |
363 | #elif PCRE2_CODE_UNIT_WIDTH == 16 |
#define MAYBE_UTF_MULTI          /* UTF chars may use multiple code units */

/* The largest UTF code point that can be encoded as a single code unit. */

#define MAX_UTF_SINGLE_CU 65535

/* Tests whether the code point needs extra code units to decode. This is the
case when c is a high surrogate (0xd800 to 0xdbff). */

#define HAS_EXTRALEN(c) (((c) & 0xfc00u) == 0xd800u)

/* Returns the number of additional code units if HAS_EXTRALEN(c) is TRUE.
Otherwise the result is not meaningful. In UTF-16 the only possibility is one
further unit, the low surrogate. */

#define GET_EXTRALEN(c) (1)

/* Returns TRUE if the given value is not the first code unit of a UTF
sequence, that is, if it is a low surrogate (0xdc00 to 0xdfff). */

#define NOT_FIRSTCU(c) (((c) & 0xfc00u) == 0xdc00u)
383 | |
/* Base macros for completing a surrogate pair. On entry c holds a high
surrogate that has already been loaded; the next code unit supplies the low ten
bits. GETUTF16 reads that unit as eptr[1] and does not advance the pointer.
GETUTF16INC reads it as *eptr++, for use after the pointer has already been
moved past the high surrogate. GETUTF16LEN is GETUTF16 plus an increment of
len. */

#define GETUTF16(c, eptr) \
   { c = (((c & 0x3ffu) << 10) | (eptr[1] & 0x3ffu)) + 0x10000u; }

#define GETUTF16INC(c, eptr) \
   { c = (((c & 0x3ffu) << 10) | (*eptr++ & 0x3ffu)) + 0x10000u; }

#define GETUTF16LEN(c, eptr, len) \
   { c = (((c & 0x3ffu) << 10) | (eptr[1] & 0x3ffu)) + 0x10000u; len++; }

/* Character fetching. Each macro loads the code unit at eptr into c and, when
c is a high surrogate, completes the character with the matching base macro
above. The forms whose names end in TEST do that only when the variable "utf",
which must be in scope where the macro is used, is true. Each expansion is
more than one statement, so none of these may be the whole body of an unbraced
if, else or loop.

  GETCHAR         pointer not advanced; used when UTF-16 mode is known
  GETCHARTEST     pointer not advanced; tests utf
  GETCHARINC      pointer advanced; used when UTF-16 mode is known
  GETCHARINCTEST  pointer advanced; tests utf
  GETCHARLEN      pointer not advanced, len incremented if there is a low
                    surrogate; used when UTF-16 mode is known
  GETCHARLENTEST  as GETCHARLEN, but tests utf
*/

#define GETCHAR(c, eptr) \
  c = *eptr; \
  if ((c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);

#define GETCHARTEST(c, eptr) \
  c = *eptr; \
  if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);

#define GETCHARINC(c, eptr) \
  c = *eptr++; \
  if ((c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);

#define GETCHARINCTEST(c, eptr) \
  c = *eptr++; \
  if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);

#define GETCHARLEN(c, eptr, len) \
  c = *eptr; \
  if ((c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);

#define GETCHARLENTEST(c, eptr, len) \
  c = *eptr; \
  if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);

/* Pointer adjustment. BACKCHAR moves the pointer back by one code unit if it
points at a low surrogate. It is called only in UTF-16 mode; there is no test
inside the macro because almost all calls are already within a block of UTF-16
only code. FORWARDCHAR is the same in the forward direction, and
FORWARDCHARTEST additionally requires the pointer to be below end.
ACROSSCHAR is the fully customizable form: if the caller's condition holds and
the pointer is at a low surrogate, it performs the caller's action. Each of
these expands to an "if" statement without an "else". */

#define BACKCHAR(eptr) if ((*eptr & 0xfc00u) == 0xdc00u) eptr--
#define FORWARDCHAR(eptr) if ((*eptr & 0xfc00u) == 0xdc00u) eptr++
#define FORWARDCHARTEST(eptr,end) if (eptr < end && (*eptr & 0xfc00u) == 0xdc00u) eptr++
#define ACROSSCHAR(condition, eptr, action) \
  if ((condition) && ((*eptr) & 0xfc00u) == 0xdc00u) action

/* Deposit a character into memory, returning the number of code units. When
utf is true and c is above MAX_UTF_SINGLE_CU the work is done by
PRIV(ord2utf); otherwise one code unit is stored. */

#define PUTCHAR(c, p) ((utf && c > MAX_UTF_SINGLE_CU)? \
  PRIV(ord2utf)(c,p) : (*p = c, 1))
465 | |
466 | |
467 | /* ------------------- 32-bit support ------------------ */ |
468 | |
469 | #else |
470 | |
/* These are trivial for the 32-bit library, since all UTF-32 characters fit
into one PCRE2_UCHAR unit. Nothing ever has extra code units, and no code unit
is ever a non-first unit. */

#define MAX_UTF_SINGLE_CU (0x10ffffu)
#define HAS_EXTRALEN(c) (0)
#define GET_EXTRALEN(c) (0)
#define NOT_FIRSTCU(c) (0)

/* Character fetching. Every form is a single load of the code unit at eptr
into c; the INC forms also post-increment the pointer. The TEST forms are the
same as the others because nothing here depends on whether UTF-32 mode is
active, and the LEN forms never touch len because every UTF-32 character has
length 1. */

#define GETCHAR(c, eptr) \
  c = *(eptr);

#define GETCHARTEST(c, eptr) \
  c = *(eptr);

#define GETCHARINC(c, eptr) \
  c = *((eptr)++);

#define GETCHARINCTEST(c, eptr) \
  c = *((eptr)++);

#define GETCHARLEN(c, eptr, len) \
  c = *(eptr);

#define GETCHARLENTEST(c, eptr, len) \
  c = *(eptr);

/* Pointer adjustment. In the other modes these macros move a pointer to the
start of a character. Here they are all no-ops since all UTF-32 characters fit
into one PCRE2_UCHAR; ACROSSCHAR evaluates neither its condition nor its
action. */

#define BACKCHAR(eptr) do { } while (0)
#define FORWARDCHAR(eptr) do { } while (0)
#define FORWARDCHARTEST(eptr,end) do { } while (0)
#define ACROSSCHAR(condition, eptr, action) do { } while (0)

/* Deposit a character into memory, returning the number of code units, which
is always 1. */

#define PUTCHAR(c, p) (*p = c, 1)
538 | |
539 | #endif /* UTF-32 character handling */ |
540 | #endif /* SUPPORT_UNICODE */ |
541 | |
542 | |
543 | /* Mode-dependent macros that have the same definition in all modes. */ |
544 | |
/* CU2BYTES converts a count of code units to bytes and BYTES2CU goes the other
way (integer division). PUTINC and PUT2INC perform PUT or PUT2 and then move
the pointer argument on by LINK_SIZE or IMM2_SIZE code units respectively. */

#define CU2BYTES(x)     ((x) * (PCRE2_CODE_UNIT_WIDTH / 8))
#define BYTES2CU(x)     ((x) / (PCRE2_CODE_UNIT_WIDTH / 8))
#define PUTINC(a,n,d)   PUT(a,n,d), a += LINK_SIZE
#define PUT2INC(a,n,d)  PUT2(a,n,d), a += IMM2_SIZE
549 | |
550 | |
551 | /* ----------------------- HIDDEN STRUCTURES ----------------------------- */ |
552 | |
553 | /* NOTE: All these structures *must* start with a pcre2_memctl structure. The |
554 | code that uses them is simpler because it assumes this. */ |
555 | |
556 | /* The real general context structure. At present it holds only data for custom |
557 | memory control. */ |
558 | |
559 | typedef struct pcre2_real_general_context { |
560 | pcre2_memctl memctl; |
561 | } pcre2_real_general_context; |
562 | |
563 | /* The real compile context structure */ |
564 | |
565 | typedef struct pcre2_real_compile_context { |
566 | pcre2_memctl memctl; |
567 | int (*stack_guard)(uint32_t, void *); |
568 | void *stack_guard_data; |
569 | const uint8_t *tables; |
570 | PCRE2_SIZE max_pattern_length; |
571 | uint16_t bsr_convention; |
572 | uint16_t newline_convention; |
573 | uint32_t parens_nest_limit; |
574 | uint32_t ; |
575 | } pcre2_real_compile_context; |
576 | |
577 | /* The real match context structure. */ |
578 | |
579 | typedef struct pcre2_real_match_context { |
580 | pcre2_memctl memctl; |
581 | #ifdef SUPPORT_JIT |
582 | pcre2_jit_callback jit_callback; |
583 | void *jit_callback_data; |
584 | #endif |
585 | int (*callout)(pcre2_callout_block *, void *); |
586 | void *callout_data; |
587 | int (*substitute_callout)(pcre2_substitute_callout_block *, void *); |
588 | void *substitute_callout_data; |
589 | PCRE2_SIZE offset_limit; |
590 | uint32_t heap_limit; |
591 | uint32_t match_limit; |
592 | uint32_t depth_limit; |
593 | } pcre2_real_match_context; |
594 | |
595 | /* The real convert context structure. */ |
596 | |
597 | typedef struct pcre2_real_convert_context { |
598 | pcre2_memctl memctl; |
599 | uint32_t glob_separator; |
600 | uint32_t glob_escape; |
601 | } pcre2_real_convert_context; |
602 | |
603 | /* The real compiled code structure. The type for the blocksize field is |
604 | defined specially because it is required in pcre2_serialize_decode() when |
605 | copying the size from possibly unaligned memory into a variable of the same |
606 | type. Use a macro rather than a typedef to avoid compiler warnings when this |
607 | file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the |
608 | largest lookbehind that is supported. (OP_REVERSE in a pattern has a 16-bit |
609 | argument in 8-bit and 16-bit modes, so we need no more than a 16-bit field |
610 | here.) */ |
611 | |
612 | #undef CODE_BLOCKSIZE_TYPE |
613 | #define CODE_BLOCKSIZE_TYPE size_t |
614 | |
615 | #undef LOOKBEHIND_MAX |
616 | #define LOOKBEHIND_MAX UINT16_MAX |
617 | |
618 | typedef struct pcre2_real_code { |
619 | pcre2_memctl memctl; /* Memory control fields */ |
620 | const uint8_t *tables; /* The character tables */ |
621 | void *executable_jit; /* Pointer to JIT code */ |
622 | uint8_t start_bitmap[32]; /* Bitmap for starting code unit < 256 */ |
623 | CODE_BLOCKSIZE_TYPE blocksize; /* Total (bytes) that was malloc-ed */ |
624 | uint32_t magic_number; /* Paranoid and endianness check */ |
625 | uint32_t compile_options; /* Options passed to pcre2_compile() */ |
626 | uint32_t overall_options; /* Options after processing the pattern */ |
627 | uint32_t ; /* Taken from compile_context */ |
628 | uint32_t flags; /* Various state flags */ |
629 | uint32_t limit_heap; /* Limit set in the pattern */ |
630 | uint32_t limit_match; /* Limit set in the pattern */ |
631 | uint32_t limit_depth; /* Limit set in the pattern */ |
632 | uint32_t first_codeunit; /* Starting code unit */ |
633 | uint32_t last_codeunit; /* This codeunit must be seen */ |
634 | uint16_t bsr_convention; /* What \R matches */ |
635 | uint16_t newline_convention; /* What is a newline? */ |
636 | uint16_t max_lookbehind; /* Longest lookbehind (characters) */ |
637 | uint16_t minlength; /* Minimum length of match */ |
638 | uint16_t top_bracket; /* Highest numbered group */ |
639 | uint16_t top_backref; /* Highest numbered back reference */ |
640 | uint16_t name_entry_size; /* Size (code units) of table entries */ |
641 | uint16_t name_count; /* Number of name entries in the table */ |
642 | } pcre2_real_code; |
643 | |
644 | /* The real match data structure. Define ovector as large as it can ever |
645 | actually be so that array bound checkers don't grumble. Memory for this |
646 | structure is obtained by calling pcre2_match_data_create(), which sets the size |
647 | as the offset of ovector plus a pair of elements for each capturable string, so |
648 | the size varies from call to call. As the maximum number of capturing |
649 | subpatterns is 65535 we must allow for 65536 strings to include the overall |
650 | match. (See also the heapframe structure below.) */ |
651 | |
652 | struct heapframe; /* Forward reference */ |
653 | |
654 | typedef struct pcre2_real_match_data { |
655 | pcre2_memctl memctl; /* Memory control fields */ |
656 | const pcre2_real_code *code; /* The pattern used for the match */ |
657 | PCRE2_SPTR subject; /* The subject that was matched */ |
658 | PCRE2_SPTR mark; /* Pointer to last mark */ |
659 | struct heapframe *heapframes; /* Backtracking frames heap memory */ |
660 | PCRE2_SIZE heapframes_size; /* Malloc-ed size */ |
661 | PCRE2_SIZE leftchar; /* Offset to leftmost code unit */ |
662 | PCRE2_SIZE rightchar; /* Offset to rightmost code unit */ |
663 | PCRE2_SIZE startchar; /* Offset to starting code unit */ |
664 | uint8_t matchedby; /* Type of match (normal, JIT, DFA) */ |
665 | uint8_t flags; /* Various flags */ |
666 | uint16_t oveccount; /* Number of pairs */ |
667 | int rc; /* The return code from the match */ |
668 | PCRE2_SIZE ovector[131072]; /* Must be last in the structure */ |
669 | } pcre2_real_match_data; |
670 | |
671 | |
672 | /* ----------------------- PRIVATE STRUCTURES ----------------------------- */ |
673 | |
674 | /* These structures are not needed for pcre2test. */ |
675 | |
676 | #ifndef PCRE2_PCRE2TEST |
677 | |
678 | /* Structures for checking for mutual recursion when scanning compiled or |
679 | parsed code. */ |
680 | |
681 | typedef struct recurse_check { |
682 | struct recurse_check *prev; |
683 | PCRE2_SPTR group; |
684 | } recurse_check; |
685 | |
/* One entry in a chain of parsed_recurse_check items. The group is identified
by a pointer to a uint32_t rather than by a code unit pointer. */

typedef struct parsed_recurse_check {
  struct parsed_recurse_check *prev;  /* Link to another entry in the chain */
  uint32_t *groupptr;                 /* Pointer identifying a group */
} parsed_recurse_check;
690 | |
691 | /* Structure for building a cache when filling in recursion offsets. */ |
692 | |
693 | typedef struct recurse_cache { |
694 | PCRE2_SPTR group; |
695 | int groupnumber; |
696 | } recurse_cache; |
697 | |
698 | /* Structure for maintaining a chain of pointers to the currently incomplete |
699 | branches, for testing for left recursion while compiling. */ |
700 | |
701 | typedef struct branch_chain { |
702 | struct branch_chain *outer; |
703 | PCRE2_UCHAR *current_branch; |
704 | } branch_chain; |
705 | |
706 | /* Structure for building a list of named groups during the first pass of |
707 | compiling. */ |
708 | |
709 | typedef struct named_group { |
710 | PCRE2_SPTR name; /* Points to the name in the pattern */ |
711 | uint32_t number; /* Group number */ |
712 | uint16_t length; /* Length of the name */ |
713 | uint16_t isdup; /* TRUE if a duplicate */ |
714 | } named_group; |
715 | |
716 | /* Structure for passing "static" information around between the functions |
717 | doing the compiling, so that they are thread-safe. */ |
718 | |
719 | typedef struct compile_block { |
720 | pcre2_real_compile_context *cx; /* Points to the compile context */ |
721 | const uint8_t *lcc; /* Points to lower casing table */ |
722 | const uint8_t *fcc; /* Points to case-flipping table */ |
723 | const uint8_t *cbits; /* Points to character type table */ |
724 | const uint8_t *ctypes; /* Points to table of type maps */ |
725 | PCRE2_SPTR start_workspace; /* The start of working space */ |
726 | PCRE2_SPTR start_code; /* The start of the compiled code */ |
727 | PCRE2_SPTR start_pattern; /* The start of the pattern */ |
728 | PCRE2_SPTR end_pattern; /* The end of the pattern */ |
729 | PCRE2_UCHAR *name_table; /* The name/number table */ |
730 | PCRE2_SIZE workspace_size; /* Size of workspace */ |
731 | PCRE2_SIZE small_ref_offset[10]; /* Offsets for \1 to \9 */ |
732 | PCRE2_SIZE erroroffset; /* Offset of error in pattern */ |
733 | uint16_t names_found; /* Number of entries so far */ |
734 | uint16_t name_entry_size; /* Size of each entry */ |
735 | uint16_t parens_depth; /* Depth of nested parentheses */ |
736 | uint16_t assert_depth; /* Depth of nested assertions */ |
737 | open_capitem *open_caps; /* Chain of open capture items */ |
738 | named_group *named_groups; /* Points to vector in pre-compile */ |
739 | uint32_t named_group_list_size; /* Number of entries in the list */ |
740 | uint32_t external_options; /* External (initial) options */ |
741 | uint32_t external_flags; /* External flag bits to be set */ |
742 | uint32_t bracount; /* Count of capturing parentheses */ |
743 | uint32_t lastcapture; /* Last capture encountered */ |
744 | uint32_t *parsed_pattern; /* Parsed pattern buffer */ |
745 | uint32_t *parsed_pattern_end; /* Parsed pattern should not get here */ |
746 | uint32_t *groupinfo; /* Group info vector */ |
747 | uint32_t top_backref; /* Maximum back reference */ |
748 | uint32_t backref_map; /* Bitmap of low back refs */ |
749 | uint32_t nltype; /* Newline type */ |
750 | uint32_t nllen; /* Newline string length */ |
751 | uint32_t class_range_start; /* Overall class range start */ |
752 | uint32_t class_range_end; /* Overall class range end */ |
753 | PCRE2_UCHAR nl[4]; /* Newline string when fixed length */ |
754 | uint32_t req_varyopt; /* "After variable item" flag for reqbyte */ |
755 | int max_lookbehind; /* Maximum lookbehind (characters) */ |
756 | BOOL had_accept; /* (*ACCEPT) encountered */ |
757 | BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */ |
758 | BOOL had_recurse; /* Had a recursion or subroutine call */ |
759 | BOOL dupnames; /* Duplicate names exist */ |
760 | } compile_block; |
761 | |
762 | /* Structure for keeping the properties of the in-memory stack used |
763 | by the JIT matcher. */ |
764 | |
765 | typedef struct pcre2_real_jit_stack { |
766 | pcre2_memctl memctl; |
767 | void* stack; |
768 | } pcre2_real_jit_stack; |
769 | |
770 | /* Structure for items in a linked list that represents an explicit recursive |
771 | call within the pattern when running pcre2_dfa_match(). */ |
772 | |
773 | typedef struct dfa_recursion_info { |
774 | struct dfa_recursion_info *prevrec; |
775 | PCRE2_SPTR subject_position; |
776 | uint32_t group_num; |
777 | } dfa_recursion_info; |
778 | |
779 | /* Structure for "stack" frames that are used for remembering backtracking |
780 | positions during matching. As these are used in a vector, with the ovector item |
781 | being extended, the size of the structure must be a multiple of PCRE2_SIZE. The |
782 | only way to check this at compile time is to force an error by generating an |
783 | array with a negative size. By putting this in a typedef (which is never used), |
784 | we don't generate any code when all is well. */ |
785 | |
786 | typedef struct heapframe { |
787 | |
788 | /* The first set of fields are variables that have to be preserved over calls |
789 | to RRMATCH(), but which do not need to be copied to new frames. */ |
790 | |
791 | PCRE2_SPTR ecode; /* The current position in the pattern */ |
792 | PCRE2_SPTR temp_sptr[2]; /* Used for short-term PCRE_SPTR values */ |
793 | PCRE2_SIZE length; /* Used for character, string, or code lengths */ |
794 | PCRE2_SIZE back_frame; /* Amount to subtract on RRETURN */ |
795 | PCRE2_SIZE temp_size; /* Used for short-term PCRE2_SIZE values */ |
796 | uint32_t rdepth; /* "Recursion" depth */ |
797 | uint32_t group_frame_type; /* Type information for group frames */ |
798 | uint32_t temp_32[4]; /* Used for short-term 32-bit or BOOL values */ |
799 | uint8_t return_id; /* Where to go on in internal "return" */ |
800 | uint8_t op; /* Processing opcode */ |
801 | |
802 | /* At this point, the structure is 16-bit aligned. On most architectures |
803 | the alignment requirement for a pointer will ensure that the eptr field below |
804 | is 32-bit or 64-bit aligned. However, on m68k it is fine to have a pointer |
805 | that is 16-bit aligned. We must therefore ensure that what comes between here |
806 | and eptr is an odd multiple of 16 bits so as to get back into 32-bit |
807 | alignment. This happens naturally when PCRE2_UCHAR is 8 bits wide, but needs |
808 | fudges in the other cases. In the 32-bit case the padding comes first so that |
809 | the occu field itself is 32-bit aligned. Without the padding, this structure |
810 | is no longer a multiple of PCRE2_SIZE on m68k, and the check below fails. */ |
811 | |
812 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
813 | PCRE2_UCHAR occu[6]; /* Used for other case code units */ |
814 | #elif PCRE2_CODE_UNIT_WIDTH == 16 |
815 | PCRE2_UCHAR occu[2]; /* Used for other case code units */ |
816 | uint8_t unused[2]; /* Ensure 32-bit alignment (see above) */ |
817 | #else |
818 | uint8_t unused[2]; /* Ensure 32-bit alignment (see above) */ |
819 | PCRE2_UCHAR occu[1]; /* Used for other case code units */ |
820 | #endif |
821 | |
822 | /* The rest have to be copied from the previous frame whenever a new frame |
823 | becomes current. The final field is specified as a large vector so that |
824 | runtime array bound checks don't catch references to it. However, for any |
825 | specific call to pcre2_match() the memory allocated for each frame structure |
826 | allows for exactly the right size ovector for the number of capturing |
827 | parentheses. (See also the comment for pcre2_real_match_data above.) */ |
828 | |
829 | PCRE2_SPTR eptr; /* MUST BE FIRST */ |
830 | PCRE2_SPTR start_match; /* Can be adjusted by \K */ |
831 | PCRE2_SPTR mark; /* Most recent mark on the success path */ |
832 | uint32_t current_recurse; /* Current (deepest) recursion number */ |
833 | uint32_t capture_last; /* Most recent capture */ |
834 | PCRE2_SIZE last_group_offset; /* Saved offset to most recent group frame */ |
835 | PCRE2_SIZE offset_top; /* Offset after highest capture */ |
836 | PCRE2_SIZE ovector[131072]; /* Must be last in the structure */ |
837 | } heapframe; |
838 | |
839 | /* This typedef is a check that the size of the heapframe structure is a |
840 | multiple of PCRE2_SIZE. See various comments above. */ |
841 | |
842 | typedef char check_heapframe_size[ |
843 | ((sizeof(heapframe) % sizeof(PCRE2_SIZE)) == 0)? (+1):(-1)]; |
844 | |
845 | /* Structure for computing the alignment of heapframe. */ |
846 | |
847 | typedef struct heapframe_align { |
848 | char unalign; /* Completely unalign the current offset */ |
849 | heapframe frame; /* Offset is its alignment */ |
850 | } heapframe_align; |
851 | |
/* The minimum alignment, in bytes, that a heapframe requires. It is taken
from the offset of the frame member of heapframe_align. */

#define HEAPFRAME_ALIGNMENT (offsetof(heapframe_align, frame))
855 | |
856 | /* Structure for passing "static" information around between the functions |
857 | doing traditional NFA matching (pcre2_match() and friends). */ |
858 | |
859 | typedef struct match_block { |
860 | pcre2_memctl memctl; /* For general use */ |
861 | PCRE2_SIZE heap_limit; /* As it says */ |
862 | uint32_t match_limit; /* As it says */ |
863 | uint32_t match_limit_depth; /* As it says */ |
864 | uint32_t match_call_count; /* Number of times a new frame is created */ |
865 | BOOL hitend; /* Hit the end of the subject at some point */ |
866 | BOOL hasthen; /* Pattern contains (*THEN) */ |
867 | BOOL allowemptypartial; /* Allow empty hard partial */ |
868 | const uint8_t *lcc; /* Points to lower casing table */ |
869 | const uint8_t *fcc; /* Points to case-flipping table */ |
870 | const uint8_t *ctypes; /* Points to table of type maps */ |
871 | PCRE2_SIZE start_offset; /* The start offset value */ |
872 | PCRE2_SIZE end_offset_top; /* Highwater mark at end of match */ |
873 | uint16_t partial; /* PARTIAL options */ |
874 | uint16_t bsr_convention; /* \R interpretation */ |
875 | uint16_t name_count; /* Number of names in name table */ |
876 | uint16_t name_entry_size; /* Size of entry in names table */ |
877 | PCRE2_SPTR name_table; /* Table of group names */ |
878 | PCRE2_SPTR start_code; /* For use when recursing */ |
879 | PCRE2_SPTR start_subject; /* Start of the subject string */ |
880 | PCRE2_SPTR check_subject; /* Where UTF-checked from */ |
881 | PCRE2_SPTR end_subject; /* End of the subject string */ |
882 | PCRE2_SPTR end_match_ptr; /* Subject position at end match */ |
883 | PCRE2_SPTR start_used_ptr; /* Earliest consulted character */ |
884 | PCRE2_SPTR last_used_ptr; /* Latest consulted character */ |
885 | PCRE2_SPTR mark; /* Mark pointer to pass back on success */ |
886 | PCRE2_SPTR nomatch_mark; /* Mark pointer to pass back on failure */ |
887 | PCRE2_SPTR verb_ecode_ptr; /* For passing back info */ |
888 | PCRE2_SPTR verb_skip_ptr; /* For passing back a (*SKIP) name */ |
889 | uint32_t verb_current_recurse; /* Current recurse when (*VERB) happens */ |
890 | uint32_t moptions; /* Match options */ |
891 | uint32_t poptions; /* Pattern options */ |
892 | uint32_t skip_arg_count; /* For counting SKIP_ARGs */ |
893 | uint32_t ignore_skip_arg; /* For re-run when SKIP arg name not found */ |
894 | uint32_t nltype; /* Newline type */ |
895 | uint32_t nllen; /* Newline string length */ |
896 | PCRE2_UCHAR nl[4]; /* Newline string when fixed */ |
897 | pcre2_callout_block *cb; /* Points to a callout block */ |
898 | void *callout_data; /* To pass back to callouts */ |
899 | int (*callout)(pcre2_callout_block *,void *); /* Callout function or NULL */ |
900 | } match_block; |
901 | |
902 | /* A similar structure is used for the same purpose by the DFA matching |
903 | functions. */ |
904 | |
905 | typedef struct dfa_match_block { |
906 | pcre2_memctl memctl; /* For general use */ |
907 | PCRE2_SPTR start_code; /* Start of the compiled pattern */ |
908 | PCRE2_SPTR start_subject ; /* Start of the subject string */ |
909 | PCRE2_SPTR end_subject; /* End of subject string */ |
910 | PCRE2_SPTR start_used_ptr; /* Earliest consulted character */ |
911 | PCRE2_SPTR last_used_ptr; /* Latest consulted character */ |
912 | const uint8_t *tables; /* Character tables */ |
913 | PCRE2_SIZE start_offset; /* The start offset value */ |
914 | PCRE2_SIZE heap_limit; /* As it says */ |
915 | PCRE2_SIZE heap_used; /* As it says */ |
916 | uint32_t match_limit; /* As it says */ |
917 | uint32_t match_limit_depth; /* As it says */ |
918 | uint32_t match_call_count; /* Number of calls of internal function */ |
919 | uint32_t moptions; /* Match options */ |
920 | uint32_t poptions; /* Pattern options */ |
921 | uint32_t nltype; /* Newline type */ |
922 | uint32_t nllen; /* Newline string length */ |
923 | BOOL allowemptypartial; /* Allow empty hard partial */ |
924 | PCRE2_UCHAR nl[4]; /* Newline string when fixed */ |
925 | uint16_t bsr_convention; /* \R interpretation */ |
926 | pcre2_callout_block *cb; /* Points to a callout block */ |
927 | void *callout_data; /* To pass back to callouts */ |
928 | int (*callout)(pcre2_callout_block *,void *); /* Callout function or NULL */ |
929 | dfa_recursion_info *recursive; /* Linked list of recursion data */ |
930 | } dfa_match_block; |
931 | |
932 | #endif /* PCRE2_PCRE2TEST */ |
933 | |
934 | /* End of pcre2_intmodedep.h */ |
935 | |