1 | /************************************************* |
2 | * Perl-Compatible Regular Expressions * |
3 | *************************************************/ |
4 | |
5 | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | and semantics are as close as possible to those of the Perl 5 language. |
7 | |
8 | Written by Philip Hazel |
9 | Original API code Copyright (c) 1997-2012 University of Cambridge |
10 | New API code Copyright (c) 2016-2022 University of Cambridge |
11 | |
12 | ----------------------------------------------------------------------------- |
13 | Redistribution and use in source and binary forms, with or without |
14 | modification, are permitted provided that the following conditions are met: |
15 | |
16 | * Redistributions of source code must retain the above copyright notice, |
17 | this list of conditions and the following disclaimer. |
18 | |
19 | * Redistributions in binary form must reproduce the above copyright |
20 | notice, this list of conditions and the following disclaimer in the |
21 | documentation and/or other materials provided with the distribution. |
22 | |
23 | * Neither the name of the University of Cambridge nor the names of its |
24 | contributors may be used to endorse or promote products derived from |
25 | this software without specific prior written permission. |
26 | |
27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | POSSIBILITY OF SUCH DAMAGE. |
38 | ----------------------------------------------------------------------------- |
39 | */ |
40 | |
41 | /* This module contains functions that scan a compiled pattern and change |
42 | repeats into possessive repeats where possible. */ |
43 | |
44 | |
45 | #ifdef HAVE_CONFIG_H |
46 | #include "config.h" |
47 | #endif |
48 | |
49 | |
50 | #include "pcre2_internal.h" |
51 | |
52 | |
53 | /************************************************* |
54 | * Tables for auto-possessification * |
55 | *************************************************/ |
56 | |
57 | /* This table is used to check whether auto-possessification is possible |
58 | between adjacent character-type opcodes. The left-hand (repeated) opcode is |
59 | used to select the row, and the right-hand opcode is use to select the column. |
60 | A value of 1 means that auto-possessification is OK. For example, the second |
61 | value in the first row means that \D+\d can be turned into \D++\d. |
62 | |
63 | The Unicode property types (\P and \p) have to be present to fill out the table |
64 | because of what their opcode values are, but the table values should always be |
65 | zero because property types are handled separately in the code. The last four |
66 | columns apply to items that cannot be repeated, so there is no need to have |
67 | rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is |
68 | *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ |
69 | |
70 | #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1) |
71 | #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1) |
72 | |
73 | static const uint8_t autoposstab[APTROWS][APTCOLS] = { |
74 | /* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */ |
75 | { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */ |
76 | { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */ |
77 | { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */ |
78 | { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */ |
79 | { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */ |
80 | { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */ |
81 | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */ |
82 | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */ |
83 | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */ |
84 | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */ |
85 | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */ |
86 | { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */ |
87 | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */ |
88 | { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */ |
89 | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */ |
90 | { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */ |
91 | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */ |
92 | }; |
93 | |
94 | #ifdef SUPPORT_UNICODE |
95 | /* This table is used to check whether auto-possessification is possible |
96 | between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The |
97 | left-hand (repeated) opcode is used to select the row, and the right-hand |
98 | opcode is used to select the column. The values are as follows: |
99 | |
100 | 0 Always return FALSE (never auto-possessify) |
101 | 1 Character groups are distinct (possessify if both are OP_PROP) |
102 | 2 Check character categories in the same group (general or particular) |
103 | 3 TRUE if the two opcodes are not the same (PROP vs NOTPROP) |
104 | |
105 | 4 Check left general category vs right particular category |
106 | 5 Check right general category vs left particular category |
107 | |
108 | 6 Left alphanum vs right general category |
109 | 7 Left space vs right general category |
110 | 8 Left word vs right general category |
111 | |
112 | 9 Right alphanum vs left general category |
113 | 10 Right space vs left general category |
114 | 11 Right word vs left general category |
115 | |
116 | 12 Left alphanum vs right particular category |
117 | 13 Left space vs right particular category |
118 | 14 Left word vs right particular category |
119 | |
120 | 15 Right alphanum vs left particular category |
121 | 16 Right space vs left particular category |
122 | 17 Right word vs left particular category |
123 | */ |
124 | |
125 | static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = { |
126 | /* ANY LAMP GC PC SC SCX ALNUM SPACE PXSPACE WORD CLIST UCNC BIDICL BOOL */ |
127 | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */ |
128 | { 0, 3, 0, 0, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0 }, /* PT_LAMP */ |
129 | { 0, 0, 2, 4, 0, 0, 9, 10, 10, 11, 0, 0, 0, 0 }, /* PT_GC */ |
130 | { 0, 0, 5, 2, 0, 0, 15, 16, 16, 17, 0, 0, 0, 0 }, /* PT_PC */ |
131 | { 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_SC */ |
132 | { 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_SCX */ |
133 | { 0, 3, 6, 12, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0 }, /* PT_ALNUM */ |
134 | { 0, 1, 7, 13, 0, 0, 1, 3, 3, 1, 0, 0, 0, 0 }, /* PT_SPACE */ |
135 | { 0, 1, 7, 13, 0, 0, 1, 3, 3, 1, 0, 0, 0, 0 }, /* PT_PXSPACE */ |
136 | { 0, 0, 8, 14, 0, 0, 0, 1, 1, 3, 0, 0, 0, 0 }, /* PT_WORD */ |
137 | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */ |
138 | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0 }, /* PT_UCNC */ |
139 | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_BIDICL */ |
140 | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } /* PT_BOOL */ |
141 | }; |
142 | |
143 | /* This table is used to check whether auto-possessification is possible |
144 | between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one |
145 | specifies a general category and the other specifies a particular category. The |
146 | row is selected by the general category and the column by the particular |
147 | category. The value is 1 if the particular category is not part of the general |
148 | category. */ |
149 | |
150 | static const uint8_t catposstab[7][30] = { |
151 | /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */ |
152 | { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */ |
153 | { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */ |
154 | { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */ |
155 | { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */ |
156 | { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */ |
157 | { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */ |
158 | { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */ |
159 | }; |
160 | |
161 | /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against |
162 | a general or particular category. The properties in each row are those |
163 | that apply to the character set in question. Duplication means that a little |
164 | unnecessary work is done when checking, but this keeps things much simpler |
165 | because they can all use the same code. For more details see the comment where |
166 | this table is used. |
167 | |
168 | Note: SPACE and PXSPACE used to be different because Perl excluded VT from |
169 | "space", but from Perl 5.18 it's included, so both categories are treated the |
170 | same here. */ |
171 | |
172 | static const uint8_t posspropstab[3][4] = { |
173 | { ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */ |
174 | { ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */ |
175 | { ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */ |
176 | }; |
177 | #endif /* SUPPORT_UNICODE */ |
178 | |
179 | |
180 | |
181 | #ifdef SUPPORT_UNICODE |
182 | /************************************************* |
183 | * Check a character and a property * |
184 | *************************************************/ |
185 | |
186 | /* This function is called by compare_opcodes() when a property item is |
187 | adjacent to a fixed character. |
188 | |
189 | Arguments: |
190 | c the character |
191 | ptype the property type |
192 | pdata the data for the type |
193 | negated TRUE if it's a negated property (\P or \p{^) |
194 | |
195 | Returns: TRUE if auto-possessifying is OK |
196 | */ |
197 | |
198 | static BOOL |
199 | check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata, |
200 | BOOL negated) |
201 | { |
202 | BOOL ok; |
203 | const uint32_t *p; |
204 | const ucd_record *prop = GET_UCD(c); |
205 | |
206 | switch(ptype) |
207 | { |
208 | case PT_LAMP: |
209 | return (prop->chartype == ucp_Lu || |
210 | prop->chartype == ucp_Ll || |
211 | prop->chartype == ucp_Lt) == negated; |
212 | |
213 | case PT_GC: |
214 | return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated; |
215 | |
216 | case PT_PC: |
217 | return (pdata == prop->chartype) == negated; |
218 | |
219 | case PT_SC: |
220 | return (pdata == prop->script) == negated; |
221 | |
222 | case PT_SCX: |
223 | ok = (pdata == prop->script |
224 | || MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0); |
225 | return ok == negated; |
226 | |
227 | /* These are specials */ |
228 | |
229 | case PT_ALNUM: |
230 | return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || |
231 | PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated; |
232 | |
233 | /* Perl space used to exclude VT, but from Perl 5.18 it is included, which |
234 | means that Perl space and POSIX space are now identical. PCRE was changed |
235 | at release 8.34. */ |
236 | |
237 | case PT_SPACE: /* Perl space */ |
238 | case PT_PXSPACE: /* POSIX space */ |
239 | switch(c) |
240 | { |
241 | HSPACE_CASES: |
242 | VSPACE_CASES: |
243 | return negated; |
244 | |
245 | default: |
246 | return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated; |
247 | } |
248 | break; /* Control never reaches here */ |
249 | |
250 | case PT_WORD: |
251 | return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || |
252 | PRIV(ucp_gentype)[prop->chartype] == ucp_N || |
253 | c == CHAR_UNDERSCORE) == negated; |
254 | |
255 | case PT_CLIST: |
256 | p = PRIV(ucd_caseless_sets) + prop->caseset; |
257 | for (;;) |
258 | { |
259 | if (c < *p) return !negated; |
260 | if (c == *p++) return negated; |
261 | } |
262 | break; /* Control never reaches here */ |
263 | |
264 | /* Haven't yet thought these through. */ |
265 | |
266 | case PT_BIDICL: |
267 | return FALSE; |
268 | |
269 | case PT_BOOL: |
270 | return FALSE; |
271 | } |
272 | |
273 | return FALSE; |
274 | } |
275 | #endif /* SUPPORT_UNICODE */ |
276 | |
277 | |
278 | |
279 | /************************************************* |
280 | * Base opcode of repeated opcodes * |
281 | *************************************************/ |
282 | |
283 | /* Returns the base opcode for repeated single character type opcodes. If the |
284 | opcode is not a repeated character type, it returns with the original value. |
285 | |
286 | Arguments: c opcode |
287 | Returns: base opcode for the type |
288 | */ |
289 | |
290 | static PCRE2_UCHAR |
291 | get_repeat_base(PCRE2_UCHAR c) |
292 | { |
293 | return (c > OP_TYPEPOSUPTO)? c : |
294 | (c >= OP_TYPESTAR)? OP_TYPESTAR : |
295 | (c >= OP_NOTSTARI)? OP_NOTSTARI : |
296 | (c >= OP_NOTSTAR)? OP_NOTSTAR : |
297 | (c >= OP_STARI)? OP_STARI : |
298 | OP_STAR; |
299 | } |
300 | |
301 | |
302 | /************************************************* |
303 | * Fill the character property list * |
304 | *************************************************/ |
305 | |
306 | /* Checks whether the code points to an opcode that can take part in auto- |
307 | possessification, and if so, fills a list with its properties. |
308 | |
309 | Arguments: |
310 | code points to start of expression |
311 | utf TRUE if in UTF mode |
312 | ucp TRUE if in UCP mode |
313 | fcc points to the case-flipping table |
314 | list points to output list |
315 | list[0] will be filled with the opcode |
316 | list[1] will be non-zero if this opcode |
317 | can match an empty character string |
318 | list[2..7] depends on the opcode |
319 | |
320 | Returns: points to the start of the next opcode if *code is accepted |
321 | NULL if *code is not accepted |
322 | */ |
323 | |
324 | static PCRE2_SPTR |
325 | get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc, |
326 | uint32_t *list) |
327 | { |
328 | PCRE2_UCHAR c = *code; |
329 | PCRE2_UCHAR base; |
330 | PCRE2_SPTR end; |
331 | uint32_t chr; |
332 | |
333 | #ifdef SUPPORT_UNICODE |
334 | uint32_t *clist_dest; |
335 | const uint32_t *clist_src; |
336 | #else |
337 | (void)utf; /* Suppress "unused parameter" compiler warnings */ |
338 | (void)ucp; |
339 | #endif |
340 | |
341 | list[0] = c; |
342 | list[1] = FALSE; |
343 | code++; |
344 | |
345 | if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) |
346 | { |
347 | base = get_repeat_base(c); |
348 | c -= (base - OP_STAR); |
349 | |
350 | if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO) |
351 | code += IMM2_SIZE; |
352 | |
353 | list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && |
354 | c != OP_POSPLUS); |
355 | |
356 | switch(base) |
357 | { |
358 | case OP_STAR: |
359 | list[0] = OP_CHAR; |
360 | break; |
361 | |
362 | case OP_STARI: |
363 | list[0] = OP_CHARI; |
364 | break; |
365 | |
366 | case OP_NOTSTAR: |
367 | list[0] = OP_NOT; |
368 | break; |
369 | |
370 | case OP_NOTSTARI: |
371 | list[0] = OP_NOTI; |
372 | break; |
373 | |
374 | case OP_TYPESTAR: |
375 | list[0] = *code; |
376 | code++; |
377 | break; |
378 | } |
379 | c = list[0]; |
380 | } |
381 | |
382 | switch(c) |
383 | { |
384 | case OP_NOT_DIGIT: |
385 | case OP_DIGIT: |
386 | case OP_NOT_WHITESPACE: |
387 | case OP_WHITESPACE: |
388 | case OP_NOT_WORDCHAR: |
389 | case OP_WORDCHAR: |
390 | case OP_ANY: |
391 | case OP_ALLANY: |
392 | case OP_ANYNL: |
393 | case OP_NOT_HSPACE: |
394 | case OP_HSPACE: |
395 | case OP_NOT_VSPACE: |
396 | case OP_VSPACE: |
397 | case OP_EXTUNI: |
398 | case OP_EODN: |
399 | case OP_EOD: |
400 | case OP_DOLL: |
401 | case OP_DOLLM: |
402 | return code; |
403 | |
404 | case OP_CHAR: |
405 | case OP_NOT: |
406 | GETCHARINCTEST(chr, code); |
407 | list[2] = chr; |
408 | list[3] = NOTACHAR; |
409 | return code; |
410 | |
411 | case OP_CHARI: |
412 | case OP_NOTI: |
413 | list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT; |
414 | GETCHARINCTEST(chr, code); |
415 | list[2] = chr; |
416 | |
417 | #ifdef SUPPORT_UNICODE |
418 | if (chr < 128 || (chr < 256 && !utf && !ucp)) |
419 | list[3] = fcc[chr]; |
420 | else |
421 | list[3] = UCD_OTHERCASE(chr); |
422 | #elif defined SUPPORT_WIDE_CHARS |
423 | list[3] = (chr < 256) ? fcc[chr] : chr; |
424 | #else |
425 | list[3] = fcc[chr]; |
426 | #endif |
427 | |
428 | /* The othercase might be the same value. */ |
429 | |
430 | if (chr == list[3]) |
431 | list[3] = NOTACHAR; |
432 | else |
433 | list[4] = NOTACHAR; |
434 | return code; |
435 | |
436 | #ifdef SUPPORT_UNICODE |
437 | case OP_PROP: |
438 | case OP_NOTPROP: |
439 | if (code[0] != PT_CLIST) |
440 | { |
441 | list[2] = code[0]; |
442 | list[3] = code[1]; |
443 | return code + 2; |
444 | } |
445 | |
446 | /* Convert only if we have enough space. */ |
447 | |
448 | clist_src = PRIV(ucd_caseless_sets) + code[1]; |
449 | clist_dest = list + 2; |
450 | code += 2; |
451 | |
452 | do { |
453 | if (clist_dest >= list + 8) |
454 | { |
455 | /* Early return if there is not enough space. This should never |
456 | happen, since all clists are shorter than 5 character now. */ |
457 | list[2] = code[0]; |
458 | list[3] = code[1]; |
459 | return code; |
460 | } |
461 | *clist_dest++ = *clist_src; |
462 | } |
463 | while(*clist_src++ != NOTACHAR); |
464 | |
465 | /* All characters are stored. The terminating NOTACHAR is copied from the |
466 | clist itself. */ |
467 | |
468 | list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT; |
469 | return code; |
470 | #endif |
471 | |
472 | case OP_NCLASS: |
473 | case OP_CLASS: |
474 | #ifdef SUPPORT_WIDE_CHARS |
475 | case OP_XCLASS: |
476 | if (c == OP_XCLASS) |
477 | end = code + GET(code, 0) - 1; |
478 | else |
479 | #endif |
480 | end = code + 32 / sizeof(PCRE2_UCHAR); |
481 | |
482 | switch(*end) |
483 | { |
484 | case OP_CRSTAR: |
485 | case OP_CRMINSTAR: |
486 | case OP_CRQUERY: |
487 | case OP_CRMINQUERY: |
488 | case OP_CRPOSSTAR: |
489 | case OP_CRPOSQUERY: |
490 | list[1] = TRUE; |
491 | end++; |
492 | break; |
493 | |
494 | case OP_CRPLUS: |
495 | case OP_CRMINPLUS: |
496 | case OP_CRPOSPLUS: |
497 | end++; |
498 | break; |
499 | |
500 | case OP_CRRANGE: |
501 | case OP_CRMINRANGE: |
502 | case OP_CRPOSRANGE: |
503 | list[1] = (GET2(end, 1) == 0); |
504 | end += 1 + 2 * IMM2_SIZE; |
505 | break; |
506 | } |
507 | list[2] = (uint32_t)(end - code); |
508 | return end; |
509 | } |
510 | |
511 | return NULL; /* Opcode not accepted */ |
512 | } |
513 | |
514 | |
515 | |
516 | /************************************************* |
517 | * Scan further character sets for match * |
518 | *************************************************/ |
519 | |
520 | /* Checks whether the base and the current opcode have a common character, in |
521 | which case the base cannot be possessified. |
522 | |
523 | Arguments: |
524 | code points to the byte code |
525 | utf TRUE in UTF mode |
526 | ucp TRUE in UCP mode |
527 | cb compile data block |
528 | base_list the data list of the base opcode |
529 | base_end the end of the base opcode |
530 | rec_limit points to recursion depth counter |
531 | |
532 | Returns: TRUE if the auto-possessification is possible |
533 | */ |
534 | |
535 | static BOOL |
536 | compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb, |
537 | const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit) |
538 | { |
539 | PCRE2_UCHAR c; |
540 | uint32_t list[8]; |
541 | const uint32_t *chr_ptr; |
542 | const uint32_t *ochr_ptr; |
543 | const uint32_t *list_ptr; |
544 | PCRE2_SPTR next_code; |
545 | #ifdef SUPPORT_WIDE_CHARS |
546 | PCRE2_SPTR xclass_flags; |
547 | #endif |
548 | const uint8_t *class_bitset; |
549 | const uint8_t *set1, *set2, *set_end; |
550 | uint32_t chr; |
551 | BOOL accepted, invert_bits; |
552 | BOOL entered_a_group = FALSE; |
553 | |
554 | if (--(*rec_limit) <= 0) return FALSE; /* Recursion has gone too deep */ |
555 | |
556 | /* Note: the base_list[1] contains whether the current opcode has a greedy |
557 | (represented by a non-zero value) quantifier. This is a different from |
558 | other character type lists, which store here that the character iterator |
559 | matches to an empty string (also represented by a non-zero value). */ |
560 | |
561 | for(;;) |
562 | { |
563 | /* All operations move the code pointer forward. |
564 | Therefore infinite recursions are not possible. */ |
565 | |
566 | c = *code; |
567 | |
568 | /* Skip over callouts */ |
569 | |
570 | if (c == OP_CALLOUT) |
571 | { |
572 | code += PRIV(OP_lengths)[c]; |
573 | continue; |
574 | } |
575 | |
576 | if (c == OP_CALLOUT_STR) |
577 | { |
578 | code += GET(code, 1 + 2*LINK_SIZE); |
579 | continue; |
580 | } |
581 | |
582 | /* At the end of a branch, skip to the end of the group. */ |
583 | |
584 | if (c == OP_ALT) |
585 | { |
586 | do code += GET(code, 1); while (*code == OP_ALT); |
587 | c = *code; |
588 | } |
589 | |
590 | /* Inspect the next opcode. */ |
591 | |
592 | switch(c) |
593 | { |
594 | /* We can always possessify a greedy iterator at the end of the pattern, |
595 | which is reached after skipping over the final OP_KET. A non-greedy |
596 | iterator must never be possessified. */ |
597 | |
598 | case OP_END: |
599 | return base_list[1] != 0; |
600 | |
601 | /* When an iterator is at the end of certain kinds of group we can inspect |
602 | what follows the group by skipping over the closing ket. Note that this |
603 | does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given |
604 | iteration is variable (could be another iteration or could be the next |
605 | item). As these two opcodes are not listed in the next switch, they will |
606 | end up as the next code to inspect, and return FALSE by virtue of being |
607 | unsupported. */ |
608 | |
609 | case OP_KET: |
610 | case OP_KETRPOS: |
611 | /* The non-greedy case cannot be converted to a possessive form. */ |
612 | |
613 | if (base_list[1] == 0) return FALSE; |
614 | |
615 | /* If the bracket is capturing it might be referenced by an OP_RECURSE |
616 | so its last iterator can never be possessified if the pattern contains |
617 | recursions. (This could be improved by keeping a list of group numbers that |
618 | are called by recursion.) */ |
619 | |
620 | switch(*(code - GET(code, 1))) |
621 | { |
622 | case OP_CBRA: |
623 | case OP_SCBRA: |
624 | case OP_CBRAPOS: |
625 | case OP_SCBRAPOS: |
626 | if (cb->had_recurse) return FALSE; |
627 | break; |
628 | |
629 | /* A script run might have to backtrack if the iterated item can match |
630 | characters from more than one script. So give up unless repeating an |
631 | explicit character. */ |
632 | |
633 | case OP_SCRIPT_RUN: |
634 | if (base_list[0] != OP_CHAR && base_list[0] != OP_CHARI) |
635 | return FALSE; |
636 | break; |
637 | |
638 | /* Atomic sub-patterns and assertions can always auto-possessify their |
639 | last iterator. However, if the group was entered as a result of checking |
640 | a previous iterator, this is not possible. */ |
641 | |
642 | case OP_ASSERT: |
643 | case OP_ASSERT_NOT: |
644 | case OP_ASSERTBACK: |
645 | case OP_ASSERTBACK_NOT: |
646 | case OP_ONCE: |
647 | return !entered_a_group; |
648 | |
649 | /* Non-atomic assertions - don't possessify last iterator. This needs |
650 | more thought. */ |
651 | |
652 | case OP_ASSERT_NA: |
653 | case OP_ASSERTBACK_NA: |
654 | return FALSE; |
655 | } |
656 | |
657 | /* Skip over the bracket and inspect what comes next. */ |
658 | |
659 | code += PRIV(OP_lengths)[c]; |
660 | continue; |
661 | |
662 | /* Handle cases where the next item is a group. */ |
663 | |
664 | case OP_ONCE: |
665 | case OP_BRA: |
666 | case OP_CBRA: |
667 | next_code = code + GET(code, 1); |
668 | code += PRIV(OP_lengths)[c]; |
669 | |
670 | /* Check each branch. We have to recurse a level for all but the last |
671 | branch. */ |
672 | |
673 | while (*next_code == OP_ALT) |
674 | { |
675 | if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit)) |
676 | return FALSE; |
677 | code = next_code + 1 + LINK_SIZE; |
678 | next_code += GET(next_code, 1); |
679 | } |
680 | |
681 | entered_a_group = TRUE; |
682 | continue; |
683 | |
684 | case OP_BRAZERO: |
685 | case OP_BRAMINZERO: |
686 | |
687 | next_code = code + 1; |
688 | if (*next_code != OP_BRA && *next_code != OP_CBRA && |
689 | *next_code != OP_ONCE) return FALSE; |
690 | |
691 | do next_code += GET(next_code, 1); while (*next_code == OP_ALT); |
692 | |
693 | /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */ |
694 | |
695 | next_code += 1 + LINK_SIZE; |
696 | if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end, |
697 | rec_limit)) |
698 | return FALSE; |
699 | |
700 | code += PRIV(OP_lengths)[c]; |
701 | continue; |
702 | |
703 | /* The next opcode does not need special handling; fall through and use it |
704 | to see if the base can be possessified. */ |
705 | |
706 | default: |
707 | break; |
708 | } |
709 | |
710 | /* We now have the next appropriate opcode to compare with the base. Check |
711 | for a supported opcode, and load its properties. */ |
712 | |
713 | code = get_chr_property_list(code, utf, ucp, cb->fcc, list); |
714 | if (code == NULL) return FALSE; /* Unsupported */ |
715 | |
716 | /* If either opcode is a small character list, set pointers for comparing |
717 | characters from that list with another list, or with a property. */ |
718 | |
719 | if (base_list[0] == OP_CHAR) |
720 | { |
721 | chr_ptr = base_list + 2; |
722 | list_ptr = list; |
723 | } |
724 | else if (list[0] == OP_CHAR) |
725 | { |
726 | chr_ptr = list + 2; |
727 | list_ptr = base_list; |
728 | } |
729 | |
730 | /* Character bitsets can also be compared to certain opcodes. */ |
731 | |
732 | else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS |
733 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
734 | /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */ |
735 | || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS)) |
736 | #endif |
737 | ) |
738 | { |
739 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
740 | if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS)) |
741 | #else |
742 | if (base_list[0] == OP_CLASS) |
743 | #endif |
744 | { |
745 | set1 = (uint8_t *)(base_end - base_list[2]); |
746 | list_ptr = list; |
747 | } |
748 | else |
749 | { |
750 | set1 = (uint8_t *)(code - list[2]); |
751 | list_ptr = base_list; |
752 | } |
753 | |
754 | invert_bits = FALSE; |
755 | switch(list_ptr[0]) |
756 | { |
757 | case OP_CLASS: |
758 | case OP_NCLASS: |
759 | set2 = (uint8_t *) |
760 | ((list_ptr == list ? code : base_end) - list_ptr[2]); |
761 | break; |
762 | |
763 | #ifdef SUPPORT_WIDE_CHARS |
764 | case OP_XCLASS: |
765 | xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE; |
766 | if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE; |
767 | if ((*xclass_flags & XCL_MAP) == 0) |
768 | { |
769 | /* No bits are set for characters < 256. */ |
770 | if (list[1] == 0) return (*xclass_flags & XCL_NOT) == 0; |
771 | /* Might be an empty repeat. */ |
772 | continue; |
773 | } |
774 | set2 = (uint8_t *)(xclass_flags + 1); |
775 | break; |
776 | #endif |
777 | |
778 | case OP_NOT_DIGIT: |
779 | invert_bits = TRUE; |
780 | /* Fall through */ |
781 | case OP_DIGIT: |
782 | set2 = (uint8_t *)(cb->cbits + cbit_digit); |
783 | break; |
784 | |
785 | case OP_NOT_WHITESPACE: |
786 | invert_bits = TRUE; |
787 | /* Fall through */ |
788 | case OP_WHITESPACE: |
789 | set2 = (uint8_t *)(cb->cbits + cbit_space); |
790 | break; |
791 | |
792 | case OP_NOT_WORDCHAR: |
793 | invert_bits = TRUE; |
794 | /* Fall through */ |
795 | case OP_WORDCHAR: |
796 | set2 = (uint8_t *)(cb->cbits + cbit_word); |
797 | break; |
798 | |
799 | default: |
800 | return FALSE; |
801 | } |
802 | |
803 | /* Because the bit sets are unaligned bytes, we need to perform byte |
804 | comparison here. */ |
805 | |
806 | set_end = set1 + 32; |
807 | if (invert_bits) |
808 | { |
809 | do |
810 | { |
811 | if ((*set1++ & ~(*set2++)) != 0) return FALSE; |
812 | } |
813 | while (set1 < set_end); |
814 | } |
815 | else |
816 | { |
817 | do |
818 | { |
819 | if ((*set1++ & *set2++) != 0) return FALSE; |
820 | } |
821 | while (set1 < set_end); |
822 | } |
823 | |
824 | if (list[1] == 0) return TRUE; |
825 | /* Might be an empty repeat. */ |
826 | continue; |
827 | } |
828 | |
829 | /* Some property combinations also acceptable. Unicode property opcodes are |
830 | processed specially; the rest can be handled with a lookup table. */ |
831 | |
832 | else |
833 | { |
834 | uint32_t leftop, rightop; |
835 | |
836 | leftop = base_list[0]; |
837 | rightop = list[0]; |
838 | |
839 | #ifdef SUPPORT_UNICODE |
840 | accepted = FALSE; /* Always set in non-unicode case. */ |
841 | if (leftop == OP_PROP || leftop == OP_NOTPROP) |
842 | { |
843 | if (rightop == OP_EOD) |
844 | accepted = TRUE; |
845 | else if (rightop == OP_PROP || rightop == OP_NOTPROP) |
846 | { |
847 | int n; |
848 | const uint8_t *p; |
849 | BOOL same = leftop == rightop; |
850 | BOOL lisprop = leftop == OP_PROP; |
851 | BOOL risprop = rightop == OP_PROP; |
852 | BOOL bothprop = lisprop && risprop; |
853 | |
854 | /* There's a table that specifies how each combination is to be |
855 | processed: |
856 | 0 Always return FALSE (never auto-possessify) |
857 | 1 Character groups are distinct (possessify if both are OP_PROP) |
858 | 2 Check character categories in the same group (general or particular) |
859 | 3 Return TRUE if the two opcodes are not the same |
860 | ... see comments below |
861 | */ |
862 | |
863 | n = propposstab[base_list[2]][list[2]]; |
864 | switch(n) |
865 | { |
866 | case 0: break; |
867 | case 1: accepted = bothprop; break; |
868 | case 2: accepted = (base_list[3] == list[3]) != same; break; |
869 | case 3: accepted = !same; break; |
870 | |
871 | case 4: /* Left general category, right particular category */ |
872 | accepted = risprop && catposstab[base_list[3]][list[3]] == same; |
873 | break; |
874 | |
875 | case 5: /* Right general category, left particular category */ |
876 | accepted = lisprop && catposstab[list[3]][base_list[3]] == same; |
877 | break; |
878 | |
879 | /* This code is logically tricky. Think hard before fiddling with it. |
880 | The posspropstab table has four entries per row. Each row relates to |
881 | one of PCRE's special properties such as ALNUM or SPACE or WORD. |
882 | Only WORD actually needs all four entries, but using repeats for the |
883 | others means they can all use the same code below. |
884 | |
885 | The first two entries in each row are Unicode general categories, and |
886 | apply always, because all the characters they include are part of the |
887 | PCRE character set. The third and fourth entries are a general and a |
888 | particular category, respectively, that include one or more relevant |
889 | characters. One or the other is used, depending on whether the check |
890 | is for a general or a particular category. However, in both cases the |
891 | category contains more characters than the specials that are defined |
892 | for the property being tested against. Therefore, it cannot be used |
893 | in a NOTPROP case. |
894 | |
895 | Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po. |
896 | Underscore is covered by ucp_P or ucp_Po. */ |
897 | |
898 | case 6: /* Left alphanum vs right general category */ |
899 | case 7: /* Left space vs right general category */ |
900 | case 8: /* Left word vs right general category */ |
901 | p = posspropstab[n-6]; |
902 | accepted = risprop && lisprop == |
903 | (list[3] != p[0] && |
904 | list[3] != p[1] && |
905 | (list[3] != p[2] || !lisprop)); |
906 | break; |
907 | |
908 | case 9: /* Right alphanum vs left general category */ |
909 | case 10: /* Right space vs left general category */ |
910 | case 11: /* Right word vs left general category */ |
911 | p = posspropstab[n-9]; |
912 | accepted = lisprop && risprop == |
913 | (base_list[3] != p[0] && |
914 | base_list[3] != p[1] && |
915 | (base_list[3] != p[2] || !risprop)); |
916 | break; |
917 | |
918 | case 12: /* Left alphanum vs right particular category */ |
919 | case 13: /* Left space vs right particular category */ |
920 | case 14: /* Left word vs right particular category */ |
921 | p = posspropstab[n-12]; |
922 | accepted = risprop && lisprop == |
923 | (catposstab[p[0]][list[3]] && |
924 | catposstab[p[1]][list[3]] && |
925 | (list[3] != p[3] || !lisprop)); |
926 | break; |
927 | |
928 | case 15: /* Right alphanum vs left particular category */ |
929 | case 16: /* Right space vs left particular category */ |
930 | case 17: /* Right word vs left particular category */ |
931 | p = posspropstab[n-15]; |
932 | accepted = lisprop && risprop == |
933 | (catposstab[p[0]][base_list[3]] && |
934 | catposstab[p[1]][base_list[3]] && |
935 | (base_list[3] != p[3] || !risprop)); |
936 | break; |
937 | } |
938 | } |
939 | } |
940 | |
941 | else |
942 | #endif /* SUPPORT_UNICODE */ |
943 | |
944 | accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP && |
945 | rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP && |
946 | autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP]; |
947 | |
948 | if (!accepted) return FALSE; |
949 | |
950 | if (list[1] == 0) return TRUE; |
951 | /* Might be an empty repeat. */ |
952 | continue; |
953 | } |
954 | |
955 | /* Control reaches here only if one of the items is a small character list. |
956 | All characters are checked against the other side. */ |
957 | |
958 | do |
959 | { |
960 | chr = *chr_ptr; |
961 | |
962 | switch(list_ptr[0]) |
963 | { |
964 | case OP_CHAR: |
965 | ochr_ptr = list_ptr + 2; |
966 | do |
967 | { |
968 | if (chr == *ochr_ptr) return FALSE; |
969 | ochr_ptr++; |
970 | } |
971 | while(*ochr_ptr != NOTACHAR); |
972 | break; |
973 | |
974 | case OP_NOT: |
975 | ochr_ptr = list_ptr + 2; |
976 | do |
977 | { |
978 | if (chr == *ochr_ptr) |
979 | break; |
980 | ochr_ptr++; |
981 | } |
982 | while(*ochr_ptr != NOTACHAR); |
983 | if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */ |
984 | break; |
985 | |
986 | /* Note that OP_DIGIT etc. are generated only when PCRE2_UCP is *not* |
987 | set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ |
988 | |
989 | case OP_DIGIT: |
990 | if (chr < 256 && (cb->ctypes[chr] & ctype_digit) != 0) return FALSE; |
991 | break; |
992 | |
993 | case OP_NOT_DIGIT: |
994 | if (chr > 255 || (cb->ctypes[chr] & ctype_digit) == 0) return FALSE; |
995 | break; |
996 | |
997 | case OP_WHITESPACE: |
998 | if (chr < 256 && (cb->ctypes[chr] & ctype_space) != 0) return FALSE; |
999 | break; |
1000 | |
1001 | case OP_NOT_WHITESPACE: |
1002 | if (chr > 255 || (cb->ctypes[chr] & ctype_space) == 0) return FALSE; |
1003 | break; |
1004 | |
1005 | case OP_WORDCHAR: |
1006 | if (chr < 255 && (cb->ctypes[chr] & ctype_word) != 0) return FALSE; |
1007 | break; |
1008 | |
1009 | case OP_NOT_WORDCHAR: |
1010 | if (chr > 255 || (cb->ctypes[chr] & ctype_word) == 0) return FALSE; |
1011 | break; |
1012 | |
1013 | case OP_HSPACE: |
1014 | switch(chr) |
1015 | { |
1016 | HSPACE_CASES: return FALSE; |
1017 | default: break; |
1018 | } |
1019 | break; |
1020 | |
1021 | case OP_NOT_HSPACE: |
1022 | switch(chr) |
1023 | { |
1024 | HSPACE_CASES: break; |
1025 | default: return FALSE; |
1026 | } |
1027 | break; |
1028 | |
1029 | case OP_ANYNL: |
1030 | case OP_VSPACE: |
1031 | switch(chr) |
1032 | { |
1033 | VSPACE_CASES: return FALSE; |
1034 | default: break; |
1035 | } |
1036 | break; |
1037 | |
1038 | case OP_NOT_VSPACE: |
1039 | switch(chr) |
1040 | { |
1041 | VSPACE_CASES: break; |
1042 | default: return FALSE; |
1043 | } |
1044 | break; |
1045 | |
1046 | case OP_DOLL: |
1047 | case OP_EODN: |
1048 | switch (chr) |
1049 | { |
1050 | case CHAR_CR: |
1051 | case CHAR_LF: |
1052 | case CHAR_VT: |
1053 | case CHAR_FF: |
1054 | case CHAR_NEL: |
1055 | #ifndef EBCDIC |
1056 | case 0x2028: |
1057 | case 0x2029: |
1058 | #endif /* Not EBCDIC */ |
1059 | return FALSE; |
1060 | } |
1061 | break; |
1062 | |
1063 | case OP_EOD: /* Can always possessify before \z */ |
1064 | break; |
1065 | |
1066 | #ifdef SUPPORT_UNICODE |
1067 | case OP_PROP: |
1068 | case OP_NOTPROP: |
1069 | if (!check_char_prop(chr, list_ptr[2], list_ptr[3], |
1070 | list_ptr[0] == OP_NOTPROP)) |
1071 | return FALSE; |
1072 | break; |
1073 | #endif |
1074 | |
1075 | case OP_NCLASS: |
1076 | if (chr > 255) return FALSE; |
1077 | /* Fall through */ |
1078 | |
1079 | case OP_CLASS: |
1080 | if (chr > 255) break; |
1081 | class_bitset = (uint8_t *) |
1082 | ((list_ptr == list ? code : base_end) - list_ptr[2]); |
1083 | if ((class_bitset[chr >> 3] & (1u << (chr & 7))) != 0) return FALSE; |
1084 | break; |
1085 | |
1086 | #ifdef SUPPORT_WIDE_CHARS |
1087 | case OP_XCLASS: |
1088 | if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) - |
1089 | list_ptr[2] + LINK_SIZE, utf)) return FALSE; |
1090 | break; |
1091 | #endif |
1092 | |
1093 | default: |
1094 | return FALSE; |
1095 | } |
1096 | |
1097 | chr_ptr++; |
1098 | } |
1099 | while(*chr_ptr != NOTACHAR); |
1100 | |
1101 | /* At least one character must be matched from this opcode. */ |
1102 | |
1103 | if (list[1] == 0) return TRUE; |
1104 | } |
1105 | |
1106 | /* Control never reaches here. There used to be a fail-save return FALSE; here, |
1107 | but some compilers complain about an unreachable statement. */ |
1108 | } |
1109 | |
1110 | |
1111 | |
1112 | /************************************************* |
1113 | * Scan compiled regex for auto-possession * |
1114 | *************************************************/ |
1115 | |
1116 | /* Replaces single character iterations with their possessive alternatives |
1117 | if appropriate. This function modifies the compiled opcode! Hitting a |
1118 | non-existent opcode may indicate a bug in PCRE2, but it can also be caused if a |
1119 | bad UTF string was compiled with PCRE2_NO_UTF_CHECK. The rec_limit catches |
1120 | overly complicated or large patterns. In these cases, the check just stops, |
1121 | leaving the remainder of the pattern unpossessified. |
1122 | |
1123 | Arguments: |
1124 | code points to start of the byte code |
1125 | cb compile data block |
1126 | |
1127 | Returns: 0 for success |
1128 | -1 if a non-existant opcode is encountered |
1129 | */ |
1130 | |
1131 | int |
1132 | PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb) |
1133 | { |
1134 | PCRE2_UCHAR c; |
1135 | PCRE2_SPTR end; |
1136 | PCRE2_UCHAR *repeat_opcode; |
1137 | uint32_t list[8]; |
1138 | int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */ |
1139 | BOOL utf = (cb->external_options & PCRE2_UTF) != 0; |
1140 | BOOL ucp = (cb->external_options & PCRE2_UCP) != 0; |
1141 | |
1142 | for (;;) |
1143 | { |
1144 | c = *code; |
1145 | |
1146 | if (c >= OP_TABLE_LENGTH) return -1; /* Something gone wrong */ |
1147 | |
1148 | if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) |
1149 | { |
1150 | c -= get_repeat_base(c) - OP_STAR; |
1151 | end = (c <= OP_MINUPTO) ? |
1152 | get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL; |
1153 | list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO; |
1154 | |
1155 | if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end, |
1156 | &rec_limit)) |
1157 | { |
1158 | switch(c) |
1159 | { |
1160 | case OP_STAR: |
1161 | *code += OP_POSSTAR - OP_STAR; |
1162 | break; |
1163 | |
1164 | case OP_MINSTAR: |
1165 | *code += OP_POSSTAR - OP_MINSTAR; |
1166 | break; |
1167 | |
1168 | case OP_PLUS: |
1169 | *code += OP_POSPLUS - OP_PLUS; |
1170 | break; |
1171 | |
1172 | case OP_MINPLUS: |
1173 | *code += OP_POSPLUS - OP_MINPLUS; |
1174 | break; |
1175 | |
1176 | case OP_QUERY: |
1177 | *code += OP_POSQUERY - OP_QUERY; |
1178 | break; |
1179 | |
1180 | case OP_MINQUERY: |
1181 | *code += OP_POSQUERY - OP_MINQUERY; |
1182 | break; |
1183 | |
1184 | case OP_UPTO: |
1185 | *code += OP_POSUPTO - OP_UPTO; |
1186 | break; |
1187 | |
1188 | case OP_MINUPTO: |
1189 | *code += OP_POSUPTO - OP_MINUPTO; |
1190 | break; |
1191 | } |
1192 | } |
1193 | c = *code; |
1194 | } |
1195 | else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS) |
1196 | { |
1197 | #ifdef SUPPORT_WIDE_CHARS |
1198 | if (c == OP_XCLASS) |
1199 | repeat_opcode = code + GET(code, 1); |
1200 | else |
1201 | #endif |
1202 | repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR)); |
1203 | |
1204 | c = *repeat_opcode; |
1205 | if (c >= OP_CRSTAR && c <= OP_CRMINRANGE) |
1206 | { |
1207 | /* The return from get_chr_property_list() will never be NULL when |
1208 | *code (aka c) is one of the three class opcodes. However, gcc with |
1209 | -fanalyzer notes that a NULL return is possible, and grumbles. Hence we |
1210 | put in a check. */ |
1211 | |
1212 | end = get_chr_property_list(code, utf, ucp, cb->fcc, list); |
1213 | list[1] = (c & 1) == 0; |
1214 | |
1215 | if (end != NULL && |
1216 | compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit)) |
1217 | { |
1218 | switch (c) |
1219 | { |
1220 | case OP_CRSTAR: |
1221 | case OP_CRMINSTAR: |
1222 | *repeat_opcode = OP_CRPOSSTAR; |
1223 | break; |
1224 | |
1225 | case OP_CRPLUS: |
1226 | case OP_CRMINPLUS: |
1227 | *repeat_opcode = OP_CRPOSPLUS; |
1228 | break; |
1229 | |
1230 | case OP_CRQUERY: |
1231 | case OP_CRMINQUERY: |
1232 | *repeat_opcode = OP_CRPOSQUERY; |
1233 | break; |
1234 | |
1235 | case OP_CRRANGE: |
1236 | case OP_CRMINRANGE: |
1237 | *repeat_opcode = OP_CRPOSRANGE; |
1238 | break; |
1239 | } |
1240 | } |
1241 | } |
1242 | c = *code; |
1243 | } |
1244 | |
1245 | switch(c) |
1246 | { |
1247 | case OP_END: |
1248 | return 0; |
1249 | |
1250 | case OP_TYPESTAR: |
1251 | case OP_TYPEMINSTAR: |
1252 | case OP_TYPEPLUS: |
1253 | case OP_TYPEMINPLUS: |
1254 | case OP_TYPEQUERY: |
1255 | case OP_TYPEMINQUERY: |
1256 | case OP_TYPEPOSSTAR: |
1257 | case OP_TYPEPOSPLUS: |
1258 | case OP_TYPEPOSQUERY: |
1259 | if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; |
1260 | break; |
1261 | |
1262 | case OP_TYPEUPTO: |
1263 | case OP_TYPEMINUPTO: |
1264 | case OP_TYPEEXACT: |
1265 | case OP_TYPEPOSUPTO: |
1266 | if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) |
1267 | code += 2; |
1268 | break; |
1269 | |
1270 | case OP_CALLOUT_STR: |
1271 | code += GET(code, 1 + 2*LINK_SIZE); |
1272 | break; |
1273 | |
1274 | #ifdef SUPPORT_WIDE_CHARS |
1275 | case OP_XCLASS: |
1276 | code += GET(code, 1); |
1277 | break; |
1278 | #endif |
1279 | |
1280 | case OP_MARK: |
1281 | case OP_COMMIT_ARG: |
1282 | case OP_PRUNE_ARG: |
1283 | case OP_SKIP_ARG: |
1284 | case OP_THEN_ARG: |
1285 | code += code[1]; |
1286 | break; |
1287 | } |
1288 | |
1289 | /* Add in the fixed length from the table */ |
1290 | |
1291 | code += PRIV(OP_lengths)[c]; |
1292 | |
1293 | /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be |
1294 | followed by a multi-byte character. The length in the table is a minimum, so |
1295 | we have to arrange to skip the extra code units. */ |
1296 | |
1297 | #ifdef MAYBE_UTF_MULTI |
1298 | if (utf) switch(c) |
1299 | { |
1300 | case OP_CHAR: |
1301 | case OP_CHARI: |
1302 | case OP_NOT: |
1303 | case OP_NOTI: |
1304 | case OP_STAR: |
1305 | case OP_MINSTAR: |
1306 | case OP_PLUS: |
1307 | case OP_MINPLUS: |
1308 | case OP_QUERY: |
1309 | case OP_MINQUERY: |
1310 | case OP_UPTO: |
1311 | case OP_MINUPTO: |
1312 | case OP_EXACT: |
1313 | case OP_POSSTAR: |
1314 | case OP_POSPLUS: |
1315 | case OP_POSQUERY: |
1316 | case OP_POSUPTO: |
1317 | case OP_STARI: |
1318 | case OP_MINSTARI: |
1319 | case OP_PLUSI: |
1320 | case OP_MINPLUSI: |
1321 | case OP_QUERYI: |
1322 | case OP_MINQUERYI: |
1323 | case OP_UPTOI: |
1324 | case OP_MINUPTOI: |
1325 | case OP_EXACTI: |
1326 | case OP_POSSTARI: |
1327 | case OP_POSPLUSI: |
1328 | case OP_POSQUERYI: |
1329 | case OP_POSUPTOI: |
1330 | case OP_NOTSTAR: |
1331 | case OP_NOTMINSTAR: |
1332 | case OP_NOTPLUS: |
1333 | case OP_NOTMINPLUS: |
1334 | case OP_NOTQUERY: |
1335 | case OP_NOTMINQUERY: |
1336 | case OP_NOTUPTO: |
1337 | case OP_NOTMINUPTO: |
1338 | case OP_NOTEXACT: |
1339 | case OP_NOTPOSSTAR: |
1340 | case OP_NOTPOSPLUS: |
1341 | case OP_NOTPOSQUERY: |
1342 | case OP_NOTPOSUPTO: |
1343 | case OP_NOTSTARI: |
1344 | case OP_NOTMINSTARI: |
1345 | case OP_NOTPLUSI: |
1346 | case OP_NOTMINPLUSI: |
1347 | case OP_NOTQUERYI: |
1348 | case OP_NOTMINQUERYI: |
1349 | case OP_NOTUPTOI: |
1350 | case OP_NOTMINUPTOI: |
1351 | case OP_NOTEXACTI: |
1352 | case OP_NOTPOSSTARI: |
1353 | case OP_NOTPOSPLUSI: |
1354 | case OP_NOTPOSQUERYI: |
1355 | case OP_NOTPOSUPTOI: |
1356 | if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); |
1357 | break; |
1358 | } |
1359 | #else |
1360 | (void)(utf); /* Keep compiler happy by referencing function argument */ |
1361 | #endif /* SUPPORT_WIDE_CHARS */ |
1362 | } |
1363 | } |
1364 | |
1365 | /* End of pcre2_auto_possess.c */ |
1366 | |