1 | /************************************************* |
2 | * Perl-Compatible Regular Expressions * |
3 | *************************************************/ |
4 | |
5 | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | and semantics are as close as possible to those of the Perl 5 language. |
7 | |
8 | Written by Philip Hazel |
9 | Original API code Copyright (c) 1997-2012 University of Cambridge |
10 | New API code Copyright (c) 2015-2022 University of Cambridge |
11 | |
12 | ----------------------------------------------------------------------------- |
13 | Redistribution and use in source and binary forms, with or without |
14 | modification, are permitted provided that the following conditions are met: |
15 | |
16 | * Redistributions of source code must retain the above copyright notice, |
17 | this list of conditions and the following disclaimer. |
18 | |
19 | * Redistributions in binary form must reproduce the above copyright |
20 | notice, this list of conditions and the following disclaimer in the |
21 | documentation and/or other materials provided with the distribution. |
22 | |
23 | * Neither the name of the University of Cambridge nor the names of its |
24 | contributors may be used to endorse or promote products derived from |
25 | this software without specific prior written permission. |
26 | |
27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | POSSIBILITY OF SUCH DAMAGE. |
38 | ----------------------------------------------------------------------------- |
39 | */ |
40 | |
41 | |
42 | #ifdef HAVE_CONFIG_H |
43 | #include "config.h" |
44 | #endif |
45 | |
46 | /* These defines enable debugging code */ |
47 | |
48 | /* #define DEBUG_FRAMES_DISPLAY */ |
49 | /* #define DEBUG_SHOW_OPS */ |
50 | /* #define DEBUG_SHOW_RMATCH */ |
51 | |
52 | #ifdef DEBUG_FRAMES_DISPLAY |
53 | #include <stdarg.h> |
54 | #endif |
55 | |
56 | /* These defines identify the name of the block containing "static" |
57 | information, and fields within it. */ |
58 | |
59 | #define NLBLOCK mb /* Block containing newline information */ |
60 | #define PSSTART start_subject /* Field containing processed string start */ |
61 | #define PSEND end_subject /* Field containing processed string end */ |
62 | |
63 | #include "pcre2_internal.h" |
64 | |
65 | #define RECURSE_UNSET 0xffffffffu /* Bigger than max group number */ |
66 | |
67 | /* Masks for identifying the public options that are permitted at match time. */ |
68 | |
69 | #define PUBLIC_MATCH_OPTIONS \ |
70 | (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \ |
71 | PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \ |
72 | PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT|PCRE2_COPY_MATCHED_SUBJECT) |
73 | |
74 | #define PUBLIC_JIT_MATCH_OPTIONS \ |
75 | (PCRE2_NO_UTF_CHECK|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY|\ |
76 | PCRE2_NOTEMPTY_ATSTART|PCRE2_PARTIAL_SOFT|PCRE2_PARTIAL_HARD|\ |
77 | PCRE2_COPY_MATCHED_SUBJECT) |
78 | |
/* Return values that are not errors, produced by match() and also used inside
it. Error returns are the externally defined PCRE2_ERROR_xxx codes, every one
of which is negative. */

#define MATCH_NOMATCH      0
#define MATCH_MATCH        1

/* Internal-only signals used within match(). They are placed far below zero
so that they stay clear of the external error codes. */

#define MATCH_ACCEPT       (-999)
#define MATCH_KETRPOS      (-998)

/* The following five codes must occupy consecutive values, in this order, so
that a test for "any one of them" can be a simple range check against
MATCH_BACKTRACK_MIN and MATCH_BACKTRACK_MAX. Each is defined relative to its
predecessor so the run cannot accidentally become non-contiguous. */

#define MATCH_COMMIT       (-997)
#define MATCH_PRUNE        (MATCH_COMMIT + 1)     /* -996 */
#define MATCH_SKIP         (MATCH_PRUNE + 1)      /* -995 */
#define MATCH_SKIP_ARG     (MATCH_SKIP + 1)       /* -994 */
#define MATCH_THEN         (MATCH_SKIP_ARG + 1)   /* -993 */

#define MATCH_BACKTRACK_MIN MATCH_COMMIT
#define MATCH_BACKTRACK_MAX MATCH_THEN
99 | |
/* Group frame types. A value of zero marks a frame that is not a group frame.
The kind of group is held in the upper 16 bits, leaving the lower 16 bits free
for data (for example, the capture number). Most groups get a group frame so
that, when the end of the group is reached, information about its start can be
found directly instead of scanning back through the intermediate frames
(backtrack points). */

#define GF_CAPTURE     0x10000u
#define GF_NOCAPTURE   0x20000u
#define GF_CONDASSERT  0x30000u
#define GF_RECURSE     0x40000u

/* Extract the identity part (upper 16 bits) or the data part (lower 16 bits)
of a group frame type. */

#define GF_IDMASK(gft)    ((gft) & 0xffff0000u)
#define GF_DATAMASK(gft)  ((gft) & 0x0000ffffu)
115 | |
/* The three kinds of repetition: minimizing, maximizing and possessive. */

enum {
  REPTYPE_MIN,
  REPTYPE_MAX,
  REPTYPE_POS
};

/* Minimum and maximum counts for the common repeats. A maximum of UINT32_MAX
stands for "no upper limit". The slots for OP_CRRANGE and OP_CRMINRANGE are
dummy place fillers that only keep the later entries aligned. */

static const uint32_t rep_min[] = {
  0,            /* *  */
  0,            /* *? */
  1,            /* +  */
  1,            /* +? */
  0,            /* ?  */
  0,            /* ?? */
  0,            /* OP_CRRANGE (dummy) */
  0,            /* OP_CRMINRANGE (dummy) */
  0,            /* OP_CRPOSSTAR */
  1,            /* OP_CRPOSPLUS */
  0             /* OP_CRPOSQUERY */
};

static const uint32_t rep_max[] = {
  UINT32_MAX,   /* *  */
  UINT32_MAX,   /* *? */
  UINT32_MAX,   /* +  */
  UINT32_MAX,   /* +? */
  1,            /* ?  */
  1,            /* ?? */
  0,            /* OP_CRRANGE (dummy) */
  0,            /* OP_CRMINRANGE (dummy) */
  UINT32_MAX,   /* OP_CRPOSSTAR */
  UINT32_MAX,   /* OP_CRPOSPLUS */
  1             /* OP_CRPOSQUERY */
};

/* Repetition type for each repeat. Unlike the two tables above, this one has
a final entry for OP_CRPOSRANGE. */

static const uint32_t rep_typ[] = {
  REPTYPE_MAX,  /* *  */
  REPTYPE_MIN,  /* *? */
  REPTYPE_MAX,  /* +  */
  REPTYPE_MIN,  /* +? */
  REPTYPE_MAX,  /* ?  */
  REPTYPE_MIN,  /* ?? */
  REPTYPE_MAX,  /* OP_CRRANGE */
  REPTYPE_MIN,  /* OP_CRMINRANGE */
  REPTYPE_POS,  /* OP_CRPOSSTAR */
  REPTYPE_POS,  /* OP_CRPOSPLUS */
  REPTYPE_POS,  /* OP_CRPOSQUERY */
  REPTYPE_POS   /* OP_CRPOSRANGE */
};
146 | |
/* Numbers for RMATCH calls at backtracking points. When these lists are
changed, the code at RETURN_SWITCH below must be updated in sync.

Each enumerator is passed as the second argument of one RMATCH() call. The
RMATCH macro stores it in the current frame's return_id field and pastes it
into a label name (L_RMn) placed immediately after the jump to MATCH_RECURSE.
The values fall into three ranges: 1 upwards is always available, 100 upwards
exists only when SUPPORT_WIDE_CHARS is defined, and 200 upwards only when
SUPPORT_UNICODE is defined. */

enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
       RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
       RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
       RM31,  RM32, RM33, RM34, RM35, RM36 };

#ifdef SUPPORT_WIDE_CHARS
enum { RM100=100, RM101 };
#endif

#ifdef SUPPORT_UNICODE
enum { RM200=200, RM201, RM202, RM203, RM204, RM205, RM206, RM207,
       RM208,     RM209, RM210, RM211, RM212, RM213, RM214, RM215,
       RM216,     RM217, RM218, RM219, RM220, RM221, RM222, RM223,
       RM224,     RM225 };
#endif
165 | |
166 | /* Define short names for general fields in the current backtrack frame, which |
167 | is always pointed to by the F variable. Occasional references to fields in |
168 | other frames are written out explicitly. There are also some fields in the |
169 | current frame whose names start with "temp" that are used for short-term, |
170 | localised backtracking memory. These are #defined with Lxxx names at the point |
171 | of use and undefined afterwards. */ |
172 | |
173 | #define Fback_frame F->back_frame |
174 | #define Fcapture_last F->capture_last |
175 | #define Fcurrent_recurse F->current_recurse |
176 | #define Fecode F->ecode |
177 | #define Feptr F->eptr |
178 | #define Fgroup_frame_type F->group_frame_type |
179 | #define Flast_group_offset F->last_group_offset |
180 | #define Flength F->length |
181 | #define Fmark F->mark |
182 | #define Frdepth F->rdepth |
183 | #define Fstart_match F->start_match |
184 | #define Foffset_top F->offset_top |
185 | #define Foccu F->occu |
186 | #define Fop F->op |
187 | #define Fovector F->ovector |
188 | #define Freturn_id F->return_id |
189 | |
190 | |
#ifdef DEBUG_FRAMES_DISPLAY
/*************************************************
*      Display current frames and contents       *
*************************************************/

/* This debugging function displays the current set of frames and their
contents. It is not called automatically from anywhere, the intention being
that calls can be inserted where necessary when debugging frame-related
problems.

Every frame from the start of match_data->heapframes up to and including F is
listed, one per line. P, when not NULL, is reported as a frame index. The
"lgoffset" value is likewise shown as a frame index (the stored offset divided
by frame_size), whereas "back" is printed exactly as stored in the frame.

All variadic arguments are cast to the exact type that their conversion
specifier expects. The size and pointer-difference values are cast to unsigned
long, which is always well defined but may truncate on platforms where
unsigned long is narrower than PCRE2_SIZE; that is acceptable for debugging
output.

Arguments:
  f           the file to write to
  F           the current top frame
  P           a previous frame of interest (may be NULL)
  frame_size  the frame size
  mb          points to the match block
  match_data  points to the match data block
  s           identification text (a printf-style format)
  ...         arguments for the format in s

Returns:      nothing
*/

static void
display_frames(FILE *f, heapframe *F, heapframe *P, PCRE2_SIZE frame_size,
  match_block *mb, pcre2_match_data *match_data, const char *s, ...)
{
uint32_t i;
heapframe *Q;
va_list ap;
va_start(ap, s);

fprintf(f, "FRAMES ");
vfprintf(f, s, ap);
va_end(ap);

if (P != NULL) fprintf(f, " P=%lu",
  (unsigned long)(((char *)P - (char *)(match_data->heapframes))/frame_size));
fprintf(f, "\n");

for (i = 0, Q = match_data->heapframes;
     Q <= F;
     i++, Q = (heapframe *)((char *)Q + frame_size))
  {
  fprintf(f, "Frame %u type=%x subj=%lu code=%d back=%lu id=%d",
    (unsigned int)i,
    (unsigned int)Q->group_frame_type,
    (unsigned long)(Q->eptr - mb->start_subject),
    (int)*(Q->ecode),
    (unsigned long)Q->back_frame,
    (int)Q->return_id);

  if (Q->last_group_offset == PCRE2_UNSET)
    fprintf(f, " lgoffset=unset\n");
  else
    fprintf(f, " lgoffset=%lu\n",
      (unsigned long)(Q->last_group_offset/frame_size));
  }
}

#endif
246 | |
247 | |
248 | |
249 | /************************************************* |
250 | * Process a callout * |
251 | *************************************************/ |
252 | |
253 | /* This function is called for all callouts, whether "standalone" or at the |
254 | start of a conditional group. Feptr will be pointing to either OP_CALLOUT or |
255 | OP_CALLOUT_STR. A callout block is allocated in pcre2_match() and initialized |
256 | with fixed values. |
257 | |
258 | Arguments: |
259 | F points to the current backtracking frame |
260 | mb points to the match block |
261 | lengthptr where to return the length of the callout item |
262 | |
263 | Returns: the return from the callout |
264 | or 0 if no callout function exists |
265 | */ |
266 | |
267 | static int |
268 | do_callout(heapframe *F, match_block *mb, PCRE2_SIZE *lengthptr) |
269 | { |
270 | int rc; |
271 | PCRE2_SIZE save0, save1; |
272 | PCRE2_SIZE *callout_ovector; |
273 | pcre2_callout_block *cb; |
274 | |
275 | *lengthptr = (*Fecode == OP_CALLOUT)? |
276 | PRIV(OP_lengths)[OP_CALLOUT] : GET(Fecode, 1 + 2*LINK_SIZE); |
277 | |
278 | if (mb->callout == NULL) return 0; /* No callout function provided */ |
279 | |
280 | /* The original matching code (pre 10.30) worked directly with the ovector |
281 | passed by the user, and this was passed to callouts. Now that the working |
282 | ovector is in the backtracking frame, it no longer needs to reserve space for |
283 | the overall match offsets (which would waste space in the frame). For backward |
284 | compatibility, however, we pass capture_top and offset_vector to the callout as |
285 | if for the extended ovector, and we ensure that the first two slots are unset |
286 | by preserving and restoring their current contents. Picky compilers complain if |
287 | references such as Fovector[-2] are use directly, so we set up a separate |
288 | pointer. */ |
289 | |
290 | callout_ovector = (PCRE2_SIZE *)(Fovector) - 2; |
291 | |
292 | /* The cb->version, cb->subject, cb->subject_length, and cb->start_match fields |
293 | are set externally. The first 3 never change; the last is updated for each |
294 | bumpalong. */ |
295 | |
296 | cb = mb->cb; |
297 | cb->capture_top = (uint32_t)Foffset_top/2 + 1; |
298 | cb->capture_last = Fcapture_last; |
299 | cb->offset_vector = callout_ovector; |
300 | cb->mark = mb->nomatch_mark; |
301 | cb->current_position = (PCRE2_SIZE)(Feptr - mb->start_subject); |
302 | cb->pattern_position = GET(Fecode, 1); |
303 | cb->next_item_length = GET(Fecode, 1 + LINK_SIZE); |
304 | |
305 | if (*Fecode == OP_CALLOUT) /* Numerical callout */ |
306 | { |
307 | cb->callout_number = Fecode[1 + 2*LINK_SIZE]; |
308 | cb->callout_string_offset = 0; |
309 | cb->callout_string = NULL; |
310 | cb->callout_string_length = 0; |
311 | } |
312 | else /* String callout */ |
313 | { |
314 | cb->callout_number = 0; |
315 | cb->callout_string_offset = GET(Fecode, 1 + 3*LINK_SIZE); |
316 | cb->callout_string = Fecode + (1 + 4*LINK_SIZE) + 1; |
317 | cb->callout_string_length = |
318 | *lengthptr - (1 + 4*LINK_SIZE) - 2; |
319 | } |
320 | |
321 | save0 = callout_ovector[0]; |
322 | save1 = callout_ovector[1]; |
323 | callout_ovector[0] = callout_ovector[1] = PCRE2_UNSET; |
324 | rc = mb->callout(cb, mb->callout_data); |
325 | callout_ovector[0] = save0; |
326 | callout_ovector[1] = save1; |
327 | cb->callout_flags = 0; |
328 | return rc; |
329 | } |
330 | |
331 | |
332 | |
333 | /************************************************* |
334 | * Match a back-reference * |
335 | *************************************************/ |
336 | |
337 | /* This function is called only when it is known that the offset lies within |
338 | the offsets that have so far been used in the match. Note that in caseless |
339 | UTF-8 mode, the number of subject bytes matched may be different to the number |
340 | of reference bytes. (In theory this could also happen in UTF-16 mode, but it |
341 | seems unlikely.) |
342 | |
343 | Arguments: |
344 | offset index into the offset vector |
345 | caseless TRUE if caseless |
346 | F the current backtracking frame pointer |
347 | mb points to match block |
348 | lengthptr pointer for returning the length matched |
349 | |
350 | Returns: = 0 sucessful match; number of code units matched is set |
351 | < 0 no match |
352 | > 0 partial match |
353 | */ |
354 | |
355 | static int |
356 | match_ref(PCRE2_SIZE offset, BOOL caseless, heapframe *F, match_block *mb, |
357 | PCRE2_SIZE *lengthptr) |
358 | { |
359 | PCRE2_SPTR p; |
360 | PCRE2_SIZE length; |
361 | PCRE2_SPTR eptr; |
362 | PCRE2_SPTR eptr_start; |
363 | |
364 | /* Deal with an unset group. The default is no match, but there is an option to |
365 | match an empty string. */ |
366 | |
367 | if (offset >= Foffset_top || Fovector[offset] == PCRE2_UNSET) |
368 | { |
369 | if ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0) |
370 | { |
371 | *lengthptr = 0; |
372 | return 0; /* Match */ |
373 | } |
374 | else return -1; /* No match */ |
375 | } |
376 | |
377 | /* Separate the caseless and UTF cases for speed. */ |
378 | |
379 | eptr = eptr_start = Feptr; |
380 | p = mb->start_subject + Fovector[offset]; |
381 | length = Fovector[offset+1] - Fovector[offset]; |
382 | |
383 | if (caseless) |
384 | { |
385 | #if defined SUPPORT_UNICODE |
386 | BOOL utf = (mb->poptions & PCRE2_UTF) != 0; |
387 | |
388 | if (utf || (mb->poptions & PCRE2_UCP) != 0) |
389 | { |
390 | PCRE2_SPTR endptr = p + length; |
391 | |
392 | /* Match characters up to the end of the reference. NOTE: the number of |
393 | code units matched may differ, because in UTF-8 there are some characters |
394 | whose upper and lower case codes have different numbers of bytes. For |
395 | example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 (3 |
396 | bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a |
397 | sequence of two of the latter. It is important, therefore, to check the |
398 | length along the reference, not along the subject (earlier code did this |
399 | wrong). UCP without uses Unicode properties but without UTF encoding. */ |
400 | |
401 | while (p < endptr) |
402 | { |
403 | uint32_t c, d; |
404 | const ucd_record *ur; |
405 | if (eptr >= mb->end_subject) return 1; /* Partial match */ |
406 | |
407 | if (utf) |
408 | { |
409 | GETCHARINC(c, eptr); |
410 | GETCHARINC(d, p); |
411 | } |
412 | else |
413 | { |
414 | c = *eptr++; |
415 | d = *p++; |
416 | } |
417 | |
418 | ur = GET_UCD(d); |
419 | if (c != d && c != (uint32_t)((int)d + ur->other_case)) |
420 | { |
421 | const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset; |
422 | for (;;) |
423 | { |
424 | if (c < *pp) return -1; /* No match */ |
425 | if (c == *pp++) break; |
426 | } |
427 | } |
428 | } |
429 | } |
430 | else |
431 | #endif |
432 | |
433 | /* Not in UTF or UCP mode */ |
434 | { |
435 | for (; length > 0; length--) |
436 | { |
437 | uint32_t cc, cp; |
438 | if (eptr >= mb->end_subject) return 1; /* Partial match */ |
439 | cc = UCHAR21TEST(eptr); |
440 | cp = UCHAR21TEST(p); |
441 | if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc)) |
442 | return -1; /* No match */ |
443 | p++; |
444 | eptr++; |
445 | } |
446 | } |
447 | } |
448 | |
449 | /* In the caseful case, we can just compare the code units, whether or not we |
450 | are in UTF and/or UCP mode. When partial matching, we have to do this unit by |
451 | unit. */ |
452 | |
453 | else |
454 | { |
455 | if (mb->partial != 0) |
456 | { |
457 | for (; length > 0; length--) |
458 | { |
459 | if (eptr >= mb->end_subject) return 1; /* Partial match */ |
460 | if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /* No match */ |
461 | } |
462 | } |
463 | |
464 | /* Not partial matching */ |
465 | |
466 | else |
467 | { |
468 | if ((PCRE2_SIZE)(mb->end_subject - eptr) < length) return 1; /* Partial */ |
469 | if (memcmp(p, eptr, CU2BYTES(length)) != 0) return -1; /* No match */ |
470 | eptr += length; |
471 | } |
472 | } |
473 | |
474 | *lengthptr = eptr - eptr_start; |
475 | return 0; /* Match */ |
476 | } |
477 | |
478 | |
479 | |
480 | /****************************************************************************** |
481 | ******************************************************************************* |
482 | "Recursion" in the match() function |
483 | |
484 | The original match() function was highly recursive, but this proved to be the |
485 | source of a number of problems over the years, mostly because of the relatively |
486 | small system stacks that are commonly found. As new features were added to |
487 | patterns, various kludges were invented to reduce the amount of stack used, |
488 | making the code hard to understand in places. |
489 | |
490 | A version did exist that used individual frames on the heap instead of calling |
491 | match() recursively, but this ran substantially slower. The current version is |
492 | a refactoring that uses a vector of frames to remember backtracking points. |
493 | This runs no slower, and possibly even a bit faster than the original recursive |
494 | implementation. |
495 | |
496 | At first, an initial vector of size START_FRAMES_SIZE (enough for maybe 50 |
497 | frames) was allocated on the system stack. If this was not big enough, the heap |
498 | was used for a larger vector. However, it turns out that there are environments |
499 | where taking as little as 20KiB from the system stack is an embarrassment. |
500 | After another refactoring, the heap is used exclusively, but a pointer to the |
501 | frames vector and its size are cached in the match_data block, so that there is |
502 | no new memory allocation if the same match_data block is used for multiple |
503 | matches (unless the frames vector has to be extended). |
504 | ******************************************************************************* |
505 | ******************************************************************************/ |
506 | |
507 | |
508 | |
509 | |
510 | /************************************************* |
511 | * Macros for the match() function * |
512 | *************************************************/ |
513 | |
514 | /* These macros pack up tests that are used for partial matching several times |
515 | in the code. The second one is used when we already know we are past the end of |
516 | the subject. We set the "hit end" flag if the pointer is at the end of the |
517 | subject and either (a) the pointer is past the earliest inspected character |
518 | (i.e. something has been matched, even if not part of the actual matched |
519 | string), or (b) the pattern contains a lookbehind. These are the conditions for |
520 | which adding more characters may allow the current match to continue. |
521 | |
522 | For hard partial matching, we immediately return a partial match. Otherwise, |
523 | carrying on means that a complete match on the current subject will be sought. |
524 | A partial match is returned only if no complete match can be found. */ |
525 | |
/* CHECK_PARTIAL: when the current subject pointer (Feptr) has reached
mb->end_subject, apply the SCHECK_PARTIAL test. Both macros expand to a bare
"if" statement with no do/while wrapper, so each must be used as a complete
statement on its own; an "else" written directly after an invocation would
attach to the macro's own "if". */

#define CHECK_PARTIAL()\
  if (Feptr >= mb->end_subject) \
    { \
    SCHECK_PARTIAL(); \
    }

/* SCHECK_PARTIAL: for use when the caller already knows the end of the subject
has been reached. If partial matching is enabled and either Feptr is beyond
mb->start_used_ptr or mb->allowemptypartial is set, record the hit in
mb->hitend. When mb->partial is greater than 1 this also executes a "return
PCRE2_ERROR_PARTIAL" from the enclosing function. */

#define SCHECK_PARTIAL()\
  if (mb->partial != 0 && \
      (Feptr > mb->start_used_ptr || mb->allowemptypartial)) \
    { \
    mb->hitend = TRUE; \
    if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \
    }
539 | |
540 | |
541 | /* These macros are used to implement backtracking. They simulate a recursive |
542 | call to the match() function by means of a local vector of frames which |
543 | remember the backtracking points. */ |
544 | |
/* RMATCH(ra, rb): start a simulated recursive call. "ra" is the code pointer
at which the new frame is to begin (it is assigned to start_ecode) and "rb" is
the return identifier, which is saved in the current frame's return_id field
before jumping to MATCH_RECURSE. The identifier is also pasted into a label,
L_<rb>, placed directly after the jump; presumably the code at RETURN_SWITCH
uses the saved identifier to resume at that label (RETURN_SWITCH is not
defined in this part of the file - confirm there). Because of the label, each
identifier can be used by only one RMATCH invocation within a function. */

#define RMATCH(ra,rb)\
  {\
  start_ecode = ra;\
  Freturn_id = rb;\
  goto MATCH_RECURSE;\
  L_##rb:;\
  }

/* RRETURN(ra): finish a simulated call. The value "ra" is placed in rrc and
control transfers to RETURN_SWITCH. */

#define RRETURN(ra)\
  {\
  rrc = ra;\
  goto RETURN_SWITCH;\
  }
558 | |
559 | |
560 | |
561 | /************************************************* |
562 | * Match from current position * |
563 | *************************************************/ |
564 | |
565 | /* This function is called to run one match attempt at a single starting point |
566 | in the subject. |
567 | |
568 | Performance note: It might be tempting to extract commonly used fields from the |
569 | mb structure (e.g. end_subject) into individual variables to improve |
570 | performance. Tests using gcc on a SPARC disproved this; in the first case, it |
571 | made performance worse. |
572 | |
573 | Arguments: |
574 | start_eptr starting character in subject |
575 | start_ecode starting position in compiled code |
576 | top_bracket number of capturing parentheses in the pattern |
577 | frame_size size of each backtracking frame |
578 | match_data pointer to the match_data block |
579 | mb pointer to "static" variables block |
580 | |
581 | Returns: MATCH_MATCH if matched ) these values are >= 0 |
582 | MATCH_NOMATCH if failed to match ) |
583 | negative MATCH_xxx value for PRUNE, SKIP, etc |
584 | negative PCRE2_ERROR_xxx value if aborted by an error condition |
585 | (e.g. stopped by repeated call or depth limit) |
586 | */ |
587 | |
588 | static int |
589 | match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket, |
590 | PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb) |
591 | { |
592 | /* Frame-handling variables */ |
593 | |
594 | heapframe *F; /* Current frame pointer */ |
595 | heapframe *N = NULL; /* Temporary frame pointers */ |
596 | heapframe *P = NULL; |
597 | |
598 | heapframe *frames_top; /* End of frames vector */ |
599 | heapframe *assert_accept_frame = NULL; /* For passing back a frame with captures */ |
600 | PCRE2_SIZE heapframes_size; /* Usable size of frames vector */ |
601 | PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */ |
602 | |
603 | /* Local variables that do not need to be preserved over calls to RMATCH(). */ |
604 | |
605 | PCRE2_SPTR bracode; /* Temp pointer to start of group */ |
606 | PCRE2_SIZE offset; /* Used for group offsets */ |
607 | PCRE2_SIZE length; /* Used for various length calculations */ |
608 | |
609 | int rrc; /* Return from functions & backtracking "recursions" */ |
610 | #ifdef SUPPORT_UNICODE |
611 | int proptype; /* Type of character property */ |
612 | #endif |
613 | |
614 | uint32_t i; /* Used for local loops */ |
615 | uint32_t fc; /* Character values */ |
616 | uint32_t number; /* Used for group and other numbers */ |
617 | uint32_t reptype = 0; /* Type of repetition (0 to avoid compiler warning) */ |
618 | uint32_t group_frame_type; /* Specifies type for new group frames */ |
619 | |
620 | BOOL condition; /* Used in conditional groups */ |
621 | BOOL cur_is_word; /* Used in "word" tests */ |
622 | BOOL prev_is_word; /* Used in "word" tests */ |
623 | |
624 | /* UTF and UCP flags */ |
625 | |
626 | #ifdef SUPPORT_UNICODE |
627 | BOOL utf = (mb->poptions & PCRE2_UTF) != 0; |
628 | BOOL ucp = (mb->poptions & PCRE2_UCP) != 0; |
629 | #else |
630 | BOOL utf = FALSE; /* Required for convenience even when no Unicode support */ |
631 | #endif |
632 | |
633 | /* This is the length of the last part of a backtracking frame that must be |
634 | copied when a new frame is created. */ |
635 | |
636 | frame_copy_size = frame_size - offsetof(heapframe, eptr); |
637 | |
638 | /* Set up the first frame and the end of the frames vector. We set the local |
639 | heapframes_size to the usable amount of the vector, that is, a whole number of |
640 | frames. */ |
641 | |
642 | F = match_data->heapframes; |
643 | heapframes_size = (match_data->heapframes_size / frame_size) * frame_size; |
644 | frames_top = (heapframe *)((char *)F + heapframes_size); |
645 | |
646 | Frdepth = 0; /* "Recursion" depth */ |
647 | Fcapture_last = 0; /* Number of most recent capture */ |
648 | Fcurrent_recurse = RECURSE_UNSET; /* Not pattern recursing. */ |
649 | Fstart_match = Feptr = start_eptr; /* Current data pointer and start match */ |
650 | Fmark = NULL; /* Most recent mark */ |
651 | Foffset_top = 0; /* End of captures within the frame */ |
652 | Flast_group_offset = PCRE2_UNSET; /* Saved frame of most recent group */ |
653 | group_frame_type = 0; /* Not a start of group frame */ |
654 | goto NEW_FRAME; /* Start processing with this frame */ |
655 | |
656 | /* Come back here when we want to create a new frame for remembering a |
657 | backtracking point. */ |
658 | |
659 | MATCH_RECURSE: |
660 | |
661 | /* Set up a new backtracking frame. If the vector is full, get a new one, |
662 | doubling the size, but constrained by the heap limit (which is in KiB). */ |
663 | |
664 | N = (heapframe *)((char *)F + frame_size); |
665 | if (N >= frames_top) |
666 | { |
667 | heapframe *new; |
668 | PCRE2_SIZE newsize = match_data->heapframes_size * 2; |
669 | |
670 | if (newsize > mb->heap_limit) |
671 | { |
672 | PCRE2_SIZE maxsize = (mb->heap_limit/frame_size) * frame_size; |
673 | if (match_data->heapframes_size >= maxsize) return PCRE2_ERROR_HEAPLIMIT; |
674 | newsize = maxsize; |
675 | } |
676 | |
677 | new = match_data->memctl.malloc(newsize, match_data->memctl.memory_data); |
678 | if (new == NULL) return PCRE2_ERROR_NOMEMORY; |
679 | memcpy(new, match_data->heapframes, heapframes_size); |
680 | |
681 | F = (heapframe *)((char *)new + ((char *)F - (char *)match_data->heapframes)); |
682 | N = (heapframe *)((char *)F + frame_size); |
683 | |
684 | match_data->memctl.free(match_data->heapframes, match_data->memctl.memory_data); |
685 | match_data->heapframes = new; |
686 | match_data->heapframes_size = newsize; |
687 | |
688 | heapframes_size = (newsize / frame_size) * frame_size; |
689 | frames_top = (heapframe *)((char *)new + heapframes_size); |
690 | } |
691 | |
692 | #ifdef DEBUG_SHOW_RMATCH |
693 | fprintf(stderr, "++ RMATCH %2d frame=%d" , Freturn_id, Frdepth + 1); |
694 | if (group_frame_type != 0) |
695 | { |
696 | fprintf(stderr, " type=%x " , group_frame_type); |
697 | switch (GF_IDMASK(group_frame_type)) |
698 | { |
699 | case GF_CAPTURE: |
700 | fprintf(stderr, "capture=%d" , GF_DATAMASK(group_frame_type)); |
701 | break; |
702 | |
703 | case GF_NOCAPTURE: |
704 | fprintf(stderr, "nocapture op=%d" , GF_DATAMASK(group_frame_type)); |
705 | break; |
706 | |
707 | case GF_CONDASSERT: |
708 | fprintf(stderr, "condassert op=%d" , GF_DATAMASK(group_frame_type)); |
709 | break; |
710 | |
711 | case GF_RECURSE: |
712 | fprintf(stderr, "recurse=%d" , GF_DATAMASK(group_frame_type)); |
713 | break; |
714 | |
715 | default: |
716 | fprintf(stderr, "*** unknown ***" ); |
717 | break; |
718 | } |
719 | } |
720 | fprintf(stderr, "\n" ); |
721 | #endif |
722 | |
723 | /* Copy those fields that must be copied into the new frame, increase the |
724 | "recursion" depth (i.e. the new frame's index) and then make the new frame |
725 | current. */ |
726 | |
727 | memcpy((char *)N + offsetof(heapframe, eptr), |
728 | (char *)F + offsetof(heapframe, eptr), |
729 | frame_copy_size); |
730 | |
731 | N->rdepth = Frdepth + 1; |
732 | F = N; |
733 | |
734 | /* Carry on processing with a new frame. */ |
735 | |
736 | NEW_FRAME: |
737 | Fgroup_frame_type = group_frame_type; |
738 | Fecode = start_ecode; /* Starting code pointer */ |
739 | Fback_frame = frame_size; /* Default is go back one frame */ |
740 | |
741 | /* If this is a special type of group frame, remember its offset for quick |
742 | access at the end of the group. If this is a recursion, set a new current |
743 | recursion value. */ |
744 | |
745 | if (group_frame_type != 0) |
746 | { |
747 | Flast_group_offset = (char *)F - (char *)match_data->heapframes; |
748 | if (GF_IDMASK(group_frame_type) == GF_RECURSE) |
749 | Fcurrent_recurse = GF_DATAMASK(group_frame_type); |
750 | group_frame_type = 0; |
751 | } |
752 | |
753 | |
754 | /* ========================================================================= */ |
755 | /* This is the main processing loop. First check that we haven't recorded too |
756 | many backtracks (search tree is too large), or that we haven't exceeded the |
757 | recursive depth limit (used too many backtracking frames). If not, process the |
758 | opcodes. */ |
759 | |
760 | if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT; |
761 | if (Frdepth >= mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT; |
762 | |
763 | for (;;) |
764 | { |
765 | #ifdef DEBUG_SHOW_OPS |
766 | fprintf(stderr, "++ op=%d\n" , *Fecode); |
767 | #endif |
768 | |
769 | Fop = (uint8_t)(*Fecode); /* Cast needed for 16-bit and 32-bit modes */ |
770 | switch(Fop) |
771 | { |
772 | /* ===================================================================== */ |
773 | /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, to close |
774 | any currently open capturing brackets. Unlike reaching the end of a group, |
775 | where we know the starting frame is at the top of the chained frames, in |
776 | this case we have to search back for the relevant frame in case other types |
777 | of group that use chained frames have intervened. Multiple OP_CLOSEs always |
778 | come innermost first, which matches the chain order. We can ignore this in |
779 | a recursion, because captures are not passed out of recursions. */ |
780 | |
781 | case OP_CLOSE: |
782 | if (Fcurrent_recurse == RECURSE_UNSET) |
783 | { |
784 | number = GET2(Fecode, 1); |
785 | offset = Flast_group_offset; |
786 | for(;;) |
787 | { |
788 | if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL; |
789 | N = (heapframe *)((char *)match_data->heapframes + offset); |
790 | P = (heapframe *)((char *)N - frame_size); |
791 | if (N->group_frame_type == (GF_CAPTURE | number)) break; |
792 | offset = P->last_group_offset; |
793 | } |
794 | offset = (number << 1) - 2; |
795 | Fcapture_last = number; |
796 | Fovector[offset] = P->eptr - mb->start_subject; |
797 | Fovector[offset+1] = Feptr - mb->start_subject; |
798 | if (offset >= Foffset_top) Foffset_top = offset + 2; |
799 | } |
800 | Fecode += PRIV(OP_lengths)[*Fecode]; |
801 | break; |
802 | |
803 | |
804 | /* ===================================================================== */ |
805 | /* Real or forced end of the pattern, assertion, or recursion. In an |
806 | assertion ACCEPT, update the last used pointer and remember the current |
807 | frame so that the captures and mark can be fished out of it. */ |
808 | |
809 | case OP_ASSERT_ACCEPT: |
810 | if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; |
811 | assert_accept_frame = F; |
812 | RRETURN(MATCH_ACCEPT); |
813 | |
814 | /* If recursing, we have to find the most recent recursion. */ |
815 | |
816 | case OP_ACCEPT: |
817 | case OP_END: |
818 | |
819 | /* Handle end of a recursion. */ |
820 | |
821 | if (Fcurrent_recurse != RECURSE_UNSET) |
822 | { |
823 | offset = Flast_group_offset; |
824 | for(;;) |
825 | { |
826 | if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL; |
827 | N = (heapframe *)((char *)match_data->heapframes + offset); |
828 | P = (heapframe *)((char *)N - frame_size); |
829 | if (GF_IDMASK(N->group_frame_type) == GF_RECURSE) break; |
830 | offset = P->last_group_offset; |
831 | } |
832 | |
833 | /* N is now the frame of the recursion; the previous frame is at the |
834 | OP_RECURSE position. Go back there, copying the current subject position |
835 | and mark, and the start_match position (\K might have changed it), and |
836 | then move on past the OP_RECURSE. */ |
837 | |
838 | P->eptr = Feptr; |
839 | P->mark = Fmark; |
840 | P->start_match = Fstart_match; |
841 | F = P; |
842 | Fecode += 1 + LINK_SIZE; |
843 | continue; |
844 | } |
845 | |
846 | /* Not a recursion. Fail for an empty string match if either PCRE2_NOTEMPTY |
847 | is set, or if PCRE2_NOTEMPTY_ATSTART is set and we have matched at the |
848 | start of the subject. In both cases, backtracking will then try other |
849 | alternatives, if any. */ |
850 | |
851 | if (Feptr == Fstart_match && |
852 | ((mb->moptions & PCRE2_NOTEMPTY) != 0 || |
853 | ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) != 0 && |
854 | Fstart_match == mb->start_subject + mb->start_offset))) |
855 | RRETURN(MATCH_NOMATCH); |
856 | |
857 | /* Also fail if PCRE2_ENDANCHORED is set and the end of the match is not |
858 | the end of the subject. After (*ACCEPT) we fail the entire match (at this |
859 | position) but backtrack on reaching the end of the pattern. */ |
860 | |
861 | if (Feptr < mb->end_subject && |
862 | ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0) |
863 | { |
864 | if (Fop == OP_END) RRETURN(MATCH_NOMATCH); |
865 | return MATCH_NOMATCH; |
866 | } |
867 | |
868 | /* We have a successful match of the whole pattern. Record the result and |
869 | then do a direct return from the function. If there is space in the offset |
870 | vector, set any pairs that follow the highest-numbered captured string but |
871 | are less than the number of capturing groups in the pattern to PCRE2_UNSET. |
872 | It is documented that this happens. "Gaps" are set to PCRE2_UNSET |
873 | dynamically. It is only those at the end that need setting here. */ |
874 | |
875 | mb->end_match_ptr = Feptr; /* Record where we ended */ |
876 | mb->end_offset_top = Foffset_top; /* and how many extracts were taken */ |
877 | mb->mark = Fmark; /* and the last success mark */ |
878 | if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; |
879 | |
880 | match_data->ovector[0] = Fstart_match - mb->start_subject; |
881 | match_data->ovector[1] = Feptr - mb->start_subject; |
882 | |
883 | /* Set i to the smaller of the sizes of the external and frame ovectors. */ |
884 | |
885 | i = 2 * ((top_bracket + 1 > match_data->oveccount)? |
886 | match_data->oveccount : top_bracket + 1); |
887 | memcpy(match_data->ovector + 2, Fovector, (i - 2) * sizeof(PCRE2_SIZE)); |
888 | while (--i >= Foffset_top + 2) match_data->ovector[i] = PCRE2_UNSET; |
889 | return MATCH_MATCH; /* Note: NOT RRETURN */ |
890 | |
891 | |
892 | /*===================================================================== */ |
893 | /* Match any single character except newline; have to take care with |
894 | CRLF newlines and partial matching. */ |
895 | |
896 | case OP_ANY: |
897 | if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); |
898 | if (mb->partial != 0 && |
899 | Feptr == mb->end_subject - 1 && |
900 | NLBLOCK->nltype == NLTYPE_FIXED && |
901 | NLBLOCK->nllen == 2 && |
902 | UCHAR21TEST(Feptr) == NLBLOCK->nl[0]) |
903 | { |
904 | mb->hitend = TRUE; |
905 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
906 | } |
907 | /* Fall through */ |
908 | |
909 | /* Match any single character whatsoever. */ |
910 | |
911 | case OP_ALLANY: |
912 | if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */ |
913 | { /* not be updated before SCHECK_PARTIAL. */ |
914 | SCHECK_PARTIAL(); |
915 | RRETURN(MATCH_NOMATCH); |
916 | } |
917 | Feptr++; |
918 | #ifdef SUPPORT_UNICODE |
919 | if (utf) ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
920 | #endif |
921 | Fecode++; |
922 | break; |
923 | |
924 | |
925 | /* ===================================================================== */ |
926 | /* Match a single code unit, even in UTF mode. This opcode really does |
927 | match any code unit, even newline. (It really should be called ANYCODEUNIT, |
928 | of course - the byte name is from pre-16 bit days.) */ |
929 | |
930 | case OP_ANYBYTE: |
931 | if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */ |
932 | { /* not be updated before SCHECK_PARTIAL. */ |
933 | SCHECK_PARTIAL(); |
934 | RRETURN(MATCH_NOMATCH); |
935 | } |
936 | Feptr++; |
937 | Fecode++; |
938 | break; |
939 | |
940 | |
941 | /* ===================================================================== */ |
942 | /* Match a single character, casefully */ |
943 | |
944 | case OP_CHAR: |
945 | #ifdef SUPPORT_UNICODE |
946 | if (utf) |
947 | { |
948 | Flength = 1; |
949 | Fecode++; |
950 | GETCHARLEN(fc, Fecode, Flength); |
951 | if (Flength > (PCRE2_SIZE)(mb->end_subject - Feptr)) |
952 | { |
953 | CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ |
954 | RRETURN(MATCH_NOMATCH); |
955 | } |
956 | for (; Flength > 0; Flength--) |
957 | { |
958 | if (*Fecode++ != UCHAR21INC(Feptr)) RRETURN(MATCH_NOMATCH); |
959 | } |
960 | } |
961 | else |
962 | #endif |
963 | |
964 | /* Not UTF mode */ |
965 | { |
966 | if (mb->end_subject - Feptr < 1) |
967 | { |
968 | SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */ |
969 | RRETURN(MATCH_NOMATCH); |
970 | } |
971 | if (Fecode[1] != *Feptr++) RRETURN(MATCH_NOMATCH); |
972 | Fecode += 2; |
973 | } |
974 | break; |
975 | |
976 | |
977 | /* ===================================================================== */ |
978 | /* Match a single character, caselessly. If we are at the end of the |
979 | subject, give up immediately. We get here only when the pattern character |
980 | has at most one other case. Characters with more than two cases are coded |
981 | as OP_PROP with the pseudo-property PT_CLIST. */ |
982 | |
983 | case OP_CHARI: |
984 | if (Feptr >= mb->end_subject) |
985 | { |
986 | SCHECK_PARTIAL(); |
987 | RRETURN(MATCH_NOMATCH); |
988 | } |
989 | |
990 | #ifdef SUPPORT_UNICODE |
991 | if (utf) |
992 | { |
993 | Flength = 1; |
994 | Fecode++; |
995 | GETCHARLEN(fc, Fecode, Flength); |
996 | |
997 | /* If the pattern character's value is < 128, we know that its other case |
998 | (if any) is also < 128 (and therefore only one code unit long in all |
999 | code-unit widths), so we can use the fast lookup table. We checked above |
1000 | that there is at least one character left in the subject. */ |
1001 | |
1002 | if (fc < 128) |
1003 | { |
1004 | uint32_t cc = UCHAR21(Feptr); |
1005 | if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH); |
1006 | Fecode++; |
1007 | Feptr++; |
1008 | } |
1009 | |
1010 | /* Otherwise we must pick up the subject character and use Unicode |
1011 | property support to test its other case. Note that we cannot use the |
1012 | value of "Flength" to check for sufficient bytes left, because the other |
1013 | case of the character may have more or fewer code units. */ |
1014 | |
1015 | else |
1016 | { |
1017 | uint32_t dc; |
1018 | GETCHARINC(dc, Feptr); |
1019 | Fecode += Flength; |
1020 | if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH); |
1021 | } |
1022 | } |
1023 | |
1024 | /* If UCP is set without UTF we must do the same as above, but with one |
1025 | character per code unit. */ |
1026 | |
1027 | else if (ucp) |
1028 | { |
1029 | uint32_t cc = UCHAR21(Feptr); |
1030 | fc = Fecode[1]; |
1031 | if (fc < 128) |
1032 | { |
1033 | if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH); |
1034 | } |
1035 | else |
1036 | { |
1037 | if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH); |
1038 | } |
1039 | Feptr++; |
1040 | Fecode += 2; |
1041 | } |
1042 | |
1043 | else |
1044 | #endif /* SUPPORT_UNICODE */ |
1045 | |
1046 | /* Not UTF or UCP mode; use the table for characters < 256. */ |
1047 | { |
1048 | if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1]) |
1049 | != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH); |
1050 | Feptr++; |
1051 | Fecode += 2; |
1052 | } |
1053 | break; |
1054 | |
1055 | |
1056 | /* ===================================================================== */ |
1057 | /* Match not a single character. */ |
1058 | |
1059 | case OP_NOT: |
1060 | case OP_NOTI: |
1061 | if (Feptr >= mb->end_subject) |
1062 | { |
1063 | SCHECK_PARTIAL(); |
1064 | RRETURN(MATCH_NOMATCH); |
1065 | } |
1066 | |
1067 | #ifdef SUPPORT_UNICODE |
1068 | if (utf) |
1069 | { |
1070 | uint32_t ch; |
1071 | Fecode++; |
1072 | GETCHARINC(ch, Fecode); |
1073 | GETCHARINC(fc, Feptr); |
1074 | if (ch == fc) |
1075 | { |
1076 | RRETURN(MATCH_NOMATCH); /* Caseful match */ |
1077 | } |
1078 | else if (Fop == OP_NOTI) /* If caseless */ |
1079 | { |
1080 | if (ch > 127) |
1081 | ch = UCD_OTHERCASE(ch); |
1082 | else |
1083 | ch = (mb->fcc)[ch]; |
1084 | if (ch == fc) RRETURN(MATCH_NOMATCH); |
1085 | } |
1086 | } |
1087 | |
1088 | /* UCP without UTF is as above, but with one character per code unit. */ |
1089 | |
1090 | else if (ucp) |
1091 | { |
1092 | uint32_t ch; |
1093 | fc = UCHAR21INC(Feptr); |
1094 | ch = Fecode[1]; |
1095 | Fecode += 2; |
1096 | |
1097 | if (ch == fc) |
1098 | { |
1099 | RRETURN(MATCH_NOMATCH); /* Caseful match */ |
1100 | } |
1101 | else if (Fop == OP_NOTI) /* If caseless */ |
1102 | { |
1103 | if (ch > 127) |
1104 | ch = UCD_OTHERCASE(ch); |
1105 | else |
1106 | ch = (mb->fcc)[ch]; |
1107 | if (ch == fc) RRETURN(MATCH_NOMATCH); |
1108 | } |
1109 | } |
1110 | |
1111 | else |
1112 | #endif /* SUPPORT_UNICODE */ |
1113 | |
1114 | /* Neither UTF nor UCP is set */ |
1115 | |
1116 | { |
1117 | uint32_t ch = Fecode[1]; |
1118 | fc = UCHAR21INC(Feptr); |
1119 | if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc)) |
1120 | RRETURN(MATCH_NOMATCH); |
1121 | Fecode += 2; |
1122 | } |
1123 | break; |
1124 | |
1125 | |
1126 | /* ===================================================================== */ |
1127 | /* Match a single character repeatedly. */ |
1128 | |
1129 | #define Loclength F->temp_size |
1130 | #define Lstart_eptr F->temp_sptr[0] |
1131 | #define Lcharptr F->temp_sptr[1] |
1132 | #define Lmin F->temp_32[0] |
1133 | #define Lmax F->temp_32[1] |
1134 | #define Lc F->temp_32[2] |
1135 | #define Loc F->temp_32[3] |
1136 | |
1137 | case OP_EXACT: |
1138 | case OP_EXACTI: |
1139 | Lmin = Lmax = GET2(Fecode, 1); |
1140 | Fecode += 1 + IMM2_SIZE; |
1141 | goto REPEATCHAR; |
1142 | |
1143 | case OP_POSUPTO: |
1144 | case OP_POSUPTOI: |
1145 | reptype = REPTYPE_POS; |
1146 | Lmin = 0; |
1147 | Lmax = GET2(Fecode, 1); |
1148 | Fecode += 1 + IMM2_SIZE; |
1149 | goto REPEATCHAR; |
1150 | |
1151 | case OP_UPTO: |
1152 | case OP_UPTOI: |
1153 | reptype = REPTYPE_MAX; |
1154 | Lmin = 0; |
1155 | Lmax = GET2(Fecode, 1); |
1156 | Fecode += 1 + IMM2_SIZE; |
1157 | goto REPEATCHAR; |
1158 | |
1159 | case OP_MINUPTO: |
1160 | case OP_MINUPTOI: |
1161 | reptype = REPTYPE_MIN; |
1162 | Lmin = 0; |
1163 | Lmax = GET2(Fecode, 1); |
1164 | Fecode += 1 + IMM2_SIZE; |
1165 | goto REPEATCHAR; |
1166 | |
1167 | case OP_POSSTAR: |
1168 | case OP_POSSTARI: |
1169 | reptype = REPTYPE_POS; |
1170 | Lmin = 0; |
1171 | Lmax = UINT32_MAX; |
1172 | Fecode++; |
1173 | goto REPEATCHAR; |
1174 | |
1175 | case OP_POSPLUS: |
1176 | case OP_POSPLUSI: |
1177 | reptype = REPTYPE_POS; |
1178 | Lmin = 1; |
1179 | Lmax = UINT32_MAX; |
1180 | Fecode++; |
1181 | goto REPEATCHAR; |
1182 | |
1183 | case OP_POSQUERY: |
1184 | case OP_POSQUERYI: |
1185 | reptype = REPTYPE_POS; |
1186 | Lmin = 0; |
1187 | Lmax = 1; |
1188 | Fecode++; |
1189 | goto REPEATCHAR; |
1190 | |
1191 | case OP_STAR: |
1192 | case OP_STARI: |
1193 | case OP_MINSTAR: |
1194 | case OP_MINSTARI: |
1195 | case OP_PLUS: |
1196 | case OP_PLUSI: |
1197 | case OP_MINPLUS: |
1198 | case OP_MINPLUSI: |
1199 | case OP_QUERY: |
1200 | case OP_QUERYI: |
1201 | case OP_MINQUERY: |
1202 | case OP_MINQUERYI: |
1203 | fc = *Fecode++ - ((Fop < OP_STARI)? OP_STAR : OP_STARI); |
1204 | Lmin = rep_min[fc]; |
1205 | Lmax = rep_max[fc]; |
1206 | reptype = rep_typ[fc]; |
1207 | |
1208 | /* Common code for all repeated single-character matches. We first check |
1209 | for the minimum number of characters. If the minimum equals the maximum, we |
1210 | are done. Otherwise, if minimizing, check the rest of the pattern for a |
1211 | match; if there isn't one, advance up to the maximum, one character at a |
1212 | time. |
1213 | |
1214 | If maximizing, advance up to the maximum number of matching characters, |
1215 | until Feptr is past the end of the maximum run. If possessive, we are |
1216 | then done (no backing up). Otherwise, match at this position; anything |
1217 | other than no match is immediately returned. For nomatch, back up one |
1218 | character and try again, repeating until Feptr is back at the first |
1219 | optional character position (the position reached after matching the |
1220 | minimum number of characters). |
1221 | |
1222 | The various UTF/non-UTF and caseful/caseless cases are handled separately, |
1223 | for speed. */ |
1224 | |
1225 | REPEATCHAR: |
1226 | #ifdef SUPPORT_UNICODE |
1227 | if (utf) |
1228 | { |
1229 | Flength = 1; |
1230 | Lcharptr = Fecode; |
1231 | GETCHARLEN(fc, Fecode, Flength); |
1232 | Fecode += Flength; |
1233 | |
1234 | /* Handle multi-code-unit character matching, caseful and caseless. */ |
1235 | |
1236 | if (Flength > 1) |
1237 | { |
1238 | uint32_t othercase; |
1239 | |
1240 | if (Fop >= OP_STARI && /* Caseless */ |
1241 | (othercase = UCD_OTHERCASE(fc)) != fc) |
1242 | Loclength = PRIV(ord2utf)(othercase, Foccu); |
1243 | else Loclength = 0; |
1244 | |
1245 | for (i = 1; i <= Lmin; i++) |
1246 | { |
1247 | if (Feptr <= mb->end_subject - Flength && |
1248 | memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength; |
1249 | else if (Loclength > 0 && |
1250 | Feptr <= mb->end_subject - Loclength && |
1251 | memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0) |
1252 | Feptr += Loclength; |
1253 | else |
1254 | { |
1255 | CHECK_PARTIAL(); |
1256 | RRETURN(MATCH_NOMATCH); |
1257 | } |
1258 | } |
1259 | |
1260 | if (Lmin == Lmax) continue; |
1261 | |
1262 | if (reptype == REPTYPE_MIN) |
1263 | { |
1264 | for (;;) |
1265 | { |
1266 | RMATCH(Fecode, RM202); |
1267 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1268 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1269 | if (Feptr <= mb->end_subject - Flength && |
1270 | memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength; |
1271 | else if (Loclength > 0 && |
1272 | Feptr <= mb->end_subject - Loclength && |
1273 | memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0) |
1274 | Feptr += Loclength; |
1275 | else |
1276 | { |
1277 | CHECK_PARTIAL(); |
1278 | RRETURN(MATCH_NOMATCH); |
1279 | } |
1280 | } |
1281 | /* Control never gets here */ |
1282 | } |
1283 | |
1284 | else /* Maximize */ |
1285 | { |
1286 | Lstart_eptr = Feptr; |
1287 | for (i = Lmin; i < Lmax; i++) |
1288 | { |
1289 | if (Feptr <= mb->end_subject - Flength && |
1290 | memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) |
1291 | Feptr += Flength; |
1292 | else if (Loclength > 0 && |
1293 | Feptr <= mb->end_subject - Loclength && |
1294 | memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0) |
1295 | Feptr += Loclength; |
1296 | else |
1297 | { |
1298 | CHECK_PARTIAL(); |
1299 | break; |
1300 | } |
1301 | } |
1302 | |
1303 | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
1304 | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
1305 | go too far. */ |
1306 | |
1307 | if (reptype != REPTYPE_POS) for(;;) |
1308 | { |
1309 | if (Feptr <= Lstart_eptr) break; |
1310 | RMATCH(Fecode, RM203); |
1311 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1312 | Feptr--; |
1313 | BACKCHAR(Feptr); |
1314 | } |
1315 | } |
1316 | break; /* End of repeated wide character handling */ |
1317 | } |
1318 | |
1319 | /* Length of UTF character is 1. Put it into the preserved variable and |
1320 | fall through to the non-UTF code. */ |
1321 | |
1322 | Lc = fc; |
1323 | } |
1324 | else |
1325 | #endif /* SUPPORT_UNICODE */ |
1326 | |
1327 | /* When not in UTF mode, load a single-code-unit character. Then proceed as |
1328 | above, using Unicode casing if either UTF or UCP is set. */ |
1329 | |
1330 | Lc = *Fecode++; |
1331 | |
1332 | /* Caseless comparison */ |
1333 | |
1334 | if (Fop >= OP_STARI) |
1335 | { |
1336 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
1337 | #ifdef SUPPORT_UNICODE |
1338 | if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc); |
1339 | else |
1340 | #endif /* SUPPORT_UNICODE */ |
1341 | /* Lc will be < 128 in UTF-8 mode. */ |
1342 | Loc = mb->fcc[Lc]; |
1343 | #else /* 16-bit & 32-bit */ |
1344 | #ifdef SUPPORT_UNICODE |
1345 | if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc); |
1346 | else |
1347 | #endif /* SUPPORT_UNICODE */ |
1348 | Loc = TABLE_GET(Lc, mb->fcc, Lc); |
1349 | #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ |
1350 | |
1351 | for (i = 1; i <= Lmin; i++) |
1352 | { |
1353 | uint32_t cc; /* Faster than PCRE2_UCHAR */ |
1354 | if (Feptr >= mb->end_subject) |
1355 | { |
1356 | SCHECK_PARTIAL(); |
1357 | RRETURN(MATCH_NOMATCH); |
1358 | } |
1359 | cc = UCHAR21TEST(Feptr); |
1360 | if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH); |
1361 | Feptr++; |
1362 | } |
1363 | if (Lmin == Lmax) continue; |
1364 | |
1365 | if (reptype == REPTYPE_MIN) |
1366 | { |
1367 | for (;;) |
1368 | { |
1369 | uint32_t cc; /* Faster than PCRE2_UCHAR */ |
1370 | RMATCH(Fecode, RM25); |
1371 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1372 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1373 | if (Feptr >= mb->end_subject) |
1374 | { |
1375 | SCHECK_PARTIAL(); |
1376 | RRETURN(MATCH_NOMATCH); |
1377 | } |
1378 | cc = UCHAR21TEST(Feptr); |
1379 | if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH); |
1380 | Feptr++; |
1381 | } |
1382 | /* Control never gets here */ |
1383 | } |
1384 | |
1385 | else /* Maximize */ |
1386 | { |
1387 | Lstart_eptr = Feptr; |
1388 | for (i = Lmin; i < Lmax; i++) |
1389 | { |
1390 | uint32_t cc; /* Faster than PCRE2_UCHAR */ |
1391 | if (Feptr >= mb->end_subject) |
1392 | { |
1393 | SCHECK_PARTIAL(); |
1394 | break; |
1395 | } |
1396 | cc = UCHAR21TEST(Feptr); |
1397 | if (Lc != cc && Loc != cc) break; |
1398 | Feptr++; |
1399 | } |
1400 | if (reptype != REPTYPE_POS) for (;;) |
1401 | { |
1402 | if (Feptr == Lstart_eptr) break; |
1403 | RMATCH(Fecode, RM26); |
1404 | Feptr--; |
1405 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1406 | } |
1407 | } |
1408 | } |
1409 | |
1410 | /* Caseful comparisons (multi-code-unit UTF characters were handled above) */ |
1411 | |
1412 | else |
1413 | { |
1414 | for (i = 1; i <= Lmin; i++) |
1415 | { |
1416 | if (Feptr >= mb->end_subject) |
1417 | { |
1418 | SCHECK_PARTIAL(); |
1419 | RRETURN(MATCH_NOMATCH); |
1420 | } |
1421 | if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH); |
1422 | } |
1423 | |
1424 | if (Lmin == Lmax) continue; |
1425 | |
1426 | if (reptype == REPTYPE_MIN) |
1427 | { |
1428 | for (;;) |
1429 | { |
1430 | RMATCH(Fecode, RM27); |
1431 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1432 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1433 | if (Feptr >= mb->end_subject) |
1434 | { |
1435 | SCHECK_PARTIAL(); |
1436 | RRETURN(MATCH_NOMATCH); |
1437 | } |
1438 | if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH); |
1439 | } |
1440 | /* Control never gets here */ |
1441 | } |
1442 | else /* Maximize */ |
1443 | { |
1444 | Lstart_eptr = Feptr; |
1445 | for (i = Lmin; i < Lmax; i++) |
1446 | { |
1447 | if (Feptr >= mb->end_subject) |
1448 | { |
1449 | SCHECK_PARTIAL(); |
1450 | break; |
1451 | } |
1452 | |
1453 | if (Lc != UCHAR21TEST(Feptr)) break; |
1454 | Feptr++; |
1455 | } |
1456 | |
1457 | if (reptype != REPTYPE_POS) for (;;) |
1458 | { |
1459 | if (Feptr <= Lstart_eptr) break; |
1460 | RMATCH(Fecode, RM28); |
1461 | Feptr--; |
1462 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1463 | } |
1464 | } |
1465 | } |
1466 | break; |
1467 | |
1468 | #undef Loclength |
1469 | #undef Lstart_eptr |
1470 | #undef Lcharptr |
1471 | #undef Lmin |
1472 | #undef Lmax |
1473 | #undef Lc |
1474 | #undef Loc |
1475 | |
1476 | |
1477 | /* ===================================================================== */ |
1478 | /* Match a negated single character repeatedly. This is almost a |
1479 | repeat of the code for a repeated single character, but I haven't found a |
1480 | nice way of commoning these up that doesn't require a test of the |
1481 | positive/negative option for each character match. Maybe that wouldn't add |
1482 | very much to the time taken, but character matching *is* what this is all |
1483 | about... */ |
1484 | |
1485 | #define Lstart_eptr F->temp_sptr[0] |
1486 | #define Lmin F->temp_32[0] |
1487 | #define Lmax F->temp_32[1] |
1488 | #define Lc F->temp_32[2] |
1489 | #define Loc F->temp_32[3] |
1490 | |
1491 | case OP_NOTEXACT: |
1492 | case OP_NOTEXACTI: |
1493 | Lmin = Lmax = GET2(Fecode, 1); |
1494 | Fecode += 1 + IMM2_SIZE; |
1495 | goto REPEATNOTCHAR; |
1496 | |
1497 | case OP_NOTUPTO: |
1498 | case OP_NOTUPTOI: |
1499 | Lmin = 0; |
1500 | Lmax = GET2(Fecode, 1); |
1501 | reptype = REPTYPE_MAX; |
1502 | Fecode += 1 + IMM2_SIZE; |
1503 | goto REPEATNOTCHAR; |
1504 | |
1505 | case OP_NOTMINUPTO: |
1506 | case OP_NOTMINUPTOI: |
1507 | Lmin = 0; |
1508 | Lmax = GET2(Fecode, 1); |
1509 | reptype = REPTYPE_MIN; |
1510 | Fecode += 1 + IMM2_SIZE; |
1511 | goto REPEATNOTCHAR; |
1512 | |
1513 | case OP_NOTPOSSTAR: |
1514 | case OP_NOTPOSSTARI: |
1515 | reptype = REPTYPE_POS; |
1516 | Lmin = 0; |
1517 | Lmax = UINT32_MAX; |
1518 | Fecode++; |
1519 | goto REPEATNOTCHAR; |
1520 | |
1521 | case OP_NOTPOSPLUS: |
1522 | case OP_NOTPOSPLUSI: |
1523 | reptype = REPTYPE_POS; |
1524 | Lmin = 1; |
1525 | Lmax = UINT32_MAX; |
1526 | Fecode++; |
1527 | goto REPEATNOTCHAR; |
1528 | |
1529 | case OP_NOTPOSQUERY: |
1530 | case OP_NOTPOSQUERYI: |
1531 | reptype = REPTYPE_POS; |
1532 | Lmin = 0; |
1533 | Lmax = 1; |
1534 | Fecode++; |
1535 | goto REPEATNOTCHAR; |
1536 | |
1537 | case OP_NOTPOSUPTO: |
1538 | case OP_NOTPOSUPTOI: |
1539 | reptype = REPTYPE_POS; |
1540 | Lmin = 0; |
1541 | Lmax = GET2(Fecode, 1); |
1542 | Fecode += 1 + IMM2_SIZE; |
1543 | goto REPEATNOTCHAR; |
1544 | |
1545 | case OP_NOTSTAR: |
1546 | case OP_NOTSTARI: |
1547 | case OP_NOTMINSTAR: |
1548 | case OP_NOTMINSTARI: |
1549 | case OP_NOTPLUS: |
1550 | case OP_NOTPLUSI: |
1551 | case OP_NOTMINPLUS: |
1552 | case OP_NOTMINPLUSI: |
1553 | case OP_NOTQUERY: |
1554 | case OP_NOTQUERYI: |
1555 | case OP_NOTMINQUERY: |
1556 | case OP_NOTMINQUERYI: |
1557 | fc = *Fecode++ - ((Fop >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR); |
1558 | Lmin = rep_min[fc]; |
1559 | Lmax = rep_max[fc]; |
1560 | reptype = rep_typ[fc]; |
1561 | |
1562 | /* Common code for all repeated single-character non-matches. */ |
1563 | |
1564 | REPEATNOTCHAR: |
1565 | GETCHARINCTEST(Lc, Fecode); |
1566 | |
1567 | /* The code is duplicated for the caseless and caseful cases, for speed, |
1568 | since matching characters is likely to be quite common. First, ensure the |
1569 | minimum number of matches are present. If Lmin = Lmax, we are done. |
1570 | Otherwise, if minimizing, keep trying the rest of the expression and |
1571 | advancing one matching character if failing, up to the maximum. |
1572 | Alternatively, if maximizing, find the maximum number of characters and |
1573 | work backwards. */ |
1574 | |
1575 | if (Fop >= OP_NOTSTARI) /* Caseless */ |
1576 | { |
1577 | #ifdef SUPPORT_UNICODE |
1578 | if ((utf || ucp) && Lc > 127) |
1579 | Loc = UCD_OTHERCASE(Lc); |
1580 | else |
1581 | #endif /* SUPPORT_UNICODE */ |
1582 | |
1583 | Loc = TABLE_GET(Lc, mb->fcc, Lc); /* Other case from table */ |
1584 | |
1585 | #ifdef SUPPORT_UNICODE |
1586 | if (utf) |
1587 | { |
1588 | uint32_t d; |
1589 | for (i = 1; i <= Lmin; i++) |
1590 | { |
1591 | if (Feptr >= mb->end_subject) |
1592 | { |
1593 | SCHECK_PARTIAL(); |
1594 | RRETURN(MATCH_NOMATCH); |
1595 | } |
1596 | GETCHARINC(d, Feptr); |
1597 | if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH); |
1598 | } |
1599 | } |
1600 | else |
1601 | #endif /* SUPPORT_UNICODE */ |
1602 | |
1603 | /* Not UTF mode */ |
1604 | { |
1605 | for (i = 1; i <= Lmin; i++) |
1606 | { |
1607 | if (Feptr >= mb->end_subject) |
1608 | { |
1609 | SCHECK_PARTIAL(); |
1610 | RRETURN(MATCH_NOMATCH); |
1611 | } |
1612 | if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH); |
1613 | Feptr++; |
1614 | } |
1615 | } |
1616 | |
1617 | if (Lmin == Lmax) continue; /* Finished for exact count */ |
1618 | |
1619 | if (reptype == REPTYPE_MIN) |
1620 | { |
1621 | #ifdef SUPPORT_UNICODE |
1622 | if (utf) |
1623 | { |
1624 | uint32_t d; |
1625 | for (;;) |
1626 | { |
1627 | RMATCH(Fecode, RM204); |
1628 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1629 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1630 | if (Feptr >= mb->end_subject) |
1631 | { |
1632 | SCHECK_PARTIAL(); |
1633 | RRETURN(MATCH_NOMATCH); |
1634 | } |
1635 | GETCHARINC(d, Feptr); |
1636 | if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH); |
1637 | } |
1638 | } |
1639 | else |
1640 | #endif /*SUPPORT_UNICODE */ |
1641 | |
1642 | /* Not UTF mode */ |
1643 | { |
1644 | for (;;) |
1645 | { |
1646 | RMATCH(Fecode, RM29); |
1647 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1648 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1649 | if (Feptr >= mb->end_subject) |
1650 | { |
1651 | SCHECK_PARTIAL(); |
1652 | RRETURN(MATCH_NOMATCH); |
1653 | } |
1654 | if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH); |
1655 | Feptr++; |
1656 | } |
1657 | } |
1658 | /* Control never gets here */ |
1659 | } |
1660 | |
1661 | /* Maximize case */ |
1662 | |
1663 | else |
1664 | { |
1665 | Lstart_eptr = Feptr; |
1666 | |
1667 | #ifdef SUPPORT_UNICODE |
1668 | if (utf) |
1669 | { |
1670 | uint32_t d; |
1671 | for (i = Lmin; i < Lmax; i++) |
1672 | { |
1673 | int len = 1; |
1674 | if (Feptr >= mb->end_subject) |
1675 | { |
1676 | SCHECK_PARTIAL(); |
1677 | break; |
1678 | } |
1679 | GETCHARLEN(d, Feptr, len); |
1680 | if (Lc == d || Loc == d) break; |
1681 | Feptr += len; |
1682 | } |
1683 | |
1684 | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
1685 | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
1686 | go too far. */ |
1687 | |
1688 | if (reptype != REPTYPE_POS) for(;;) |
1689 | { |
1690 | if (Feptr <= Lstart_eptr) break; |
1691 | RMATCH(Fecode, RM205); |
1692 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1693 | Feptr--; |
1694 | BACKCHAR(Feptr); |
1695 | } |
1696 | } |
1697 | else |
1698 | #endif /* SUPPORT_UNICODE */ |
1699 | |
1700 | /* Not UTF mode */ |
1701 | { |
1702 | for (i = Lmin; i < Lmax; i++) |
1703 | { |
1704 | if (Feptr >= mb->end_subject) |
1705 | { |
1706 | SCHECK_PARTIAL(); |
1707 | break; |
1708 | } |
1709 | if (Lc == *Feptr || Loc == *Feptr) break; |
1710 | Feptr++; |
1711 | } |
1712 | if (reptype != REPTYPE_POS) for (;;) |
1713 | { |
1714 | if (Feptr == Lstart_eptr) break; |
1715 | RMATCH(Fecode, RM30); |
1716 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1717 | Feptr--; |
1718 | } |
1719 | } |
1720 | } |
1721 | } |
1722 | |
1723 | /* Caseful comparisons */ |
1724 | |
1725 | else |
1726 | { |
1727 | #ifdef SUPPORT_UNICODE |
1728 | if (utf) |
1729 | { |
1730 | uint32_t d; |
1731 | for (i = 1; i <= Lmin; i++) |
1732 | { |
1733 | if (Feptr >= mb->end_subject) |
1734 | { |
1735 | SCHECK_PARTIAL(); |
1736 | RRETURN(MATCH_NOMATCH); |
1737 | } |
1738 | GETCHARINC(d, Feptr); |
1739 | if (Lc == d) RRETURN(MATCH_NOMATCH); |
1740 | } |
1741 | } |
1742 | else |
1743 | #endif |
1744 | /* Not UTF mode */ |
1745 | { |
1746 | for (i = 1; i <= Lmin; i++) |
1747 | { |
1748 | if (Feptr >= mb->end_subject) |
1749 | { |
1750 | SCHECK_PARTIAL(); |
1751 | RRETURN(MATCH_NOMATCH); |
1752 | } |
1753 | if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH); |
1754 | } |
1755 | } |
1756 | |
1757 | if (Lmin == Lmax) continue; |
1758 | |
1759 | if (reptype == REPTYPE_MIN) |
1760 | { |
1761 | #ifdef SUPPORT_UNICODE |
1762 | if (utf) |
1763 | { |
1764 | uint32_t d; |
1765 | for (;;) |
1766 | { |
1767 | RMATCH(Fecode, RM206); |
1768 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1769 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1770 | if (Feptr >= mb->end_subject) |
1771 | { |
1772 | SCHECK_PARTIAL(); |
1773 | RRETURN(MATCH_NOMATCH); |
1774 | } |
1775 | GETCHARINC(d, Feptr); |
1776 | if (Lc == d) RRETURN(MATCH_NOMATCH); |
1777 | } |
1778 | } |
1779 | else |
1780 | #endif |
1781 | /* Not UTF mode */ |
1782 | { |
1783 | for (;;) |
1784 | { |
1785 | RMATCH(Fecode, RM31); |
1786 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1787 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
1788 | if (Feptr >= mb->end_subject) |
1789 | { |
1790 | SCHECK_PARTIAL(); |
1791 | RRETURN(MATCH_NOMATCH); |
1792 | } |
1793 | if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH); |
1794 | } |
1795 | } |
1796 | /* Control never gets here */ |
1797 | } |
1798 | |
1799 | /* Maximize case */ |
1800 | |
1801 | else |
1802 | { |
1803 | Lstart_eptr = Feptr; |
1804 | |
1805 | #ifdef SUPPORT_UNICODE |
1806 | if (utf) |
1807 | { |
1808 | uint32_t d; |
1809 | for (i = Lmin; i < Lmax; i++) |
1810 | { |
1811 | int len = 1; |
1812 | if (Feptr >= mb->end_subject) |
1813 | { |
1814 | SCHECK_PARTIAL(); |
1815 | break; |
1816 | } |
1817 | GETCHARLEN(d, Feptr, len); |
1818 | if (Lc == d) break; |
1819 | Feptr += len; |
1820 | } |
1821 | |
1822 | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
1823 | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
1824 | go too far. */ |
1825 | |
1826 | if (reptype != REPTYPE_POS) for(;;) |
1827 | { |
1828 | if (Feptr <= Lstart_eptr) break; |
1829 | RMATCH(Fecode, RM207); |
1830 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1831 | Feptr--; |
1832 | BACKCHAR(Feptr); |
1833 | } |
1834 | } |
1835 | else |
1836 | #endif |
1837 | /* Not UTF mode */ |
1838 | { |
1839 | for (i = Lmin; i < Lmax; i++) |
1840 | { |
1841 | if (Feptr >= mb->end_subject) |
1842 | { |
1843 | SCHECK_PARTIAL(); |
1844 | break; |
1845 | } |
1846 | if (Lc == *Feptr) break; |
1847 | Feptr++; |
1848 | } |
1849 | if (reptype != REPTYPE_POS) for (;;) |
1850 | { |
1851 | if (Feptr == Lstart_eptr) break; |
1852 | RMATCH(Fecode, RM32); |
1853 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1854 | Feptr--; |
1855 | } |
1856 | } |
1857 | } |
1858 | } |
1859 | break; |
1860 | |
1861 | #undef Lstart_eptr |
1862 | #undef Lmin |
1863 | #undef Lmax |
1864 | #undef Lc |
1865 | #undef Loc |
1866 | |
1867 | |
1868 | /* ===================================================================== */ |
1869 | /* Match a bit-mapped character class, possibly repeatedly. These opcodes |
1870 | are used when all the characters in the class have values in the range |
1871 | 0-255, and either the matching is caseful, or the characters are in the |
1872 | range 0-127 when UTF processing is enabled. The only difference between |
1873 | OP_CLASS and OP_NCLASS occurs when a data character outside the range is |
1874 | encountered. */ |
1875 | |
1876 | #define Lmin F->temp_32[0] |
1877 | #define Lmax F->temp_32[1] |
1878 | #define Lstart_eptr F->temp_sptr[0] |
1879 | #define Lbyte_map_address F->temp_sptr[1] |
1880 | #define Lbyte_map ((unsigned char *)Lbyte_map_address) |
1881 | |
1882 | case OP_NCLASS: /* Shares all code with OP_CLASS; the two differ only for characters above 255 */ |
1883 | case OP_CLASS: |
1884 | { |
1885 | Lbyte_map_address = Fecode + 1; /* Save for matching: the bit map starts right after the opcode */ |
1886 | Fecode += 1 + (32 / sizeof(PCRE2_UCHAR)); /* Advance past the item (opcode plus 32 bytes of map) */ |
1887 | |
1888 | /* Look past the end of the item to see if there is repeat information |
1889 | following. Then obey similar code to character type repeats. */ |
1890 | |
1891 | switch (*Fecode) |
1892 | { |
1893 | case OP_CRSTAR: |
1894 | case OP_CRMINSTAR: |
1895 | case OP_CRPLUS: |
1896 | case OP_CRMINPLUS: |
1897 | case OP_CRQUERY: |
1898 | case OP_CRMINQUERY: |
1899 | case OP_CRPOSSTAR: |
1900 | case OP_CRPOSPLUS: |
1901 | case OP_CRPOSQUERY: |
1902 | fc = *Fecode++ - OP_CRSTAR; /* Offset from OP_CRSTAR indexes the rep_* tables */ |
1903 | Lmin = rep_min[fc]; |
1904 | Lmax = rep_max[fc]; |
1905 | reptype = rep_typ[fc]; |
1906 | break; |
1907 | |
1908 | case OP_CRRANGE: |
1909 | case OP_CRMINRANGE: |
1910 | case OP_CRPOSRANGE: |
1911 | Lmin = GET2(Fecode, 1); /* Explicit minimum follows the opcode */ |
1912 | Lmax = GET2(Fecode, 1 + IMM2_SIZE); /* Explicit maximum follows the minimum */ |
1913 | if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */ |
1914 | reptype = rep_typ[*Fecode - OP_CRSTAR]; |
1915 | Fecode += 1 + 2 * IMM2_SIZE; /* Skip the opcode and both counts */ |
1916 | break; |
1917 | |
1918 | default: /* No repeat follows */ |
1919 | Lmin = Lmax = 1; /* reptype is left unset; the Lmin == Lmax exit below is taken before it is read */ |
1920 | break; |
1921 | } |
1922 | |
1923 | /* First, ensure the minimum number of matches are present. */ |
1924 | |
1925 | #ifdef SUPPORT_UNICODE |
1926 | if (utf) |
1927 | { |
1928 | for (i = 1; i <= Lmin; i++) |
1929 | { |
1930 | if (Feptr >= mb->end_subject) |
1931 | { |
1932 | SCHECK_PARTIAL(); |
1933 | RRETURN(MATCH_NOMATCH); |
1934 | } |
1935 | GETCHARINC(fc, Feptr); |
1936 | if (fc > 255) /* Outside the bit map: only OP_CLASS rejects, OP_NCLASS carries on */ |
1937 | { |
1938 | if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); |
1939 | } |
1940 | else |
1941 | if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); /* Bit for fc is clear */ |
1942 | } |
1943 | } |
1944 | else |
1945 | #endif |
1946 | /* Not UTF mode */ |
1947 | { |
1948 | for (i = 1; i <= Lmin; i++) |
1949 | { |
1950 | if (Feptr >= mb->end_subject) |
1951 | { |
1952 | SCHECK_PARTIAL(); |
1953 | RRETURN(MATCH_NOMATCH); |
1954 | } |
1955 | fc = *Feptr++; |
1956 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
1957 | if (fc > 255) /* Only compiled for code unit widths other than 8 */ |
1958 | { |
1959 | if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); |
1960 | } |
1961 | else |
1962 | #endif |
1963 | if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); |
1964 | } |
1965 | } |
1966 | |
1967 | /* If Lmax == Lmin we are done. Continue with main loop. */ |
1968 | |
1969 | if (Lmin == Lmax) continue; |
1970 | |
1971 | /* If minimizing, keep testing the rest of the expression and advancing |
1972 | the pointer while it matches the class. */ |
1973 | |
1974 | if (reptype == REPTYPE_MIN) |
1975 | { |
1976 | #ifdef SUPPORT_UNICODE |
1977 | if (utf) |
1978 | { |
1979 | for (;;) |
1980 | { |
1981 | RMATCH(Fecode, RM200); /* Try the rest of the pattern before taking another character */ |
1982 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
1983 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); /* Lmin doubles as the running count */ |
1984 | if (Feptr >= mb->end_subject) |
1985 | { |
1986 | SCHECK_PARTIAL(); |
1987 | RRETURN(MATCH_NOMATCH); |
1988 | } |
1989 | GETCHARINC(fc, Feptr); |
1990 | if (fc > 255) |
1991 | { |
1992 | if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); |
1993 | } |
1994 | else |
1995 | if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); |
1996 | } |
1997 | } |
1998 | else |
1999 | #endif |
2000 | /* Not UTF mode */ |
2001 | { |
2002 | for (;;) |
2003 | { |
2004 | RMATCH(Fecode, RM23); /* Try the rest of the pattern before taking another character */ |
2005 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2006 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
2007 | if (Feptr >= mb->end_subject) |
2008 | { |
2009 | SCHECK_PARTIAL(); |
2010 | RRETURN(MATCH_NOMATCH); |
2011 | } |
2012 | fc = *Feptr++; |
2013 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
2014 | if (fc > 255) |
2015 | { |
2016 | if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); |
2017 | } |
2018 | else |
2019 | #endif |
2020 | if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); |
2021 | } |
2022 | } |
2023 | /* Control never gets here */ |
2024 | } |
2025 | |
2026 | /* If maximizing, find the longest possible run, then work backwards. */ |
2027 | |
2028 | else |
2029 | { |
2030 | Lstart_eptr = Feptr; /* Position reached after the mandatory matches; backtracking stops here */ |
2031 | |
2032 | #ifdef SUPPORT_UNICODE |
2033 | if (utf) |
2034 | { |
2035 | for (i = Lmin; i < Lmax; i++) |
2036 | { |
2037 | int len = 1; |
2038 | if (Feptr >= mb->end_subject) |
2039 | { |
2040 | SCHECK_PARTIAL(); |
2041 | break; |
2042 | } |
2043 | GETCHARLEN(fc, Feptr, len); /* Feptr is advanced separately below, only on acceptance */ |
2044 | if (fc > 255) |
2045 | { |
2046 | if (Fop == OP_CLASS) break; |
2047 | } |
2048 | else |
2049 | if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break; |
2050 | Feptr += len; |
2051 | } |
2052 | |
2053 | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
2054 | |
2055 | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
2056 | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
2057 | go too far. */ |
2058 | |
2059 | for (;;) |
2060 | { |
2061 | RMATCH(Fecode, RM201); |
2062 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2063 | if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */ |
2064 | BACKCHAR(Feptr); /* NOTE(review): presumably moves back to the first code unit of the character - confirm against the macro */ |
2065 | } |
2066 | } |
2067 | else |
2068 | #endif |
2069 | /* Not UTF mode */ |
2070 | { |
2071 | for (i = Lmin; i < Lmax; i++) |
2072 | { |
2073 | if (Feptr >= mb->end_subject) |
2074 | { |
2075 | SCHECK_PARTIAL(); |
2076 | break; |
2077 | } |
2078 | fc = *Feptr; /* Peek only; Feptr moves on acceptance */ |
2079 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
2080 | if (fc > 255) |
2081 | { |
2082 | if (Fop == OP_CLASS) break; |
2083 | } |
2084 | else |
2085 | #endif |
2086 | if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break; |
2087 | Feptr++; |
2088 | } |
2089 | |
2090 | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
2091 | |
2092 | while (Feptr >= Lstart_eptr) /* Retry at each position from the furthest back to Lstart_eptr inclusive */ |
2093 | { |
2094 | RMATCH(Fecode, RM24); |
2095 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2096 | Feptr--; |
2097 | } |
2098 | } |
2099 | |
2100 | RRETURN(MATCH_NOMATCH); /* Every candidate length has been tried */ |
2101 | } |
2102 | } |
2103 | /* Control never gets here */ |
2104 | |
2105 | #undef Lbyte_map_address |
2106 | #undef Lbyte_map |
2107 | #undef Lstart_eptr |
2108 | #undef Lmin |
2109 | #undef Lmax |
2110 | |
2111 | |
2112 | /* ===================================================================== */ |
2113 | /* Match an extended character class. In the 8-bit library, this opcode is |
2114 | encountered only when UTF-8 mode is supported. In the 16-bit and |
2115 | 32-bit libraries, codepoints greater than 255 may be encountered even when |
2116 | UTF is not supported. */ |
2117 | |
2118 | #define Lstart_eptr F->temp_sptr[0] |
2119 | #define Lxclass_data F->temp_sptr[1] |
2120 | #define Lmin F->temp_32[0] |
2121 | #define Lmax F->temp_32[1] |
2122 | |
2123 | #ifdef SUPPORT_WIDE_CHARS |
2124 | case OP_XCLASS: /* Extended class: membership is decided by PRIV(xclass) rather than a bare bit map */ |
2125 | { |
2126 | Lxclass_data = Fecode + 1 + LINK_SIZE; /* Save for matching */ |
2127 | Fecode += GET(Fecode, 1); /* Advance past the item */ |
2128 | |
2129 | switch (*Fecode) /* Pick up any repeat operator that follows the class */ |
2130 | { |
2131 | case OP_CRSTAR: |
2132 | case OP_CRMINSTAR: |
2133 | case OP_CRPLUS: |
2134 | case OP_CRMINPLUS: |
2135 | case OP_CRQUERY: |
2136 | case OP_CRMINQUERY: |
2137 | case OP_CRPOSSTAR: |
2138 | case OP_CRPOSPLUS: |
2139 | case OP_CRPOSQUERY: |
2140 | fc = *Fecode++ - OP_CRSTAR; /* Offset from OP_CRSTAR indexes the rep_* tables */ |
2141 | Lmin = rep_min[fc]; |
2142 | Lmax = rep_max[fc]; |
2143 | reptype = rep_typ[fc]; |
2144 | break; |
2145 | |
2146 | case OP_CRRANGE: |
2147 | case OP_CRMINRANGE: |
2148 | case OP_CRPOSRANGE: |
2149 | Lmin = GET2(Fecode, 1); |
2150 | Lmax = GET2(Fecode, 1 + IMM2_SIZE); |
2151 | if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */ |
2152 | reptype = rep_typ[*Fecode - OP_CRSTAR]; |
2153 | Fecode += 1 + 2 * IMM2_SIZE; /* Skip the opcode and both counts */ |
2154 | break; |
2155 | |
2156 | default: /* No repeat follows */ |
2157 | Lmin = Lmax = 1; /* reptype is left unset; the Lmin == Lmax exit below is taken before it is read */ |
2158 | break; |
2159 | } |
2160 | |
2161 | /* First, ensure the minimum number of matches are present. */ |
2162 | |
2163 | for (i = 1; i <= Lmin; i++) |
2164 | { |
2165 | if (Feptr >= mb->end_subject) |
2166 | { |
2167 | SCHECK_PARTIAL(); |
2168 | RRETURN(MATCH_NOMATCH); |
2169 | } |
2170 | GETCHARINCTEST(fc, Feptr); |
2171 | if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH); /* A zero result means fc is not in the class */ |
2172 | } |
2173 | |
2174 | /* If Lmax == Lmin we can just continue with the main loop. */ |
2175 | |
2176 | if (Lmin == Lmax) continue; |
2177 | |
2178 | /* If minimizing, keep testing the rest of the expression and advancing |
2179 | the pointer while it matches the class. */ |
2180 | |
2181 | if (reptype == REPTYPE_MIN) |
2182 | { |
2183 | for (;;) |
2184 | { |
2185 | RMATCH(Fecode, RM100); /* Try the rest of the pattern before taking another character */ |
2186 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2187 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); /* Lmin doubles as the running count */ |
2188 | if (Feptr >= mb->end_subject) |
2189 | { |
2190 | SCHECK_PARTIAL(); |
2191 | RRETURN(MATCH_NOMATCH); |
2192 | } |
2193 | GETCHARINCTEST(fc, Feptr); |
2194 | if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH); |
2195 | } |
2196 | /* Control never gets here */ |
2197 | } |
2198 | |
2199 | /* If maximizing, find the longest possible run, then work backwards. */ |
2200 | |
2201 | else |
2202 | { |
2203 | Lstart_eptr = Feptr; /* Position reached after the mandatory matches; backtracking stops here */ |
2204 | for (i = Lmin; i < Lmax; i++) |
2205 | { |
2206 | int len = 1; |
2207 | if (Feptr >= mb->end_subject) |
2208 | { |
2209 | SCHECK_PARTIAL(); |
2210 | break; |
2211 | } |
2212 | #ifdef SUPPORT_UNICODE |
2213 | GETCHARLENTEST(fc, Feptr, len); |
2214 | #else |
2215 | fc = *Feptr; |
2216 | #endif |
2217 | if (!PRIV(xclass)(fc, Lxclass_data, utf)) break; |
2218 | Feptr += len; /* len stays 1 when Unicode support is compiled out */ |
2219 | } |
2220 | |
2221 | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
2222 | |
2223 | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
2224 | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
2225 | go too far. */ |
2226 | |
2227 | for(;;) |
2228 | { |
2229 | RMATCH(Fecode, RM101); |
2230 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
2231 | if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */ |
2232 | #ifdef SUPPORT_UNICODE |
2233 | if (utf) BACKCHAR(Feptr); |
2234 | #endif |
2235 | } |
2236 | RRETURN(MATCH_NOMATCH); /* Every candidate length has been tried */ |
2237 | } |
2238 | |
2239 | /* Control never gets here */ |
2240 | } |
2241 | #endif /* SUPPORT_WIDE_CHARS: end of XCLASS */ |
2242 | |
2243 | #undef Lstart_eptr |
2244 | #undef Lxclass_data |
2245 | #undef Lmin |
2246 | #undef Lmax |
2247 | |
2248 | |
2249 | /* ===================================================================== */ |
2250 | /* Match various character types when PCRE2_UCP is not set. These opcodes |
2251 | are not generated when PCRE2_UCP is set - instead appropriate property |
2252 | tests are compiled. */ |
2253 | |
2254 | case OP_NOT_DIGIT: /* One character that is not flagged ctype_digit in mb->ctypes */ |
2255 | if (Feptr >= mb->end_subject) /* Nothing left in the subject to test */ |
2256 | { |
2257 | SCHECK_PARTIAL(); |
2258 | RRETURN(MATCH_NOMATCH); |
2259 | } |
2260 | GETCHARINCTEST(fc, Feptr); |
2261 | if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0) /* ctypes[] is consulted only when CHMAX_255(fc) holds; other characters are accepted */ |
2262 | RRETURN(MATCH_NOMATCH); |
2263 | Fecode++; /* Step over this opcode */ |
2264 | break; |
2265 | |
2266 | case OP_DIGIT: /* One character that is flagged ctype_digit in mb->ctypes */ |
2267 | if (Feptr >= mb->end_subject) /* Nothing left in the subject to test */ |
2268 | { |
2269 | SCHECK_PARTIAL(); |
2270 | RRETURN(MATCH_NOMATCH); |
2271 | } |
2272 | GETCHARINCTEST(fc, Feptr); |
2273 | if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0) /* Characters failing CHMAX_255 are rejected without a table lookup */ |
2274 | RRETURN(MATCH_NOMATCH); |
2275 | Fecode++; /* Step over this opcode */ |
2276 | break; |
2277 | |
2278 | case OP_NOT_WHITESPACE: /* One character that is not flagged ctype_space in mb->ctypes */ |
2279 | if (Feptr >= mb->end_subject) /* Nothing left in the subject to test */ |
2280 | { |
2281 | SCHECK_PARTIAL(); |
2282 | RRETURN(MATCH_NOMATCH); |
2283 | } |
2284 | GETCHARINCTEST(fc, Feptr); |
2285 | if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0) /* ctypes[] is consulted only when CHMAX_255(fc) holds; other characters are accepted */ |
2286 | RRETURN(MATCH_NOMATCH); |
2287 | Fecode++; /* Step over this opcode */ |
2288 | break; |
2289 | |
2290 | case OP_WHITESPACE: /* One character that is flagged ctype_space in mb->ctypes */ |
2291 | if (Feptr >= mb->end_subject) /* Nothing left in the subject to test */ |
2292 | { |
2293 | SCHECK_PARTIAL(); |
2294 | RRETURN(MATCH_NOMATCH); |
2295 | } |
2296 | GETCHARINCTEST(fc, Feptr); |
2297 | if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0) /* Characters failing CHMAX_255 are rejected without a table lookup */ |
2298 | RRETURN(MATCH_NOMATCH); |
2299 | Fecode++; /* Step over this opcode */ |
2300 | break; |
2301 | |
2302 | case OP_NOT_WORDCHAR: /* One character that is not flagged ctype_word in mb->ctypes */ |
2303 | if (Feptr >= mb->end_subject) /* Nothing left in the subject to test */ |
2304 | { |
2305 | SCHECK_PARTIAL(); |
2306 | RRETURN(MATCH_NOMATCH); |
2307 | } |
2308 | GETCHARINCTEST(fc, Feptr); |
2309 | if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0) /* ctypes[] is consulted only when CHMAX_255(fc) holds; other characters are accepted */ |
2310 | RRETURN(MATCH_NOMATCH); |
2311 | Fecode++; /* Step over this opcode */ |
2312 | break; |
2313 | |
2314 | case OP_WORDCHAR: /* One character that is flagged ctype_word in mb->ctypes */ |
2315 | if (Feptr >= mb->end_subject) /* Nothing left in the subject to test */ |
2316 | { |
2317 | SCHECK_PARTIAL(); |
2318 | RRETURN(MATCH_NOMATCH); |
2319 | } |
2320 | GETCHARINCTEST(fc, Feptr); |
2321 | if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0) /* Characters failing CHMAX_255 are rejected without a table lookup */ |
2322 | RRETURN(MATCH_NOMATCH); |
2323 | Fecode++; /* Step over this opcode */ |
2324 | break; |
2325 | |
2326 | case OP_ANYNL: /* One newline sequence; CR followed by LF is consumed as a single unit */ |
2327 | if (Feptr >= mb->end_subject) /* Nothing left in the subject to test */ |
2328 | { |
2329 | SCHECK_PARTIAL(); |
2330 | RRETURN(MATCH_NOMATCH); |
2331 | } |
2332 | GETCHARINCTEST(fc, Feptr); |
2333 | switch(fc) |
2334 | { |
2335 | default: RRETURN(MATCH_NOMATCH); /* Not a newline character at all */ |
2336 | |
2337 | case CHAR_CR: |
2338 | if (Feptr >= mb->end_subject) /* CR is the last character, so a following LF cannot be ruled in or out */ |
2339 | { |
2340 | SCHECK_PARTIAL(); /* No RRETURN follows: unless the macro exits, the lone CR is accepted */ |
2341 | } |
2342 | else if (UCHAR21TEST(Feptr) == CHAR_LF) Feptr++; /* Swallow the LF of a CRLF pair */ |
2343 | break; |
2344 | |
2345 | case CHAR_LF: |
2346 | break; |
2347 | |
2348 | case CHAR_VT: |
2349 | case CHAR_FF: |
2350 | case CHAR_NEL: |
2351 | #ifndef EBCDIC |
2352 | case 0x2028: |
2353 | case 0x2029: |
2354 | #endif /* Not EBCDIC */ |
2355 | if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH); /* These are rejected under the ANYCRLF convention */ |
2356 | break; |
2357 | } |
2358 | Fecode++; /* Step over this opcode */ |
2359 | break; |
2360 | |
2361 | case OP_NOT_HSPACE: /* One character that is not among the HSPACE_CASES values */ |
2362 | if (Feptr >= mb->end_subject) /* Nothing left in the subject to test */ |
2363 | { |
2364 | SCHECK_PARTIAL(); |
2365 | RRETURN(MATCH_NOMATCH); |
2366 | } |
2367 | GETCHARINCTEST(fc, Feptr); |
2368 | switch(fc) |
2369 | { |
2370 | HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */ |
2371 | default: break; |
2372 | } |
2373 | Fecode++; /* Step over this opcode */ |
2374 | break; |
2375 | |
2376 | case OP_HSPACE: /* One character that is among the HSPACE_CASES values */ |
2377 | if (Feptr >= mb->end_subject) /* Nothing left in the subject to test */ |
2378 | { |
2379 | SCHECK_PARTIAL(); |
2380 | RRETURN(MATCH_NOMATCH); |
2381 | } |
2382 | GETCHARINCTEST(fc, Feptr); |
2383 | switch(fc) |
2384 | { |
2385 | HSPACE_CASES: break; /* Byte and multibyte cases */ |
2386 | default: RRETURN(MATCH_NOMATCH); |
2387 | } |
2388 | Fecode++; /* Step over this opcode */ |
2389 | break; |
2390 | |
2391 | case OP_NOT_VSPACE: /* One character that is not among the VSPACE_CASES values */ |
2392 | if (Feptr >= mb->end_subject) /* Nothing left in the subject to test */ |
2393 | { |
2394 | SCHECK_PARTIAL(); |
2395 | RRETURN(MATCH_NOMATCH); |
2396 | } |
2397 | GETCHARINCTEST(fc, Feptr); |
2398 | switch(fc) |
2399 | { |
2400 | VSPACE_CASES: RRETURN(MATCH_NOMATCH); /* The macro supplies the case labels */ |
2401 | default: break; |
2402 | } |
2403 | Fecode++; /* Step over this opcode */ |
2404 | break; |
2405 | |
2406 | case OP_VSPACE: /* One character that is among the VSPACE_CASES values */ |
2407 | if (Feptr >= mb->end_subject) /* Nothing left in the subject to test */ |
2408 | { |
2409 | SCHECK_PARTIAL(); |
2410 | RRETURN(MATCH_NOMATCH); |
2411 | } |
2412 | GETCHARINCTEST(fc, Feptr); |
2413 | switch(fc) |
2414 | { |
2415 | VSPACE_CASES: break; /* The macro supplies the case labels */ |
2416 | default: RRETURN(MATCH_NOMATCH); |
2417 | } |
2418 | Fecode++; /* Step over this opcode */ |
2419 | break; |
2420 | |
2421 | |
2422 | #ifdef SUPPORT_UNICODE |
2423 | |
2424 | /* ===================================================================== */ |
2425 | /* Check the next character by Unicode property. We will get here only |
2426 | if the support is in the binary; otherwise a compile-time error occurs. */ |
2427 | |
2428 | case OP_PROP: /* One character tested against a Unicode property; Fecode[1] is the test type, Fecode[2] its value */ |
2429 | case OP_NOTPROP: /* Same tests with the outcome inverted through notmatch */ |
2430 | if (Feptr >= mb->end_subject) /* Nothing left in the subject to test */ |
2431 | { |
2432 | SCHECK_PARTIAL(); |
2433 | RRETURN(MATCH_NOMATCH); |
2434 | } |
2435 | GETCHARINCTEST(fc, Feptr); |
2436 | { |
2437 | const uint32_t *cp; |
2438 | const ucd_record *prop = GET_UCD(fc); |
2439 | BOOL notmatch = Fop == OP_NOTPROP; |
2440 | |
2441 | switch(Fecode[1]) /* Each test has the form (property holds) == notmatch, which is true exactly when the item fails */ |
2442 | { |
2443 | case PT_ANY: |
2444 | if (notmatch) RRETURN(MATCH_NOMATCH); /* Every character has "any", so only the negated form fails */ |
2445 | break; |
2446 | |
2447 | case PT_LAMP: |
2448 | if ((prop->chartype == ucp_Lu || |
2449 | prop->chartype == ucp_Ll || |
2450 | prop->chartype == ucp_Lt) == notmatch) |
2451 | RRETURN(MATCH_NOMATCH); |
2452 | break; |
2453 | |
2454 | case PT_GC: |
2455 | if ((Fecode[2] == PRIV(ucp_gentype)[prop->chartype]) == notmatch) /* chartype is mapped through ucp_gentype before comparing */ |
2456 | RRETURN(MATCH_NOMATCH); |
2457 | break; |
2458 | |
2459 | case PT_PC: |
2460 | if ((Fecode[2] == prop->chartype) == notmatch) |
2461 | RRETURN(MATCH_NOMATCH); |
2462 | break; |
2463 | |
2464 | case PT_SC: |
2465 | if ((Fecode[2] == prop->script) == notmatch) |
2466 | RRETURN(MATCH_NOMATCH); |
2467 | break; |
2468 | |
2469 | case PT_SCX: /* Accepts the character's own script or a set bit in its script-extension map */ |
2470 | { |
2471 | BOOL ok = (Fecode[2] == prop->script || |
2472 | MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Fecode[2]) != 0); |
2473 | if (ok == notmatch) RRETURN(MATCH_NOMATCH); |
2474 | } |
2475 | break; |
2476 | |
2477 | /* These are specials */ |
2478 | |
2479 | case PT_ALNUM: |
2480 | if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || |
2481 | PRIV(ucp_gentype)[prop->chartype] == ucp_N) == notmatch) |
2482 | RRETURN(MATCH_NOMATCH); |
2483 | break; |
2484 | |
2485 | /* Perl space used to exclude VT, but from Perl 5.18 it is included, |
2486 | which means that Perl space and POSIX space are now identical. PCRE |
2487 | was changed at release 8.34. */ |
2488 | |
2489 | case PT_SPACE: /* Perl space */ |
2490 | case PT_PXSPACE: /* POSIX space */ |
2491 | switch(fc) |
2492 | { |
2493 | HSPACE_CASES: |
2494 | VSPACE_CASES: |
2495 | if (notmatch) RRETURN(MATCH_NOMATCH); /* Listed space characters always count as space */ |
2496 | break; |
2497 | |
2498 | default: |
2499 | if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == notmatch) /* Otherwise decided by the ucp_Z category test */ |
2500 | RRETURN(MATCH_NOMATCH); |
2501 | break; |
2502 | } |
2503 | break; |
2504 | |
2505 | case PT_WORD: |
2506 | if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || |
2507 | PRIV(ucp_gentype)[prop->chartype] == ucp_N || |
2508 | fc == CHAR_UNDERSCORE) == notmatch) |
2509 | RRETURN(MATCH_NOMATCH); |
2510 | break; |
2511 | |
2512 | case PT_CLIST: /* Fecode[2] is an offset into PRIV(ucd_caseless_sets) */ |
2513 | cp = PRIV(ucd_caseless_sets) + Fecode[2]; |
2514 | for (;;) /* NOTE(review): the scan only ends on an entry >= fc, so the list is presumably ascending with a terminating sentinel - confirm */ |
2515 | { |
2516 | if (fc < *cp) /* Passed the point where fc could appear: not in the list */ |
2517 | { if (notmatch) break; else { RRETURN(MATCH_NOMATCH); } } |
2518 | if (fc == *cp++) /* Found in the list */ |
2519 | { if (notmatch) { RRETURN(MATCH_NOMATCH); } else break; } |
2520 | } |
2521 | break; |
2522 | |
2523 | case PT_UCNC: |
2524 | if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || |
2525 | fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || |
2526 | fc >= 0xe000) == notmatch) |
2527 | RRETURN(MATCH_NOMATCH); |
2528 | break; |
2529 | |
2530 | case PT_BIDICL: |
2531 | if ((UCD_BIDICLASS_PROP(prop) == Fecode[2]) == notmatch) |
2532 | RRETURN(MATCH_NOMATCH); |
2533 | break; |
2534 | |
2535 | case PT_BOOL: /* Tests one bit in the character's boolean-property bit map */ |
2536 | { |
2537 | BOOL ok = MAPBIT(PRIV(ucd_boolprop_sets) + |
2538 | UCD_BPROPS_PROP(prop), Fecode[2]) != 0; |
2539 | if (ok == notmatch) RRETURN(MATCH_NOMATCH); |
2540 | } |
2541 | break; |
2542 | |
2543 | /* This should never occur */ |
2544 | |
2545 | default: |
2546 | return PCRE2_ERROR_INTERNAL; /* Plain return rather than RRETURN */ |
2547 | } |
2548 | |
2549 | Fecode += 3; /* Skip the opcode, the test type and the test value */ |
2550 | } |
2551 | break; |
2552 | |
2553 | |
2554 | /* ===================================================================== */ |
2555 | /* Match an extended Unicode sequence. We will get here only if the support |
2556 | is in the binary; otherwise a compile-time error occurs. */ |
2557 | |
2558 | case OP_EXTUNI: /* One extended Unicode sequence; PRIV(extuni) decides how far it reaches */ |
2559 | if (Feptr >= mb->end_subject) /* Nothing left in the subject to test */ |
2560 | { |
2561 | SCHECK_PARTIAL(); |
2562 | RRETURN(MATCH_NOMATCH); |
2563 | } |
2564 | else |
2565 | { |
2566 | GETCHARINCTEST(fc, Feptr); |
2567 | Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, utf, |
2568 | NULL); /* The return value becomes the new subject position */ |
2569 | } |
2570 | CHECK_PARTIAL(); /* NOTE(review): CHECK_PARTIAL here, not SCHECK_PARTIAL as in the end-of-subject branch - see the macro definitions for the difference */ |
2571 | Fecode++; /* Step over this opcode */ |
2572 | break; |
2573 | |
2574 | #endif /* SUPPORT_UNICODE */ |
2575 | |
2576 | |
2577 | /* ===================================================================== */ |
2578 | /* Match a single character type repeatedly. Note that the property type |
2579 | does not need to be in a stack frame as it is not used within an RMATCH() |
2580 | loop. */ |
2581 | |
2582 | #define Lstart_eptr F->temp_sptr[0] |
2583 | #define Lmin F->temp_32[0] |
2584 | #define Lmax F->temp_32[1] |
2585 | #define Lctype F->temp_32[2] |
2586 | #define Lpropvalue F->temp_32[3] |
2587 | |
2588 | case OP_TYPEEXACT: |
2589 | Lmin = Lmax = GET2(Fecode, 1); |
2590 | Fecode += 1 + IMM2_SIZE; |
2591 | goto REPEATTYPE; |
2592 | |
2593 | case OP_TYPEUPTO: |
2594 | case OP_TYPEMINUPTO: |
2595 | Lmin = 0; |
2596 | Lmax = GET2(Fecode, 1); |
2597 | reptype = (*Fecode == OP_TYPEMINUPTO)? REPTYPE_MIN : REPTYPE_MAX; |
2598 | Fecode += 1 + IMM2_SIZE; |
2599 | goto REPEATTYPE; |
2600 | |
2601 | case OP_TYPEPOSSTAR: |
2602 | reptype = REPTYPE_POS; |
2603 | Lmin = 0; |
2604 | Lmax = UINT32_MAX; |
2605 | Fecode++; |
2606 | goto REPEATTYPE; |
2607 | |
2608 | case OP_TYPEPOSPLUS: |
2609 | reptype = REPTYPE_POS; |
2610 | Lmin = 1; |
2611 | Lmax = UINT32_MAX; |
2612 | Fecode++; |
2613 | goto REPEATTYPE; |
2614 | |
2615 | case OP_TYPEPOSQUERY: |
2616 | reptype = REPTYPE_POS; |
2617 | Lmin = 0; |
2618 | Lmax = 1; |
2619 | Fecode++; |
2620 | goto REPEATTYPE; |
2621 | |
2622 | case OP_TYPEPOSUPTO: |
2623 | reptype = REPTYPE_POS; |
2624 | Lmin = 0; |
2625 | Lmax = GET2(Fecode, 1); |
2626 | Fecode += 1 + IMM2_SIZE; |
2627 | goto REPEATTYPE; |
2628 | |
2629 | case OP_TYPESTAR: |
2630 | case OP_TYPEMINSTAR: |
2631 | case OP_TYPEPLUS: |
2632 | case OP_TYPEMINPLUS: |
2633 | case OP_TYPEQUERY: |
2634 | case OP_TYPEMINQUERY: |
2635 | fc = *Fecode++ - OP_TYPESTAR; |
2636 | Lmin = rep_min[fc]; |
2637 | Lmax = rep_max[fc]; |
2638 | reptype = rep_typ[fc]; |
2639 | |
2640 | /* Common code for all repeated character type matches. */ |
2641 | |
2642 | REPEATTYPE: |
2643 | Lctype = *Fecode++; /* Code for the character type */ |
2644 | |
2645 | #ifdef SUPPORT_UNICODE |
2646 | if (Lctype == OP_PROP || Lctype == OP_NOTPROP) |
2647 | { |
2648 | proptype = *Fecode++; |
2649 | Lpropvalue = *Fecode++; |
2650 | } |
2651 | else proptype = -1; |
2652 | #endif |
2653 | |
2654 | /* First, ensure the minimum number of matches are present. Use inline |
2655 | code for maximizing the speed, and do the type test once at the start |
2656 | (i.e. keep it out of the loops). As there are no calls to RMATCH in the |
2657 | loops, we can use an ordinary variable for "notmatch". The code for UTF |
2658 | mode is separated out for tidiness, except for Unicode property tests. */ |
2659 | |
2660 | if (Lmin > 0) |
2661 | { |
2662 | #ifdef SUPPORT_UNICODE |
2663 | if (proptype >= 0) /* Property tests in all modes */ |
2664 | { |
2665 | BOOL notmatch = Lctype == OP_NOTPROP; |
2666 | switch(proptype) |
2667 | { |
2668 | case PT_ANY: |
2669 | if (notmatch) RRETURN(MATCH_NOMATCH); |
2670 | for (i = 1; i <= Lmin; i++) |
2671 | { |
2672 | if (Feptr >= mb->end_subject) |
2673 | { |
2674 | SCHECK_PARTIAL(); |
2675 | RRETURN(MATCH_NOMATCH); |
2676 | } |
2677 | GETCHARINCTEST(fc, Feptr); |
2678 | } |
2679 | break; |
2680 | |
2681 | case PT_LAMP: |
2682 | for (i = 1; i <= Lmin; i++) |
2683 | { |
2684 | int chartype; |
2685 | if (Feptr >= mb->end_subject) |
2686 | { |
2687 | SCHECK_PARTIAL(); |
2688 | RRETURN(MATCH_NOMATCH); |
2689 | } |
2690 | GETCHARINCTEST(fc, Feptr); |
2691 | chartype = UCD_CHARTYPE(fc); |
2692 | if ((chartype == ucp_Lu || |
2693 | chartype == ucp_Ll || |
2694 | chartype == ucp_Lt) == notmatch) |
2695 | RRETURN(MATCH_NOMATCH); |
2696 | } |
2697 | break; |
2698 | |
2699 | case PT_GC: |
2700 | for (i = 1; i <= Lmin; i++) |
2701 | { |
2702 | if (Feptr >= mb->end_subject) |
2703 | { |
2704 | SCHECK_PARTIAL(); |
2705 | RRETURN(MATCH_NOMATCH); |
2706 | } |
2707 | GETCHARINCTEST(fc, Feptr); |
2708 | if ((UCD_CATEGORY(fc) == Lpropvalue) == notmatch) |
2709 | RRETURN(MATCH_NOMATCH); |
2710 | } |
2711 | break; |
2712 | |
2713 | case PT_PC: |
2714 | for (i = 1; i <= Lmin; i++) |
2715 | { |
2716 | if (Feptr >= mb->end_subject) |
2717 | { |
2718 | SCHECK_PARTIAL(); |
2719 | RRETURN(MATCH_NOMATCH); |
2720 | } |
2721 | GETCHARINCTEST(fc, Feptr); |
2722 | if ((UCD_CHARTYPE(fc) == Lpropvalue) == notmatch) |
2723 | RRETURN(MATCH_NOMATCH); |
2724 | } |
2725 | break; |
2726 | |
2727 | case PT_SC: |
2728 | for (i = 1; i <= Lmin; i++) |
2729 | { |
2730 | if (Feptr >= mb->end_subject) |
2731 | { |
2732 | SCHECK_PARTIAL(); |
2733 | RRETURN(MATCH_NOMATCH); |
2734 | } |
2735 | GETCHARINCTEST(fc, Feptr); |
2736 | if ((UCD_SCRIPT(fc) == Lpropvalue) == notmatch) |
2737 | RRETURN(MATCH_NOMATCH); |
2738 | } |
2739 | break; |
2740 | |
2741 | case PT_SCX: |
2742 | for (i = 1; i <= Lmin; i++) |
2743 | { |
2744 | BOOL ok; |
2745 | const ucd_record *prop; |
2746 | if (Feptr >= mb->end_subject) |
2747 | { |
2748 | SCHECK_PARTIAL(); |
2749 | RRETURN(MATCH_NOMATCH); |
2750 | } |
2751 | GETCHARINCTEST(fc, Feptr); |
2752 | prop = GET_UCD(fc); |
2753 | ok = (prop->script == Lpropvalue || |
2754 | MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0); |
2755 | if (ok == notmatch) |
2756 | RRETURN(MATCH_NOMATCH); |
2757 | } |
2758 | break; |
2759 | |
2760 | case PT_ALNUM: |
2761 | for (i = 1; i <= Lmin; i++) |
2762 | { |
2763 | int category; |
2764 | if (Feptr >= mb->end_subject) |
2765 | { |
2766 | SCHECK_PARTIAL(); |
2767 | RRETURN(MATCH_NOMATCH); |
2768 | } |
2769 | GETCHARINCTEST(fc, Feptr); |
2770 | category = UCD_CATEGORY(fc); |
2771 | if ((category == ucp_L || category == ucp_N) == notmatch) |
2772 | RRETURN(MATCH_NOMATCH); |
2773 | } |
2774 | break; |
2775 | |
2776 | /* Perl space used to exclude VT, but from Perl 5.18 it is included, |
2777 | which means that Perl space and POSIX space are now identical. PCRE |
2778 | was changed at release 8.34. */ |
2779 | |
2780 | case PT_SPACE: /* Perl space */ |
2781 | case PT_PXSPACE: /* POSIX space */ |
2782 | for (i = 1; i <= Lmin; i++) |
2783 | { |
2784 | if (Feptr >= mb->end_subject) |
2785 | { |
2786 | SCHECK_PARTIAL(); |
2787 | RRETURN(MATCH_NOMATCH); |
2788 | } |
2789 | GETCHARINCTEST(fc, Feptr); |
2790 | switch(fc) |
2791 | { |
2792 | HSPACE_CASES: |
2793 | VSPACE_CASES: |
2794 | if (notmatch) RRETURN(MATCH_NOMATCH); |
2795 | break; |
2796 | |
2797 | default: |
2798 | if ((UCD_CATEGORY(fc) == ucp_Z) == notmatch) |
2799 | RRETURN(MATCH_NOMATCH); |
2800 | break; |
2801 | } |
2802 | } |
2803 | break; |
2804 | |
2805 | case PT_WORD: |
2806 | for (i = 1; i <= Lmin; i++) |
2807 | { |
2808 | int category; |
2809 | if (Feptr >= mb->end_subject) |
2810 | { |
2811 | SCHECK_PARTIAL(); |
2812 | RRETURN(MATCH_NOMATCH); |
2813 | } |
2814 | GETCHARINCTEST(fc, Feptr); |
2815 | category = UCD_CATEGORY(fc); |
2816 | if ((category == ucp_L || category == ucp_N || |
2817 | fc == CHAR_UNDERSCORE) == notmatch) |
2818 | RRETURN(MATCH_NOMATCH); |
2819 | } |
2820 | break; |
2821 | |
2822 | case PT_CLIST: |
2823 | for (i = 1; i <= Lmin; i++) |
2824 | { |
2825 | const uint32_t *cp; |
2826 | if (Feptr >= mb->end_subject) |
2827 | { |
2828 | SCHECK_PARTIAL(); |
2829 | RRETURN(MATCH_NOMATCH); |
2830 | } |
2831 | GETCHARINCTEST(fc, Feptr); |
2832 | cp = PRIV(ucd_caseless_sets) + Lpropvalue; |
2833 | for (;;) |
2834 | { |
2835 | if (fc < *cp) |
2836 | { |
2837 | if (notmatch) break; |
2838 | RRETURN(MATCH_NOMATCH); |
2839 | } |
2840 | if (fc == *cp++) |
2841 | { |
2842 | if (notmatch) RRETURN(MATCH_NOMATCH); |
2843 | break; |
2844 | } |
2845 | } |
2846 | } |
2847 | break; |
2848 | |
2849 | case PT_UCNC: |
2850 | for (i = 1; i <= Lmin; i++) |
2851 | { |
2852 | if (Feptr >= mb->end_subject) |
2853 | { |
2854 | SCHECK_PARTIAL(); |
2855 | RRETURN(MATCH_NOMATCH); |
2856 | } |
2857 | GETCHARINCTEST(fc, Feptr); |
2858 | if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || |
2859 | fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || |
2860 | fc >= 0xe000) == notmatch) |
2861 | RRETURN(MATCH_NOMATCH); |
2862 | } |
2863 | break; |
2864 | |
2865 | case PT_BIDICL: |
2866 | for (i = 1; i <= Lmin; i++) |
2867 | { |
2868 | if (Feptr >= mb->end_subject) |
2869 | { |
2870 | SCHECK_PARTIAL(); |
2871 | RRETURN(MATCH_NOMATCH); |
2872 | } |
2873 | GETCHARINCTEST(fc, Feptr); |
2874 | if ((UCD_BIDICLASS(fc) == Lpropvalue) == notmatch) |
2875 | RRETURN(MATCH_NOMATCH); |
2876 | } |
2877 | break; |
2878 | |
2879 | case PT_BOOL: |
2880 | for (i = 1; i <= Lmin; i++) |
2881 | { |
2882 | BOOL ok; |
2883 | const ucd_record *prop; |
2884 | if (Feptr >= mb->end_subject) |
2885 | { |
2886 | SCHECK_PARTIAL(); |
2887 | RRETURN(MATCH_NOMATCH); |
2888 | } |
2889 | GETCHARINCTEST(fc, Feptr); |
2890 | prop = GET_UCD(fc); |
2891 | ok = MAPBIT(PRIV(ucd_boolprop_sets) + |
2892 | UCD_BPROPS_PROP(prop), Lpropvalue) != 0; |
2893 | if (ok == notmatch) |
2894 | RRETURN(MATCH_NOMATCH); |
2895 | } |
2896 | break; |
2897 | |
2898 | /* This should not occur */ |
2899 | |
2900 | default: |
2901 | return PCRE2_ERROR_INTERNAL; |
2902 | } |
2903 | } |
2904 | |
2905 | /* Match extended Unicode sequences. We will get here only if the |
2906 | support is in the binary; otherwise a compile-time error occurs. */ |
2907 | |
2908 | else if (Lctype == OP_EXTUNI) |
2909 | { |
2910 | for (i = 1; i <= Lmin; i++) |
2911 | { |
2912 | if (Feptr >= mb->end_subject) |
2913 | { |
2914 | SCHECK_PARTIAL(); |
2915 | RRETURN(MATCH_NOMATCH); |
2916 | } |
2917 | else |
2918 | { |
2919 | GETCHARINCTEST(fc, Feptr); |
2920 | Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, |
2921 | mb->end_subject, utf, NULL); |
2922 | } |
2923 | CHECK_PARTIAL(); |
2924 | } |
2925 | } |
2926 | else |
2927 | #endif /* SUPPORT_UNICODE */ |
2928 | |
2929 | /* Handle all other cases in UTF mode */ |
2930 | |
2931 | #ifdef SUPPORT_UNICODE |
2932 | if (utf) switch(Lctype) |
2933 | { |
2934 | case OP_ANY: |
2935 | for (i = 1; i <= Lmin; i++) |
2936 | { |
2937 | if (Feptr >= mb->end_subject) |
2938 | { |
2939 | SCHECK_PARTIAL(); |
2940 | RRETURN(MATCH_NOMATCH); |
2941 | } |
2942 | if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); |
2943 | if (mb->partial != 0 && |
2944 | Feptr + 1 >= mb->end_subject && |
2945 | NLBLOCK->nltype == NLTYPE_FIXED && |
2946 | NLBLOCK->nllen == 2 && |
2947 | UCHAR21(Feptr) == NLBLOCK->nl[0]) |
2948 | { |
2949 | mb->hitend = TRUE; |
2950 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
2951 | } |
2952 | Feptr++; |
2953 | ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
2954 | } |
2955 | break; |
2956 | |
2957 | case OP_ALLANY: |
2958 | for (i = 1; i <= Lmin; i++) |
2959 | { |
2960 | if (Feptr >= mb->end_subject) |
2961 | { |
2962 | SCHECK_PARTIAL(); |
2963 | RRETURN(MATCH_NOMATCH); |
2964 | } |
2965 | Feptr++; |
2966 | ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
2967 | } |
2968 | break; |
2969 | |
2970 | case OP_ANYBYTE: |
2971 | if (Feptr > mb->end_subject - Lmin) RRETURN(MATCH_NOMATCH); |
2972 | Feptr += Lmin; |
2973 | break; |
2974 | |
2975 | case OP_ANYNL: |
2976 | for (i = 1; i <= Lmin; i++) |
2977 | { |
2978 | if (Feptr >= mb->end_subject) |
2979 | { |
2980 | SCHECK_PARTIAL(); |
2981 | RRETURN(MATCH_NOMATCH); |
2982 | } |
2983 | GETCHARINC(fc, Feptr); |
2984 | switch(fc) |
2985 | { |
2986 | default: RRETURN(MATCH_NOMATCH); |
2987 | |
2988 | case CHAR_CR: |
2989 | if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++; |
2990 | break; |
2991 | |
2992 | case CHAR_LF: |
2993 | break; |
2994 | |
2995 | case CHAR_VT: |
2996 | case CHAR_FF: |
2997 | case CHAR_NEL: |
2998 | #ifndef EBCDIC |
2999 | case 0x2028: |
3000 | case 0x2029: |
3001 | #endif /* Not EBCDIC */ |
3002 | if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH); |
3003 | break; |
3004 | } |
3005 | } |
3006 | break; |
3007 | |
3008 | case OP_NOT_HSPACE: |
3009 | for (i = 1; i <= Lmin; i++) |
3010 | { |
3011 | if (Feptr >= mb->end_subject) |
3012 | { |
3013 | SCHECK_PARTIAL(); |
3014 | RRETURN(MATCH_NOMATCH); |
3015 | } |
3016 | GETCHARINC(fc, Feptr); |
3017 | switch(fc) |
3018 | { |
3019 | HSPACE_CASES: RRETURN(MATCH_NOMATCH); |
3020 | default: break; |
3021 | } |
3022 | } |
3023 | break; |
3024 | |
3025 | case OP_HSPACE: |
3026 | for (i = 1; i <= Lmin; i++) |
3027 | { |
3028 | if (Feptr >= mb->end_subject) |
3029 | { |
3030 | SCHECK_PARTIAL(); |
3031 | RRETURN(MATCH_NOMATCH); |
3032 | } |
3033 | GETCHARINC(fc, Feptr); |
3034 | switch(fc) |
3035 | { |
3036 | HSPACE_CASES: break; |
3037 | default: RRETURN(MATCH_NOMATCH); |
3038 | } |
3039 | } |
3040 | break; |
3041 | |
3042 | case OP_NOT_VSPACE: |
3043 | for (i = 1; i <= Lmin; i++) |
3044 | { |
3045 | if (Feptr >= mb->end_subject) |
3046 | { |
3047 | SCHECK_PARTIAL(); |
3048 | RRETURN(MATCH_NOMATCH); |
3049 | } |
3050 | GETCHARINC(fc, Feptr); |
3051 | switch(fc) |
3052 | { |
3053 | VSPACE_CASES: RRETURN(MATCH_NOMATCH); |
3054 | default: break; |
3055 | } |
3056 | } |
3057 | break; |
3058 | |
3059 | case OP_VSPACE: |
3060 | for (i = 1; i <= Lmin; i++) |
3061 | { |
3062 | if (Feptr >= mb->end_subject) |
3063 | { |
3064 | SCHECK_PARTIAL(); |
3065 | RRETURN(MATCH_NOMATCH); |
3066 | } |
3067 | GETCHARINC(fc, Feptr); |
3068 | switch(fc) |
3069 | { |
3070 | VSPACE_CASES: break; |
3071 | default: RRETURN(MATCH_NOMATCH); |
3072 | } |
3073 | } |
3074 | break; |
3075 | |
3076 | case OP_NOT_DIGIT: |
3077 | for (i = 1; i <= Lmin; i++) |
3078 | { |
3079 | if (Feptr >= mb->end_subject) |
3080 | { |
3081 | SCHECK_PARTIAL(); |
3082 | RRETURN(MATCH_NOMATCH); |
3083 | } |
3084 | GETCHARINC(fc, Feptr); |
3085 | if (fc < 128 && (mb->ctypes[fc] & ctype_digit) != 0) |
3086 | RRETURN(MATCH_NOMATCH); |
3087 | } |
3088 | break; |
3089 | |
3090 | case OP_DIGIT: |
3091 | for (i = 1; i <= Lmin; i++) |
3092 | { |
3093 | uint32_t cc; |
3094 | if (Feptr >= mb->end_subject) |
3095 | { |
3096 | SCHECK_PARTIAL(); |
3097 | RRETURN(MATCH_NOMATCH); |
3098 | } |
3099 | cc = UCHAR21(Feptr); |
3100 | if (cc >= 128 || (mb->ctypes[cc] & ctype_digit) == 0) |
3101 | RRETURN(MATCH_NOMATCH); |
3102 | Feptr++; |
3103 | /* No need to skip more code units - we know it has only one. */ |
3104 | } |
3105 | break; |
3106 | |
3107 | case OP_NOT_WHITESPACE: |
3108 | for (i = 1; i <= Lmin; i++) |
3109 | { |
3110 | uint32_t cc; |
3111 | if (Feptr >= mb->end_subject) |
3112 | { |
3113 | SCHECK_PARTIAL(); |
3114 | RRETURN(MATCH_NOMATCH); |
3115 | } |
3116 | cc = UCHAR21(Feptr); |
3117 | if (cc < 128 && (mb->ctypes[cc] & ctype_space) != 0) |
3118 | RRETURN(MATCH_NOMATCH); |
3119 | Feptr++; |
3120 | ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
3121 | } |
3122 | break; |
3123 | |
3124 | case OP_WHITESPACE: |
3125 | for (i = 1; i <= Lmin; i++) |
3126 | { |
3127 | uint32_t cc; |
3128 | if (Feptr >= mb->end_subject) |
3129 | { |
3130 | SCHECK_PARTIAL(); |
3131 | RRETURN(MATCH_NOMATCH); |
3132 | } |
3133 | cc = UCHAR21(Feptr); |
3134 | if (cc >= 128 || (mb->ctypes[cc] & ctype_space) == 0) |
3135 | RRETURN(MATCH_NOMATCH); |
3136 | Feptr++; |
3137 | /* No need to skip more code units - we know it has only one. */ |
3138 | } |
3139 | break; |
3140 | |
3141 | case OP_NOT_WORDCHAR: |
3142 | for (i = 1; i <= Lmin; i++) |
3143 | { |
3144 | uint32_t cc; |
3145 | if (Feptr >= mb->end_subject) |
3146 | { |
3147 | SCHECK_PARTIAL(); |
3148 | RRETURN(MATCH_NOMATCH); |
3149 | } |
3150 | cc = UCHAR21(Feptr); |
3151 | if (cc < 128 && (mb->ctypes[cc] & ctype_word) != 0) |
3152 | RRETURN(MATCH_NOMATCH); |
3153 | Feptr++; |
3154 | ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
3155 | } |
3156 | break; |
3157 | |
3158 | case OP_WORDCHAR: |
3159 | for (i = 1; i <= Lmin; i++) |
3160 | { |
3161 | uint32_t cc; |
3162 | if (Feptr >= mb->end_subject) |
3163 | { |
3164 | SCHECK_PARTIAL(); |
3165 | RRETURN(MATCH_NOMATCH); |
3166 | } |
3167 | cc = UCHAR21(Feptr); |
3168 | if (cc >= 128 || (mb->ctypes[cc] & ctype_word) == 0) |
3169 | RRETURN(MATCH_NOMATCH); |
3170 | Feptr++; |
3171 | /* No need to skip more code units - we know it has only one. */ |
3172 | } |
3173 | break; |
3174 | |
3175 | default: |
3176 | return PCRE2_ERROR_INTERNAL; |
3177 | } /* End switch(Lctype) */ |
3178 | |
3179 | else |
3180 | #endif /* SUPPORT_UNICODE */ |
3181 | |
3182 | /* Code for the non-UTF case for minimum matching of operators other |
3183 | than OP_PROP and OP_NOTPROP. */ |
3184 | |
3185 | switch(Lctype) |
3186 | { |
3187 | case OP_ANY: |
3188 | for (i = 1; i <= Lmin; i++) |
3189 | { |
3190 | if (Feptr >= mb->end_subject) |
3191 | { |
3192 | SCHECK_PARTIAL(); |
3193 | RRETURN(MATCH_NOMATCH); |
3194 | } |
3195 | if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); |
3196 | if (mb->partial != 0 && |
3197 | Feptr + 1 >= mb->end_subject && |
3198 | NLBLOCK->nltype == NLTYPE_FIXED && |
3199 | NLBLOCK->nllen == 2 && |
3200 | *Feptr == NLBLOCK->nl[0]) |
3201 | { |
3202 | mb->hitend = TRUE; |
3203 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
3204 | } |
3205 | Feptr++; |
3206 | } |
3207 | break; |
3208 | |
3209 | case OP_ALLANY: |
3210 | if (Feptr > mb->end_subject - Lmin) |
3211 | { |
3212 | SCHECK_PARTIAL(); |
3213 | RRETURN(MATCH_NOMATCH); |
3214 | } |
3215 | Feptr += Lmin; |
3216 | break; |
3217 | |
3218 | /* This OP_ANYBYTE case will never be reached because \C gets turned |
3219 | into OP_ALLANY in non-UTF mode. Cut out the code so that coverage |
3220 | reports don't complain about its never being used. */ |
3221 | |
3222 | /* case OP_ANYBYTE: |
3223 | * if (Feptr > mb->end_subject - Lmin) |
3224 | * { |
3225 | * SCHECK_PARTIAL(); |
3226 | * RRETURN(MATCH_NOMATCH); |
3227 | * } |
3228 | * Feptr += Lmin; |
3229 | * break; |
3230 | */ |
3231 | case OP_ANYNL: |
3232 | for (i = 1; i <= Lmin; i++) |
3233 | { |
3234 | if (Feptr >= mb->end_subject) |
3235 | { |
3236 | SCHECK_PARTIAL(); |
3237 | RRETURN(MATCH_NOMATCH); |
3238 | } |
3239 | switch(*Feptr++) |
3240 | { |
3241 | default: RRETURN(MATCH_NOMATCH); |
3242 | |
3243 | case CHAR_CR: |
3244 | if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++; |
3245 | break; |
3246 | |
3247 | case CHAR_LF: |
3248 | break; |
3249 | |
3250 | case CHAR_VT: |
3251 | case CHAR_FF: |
3252 | case CHAR_NEL: |
3253 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3254 | case 0x2028: |
3255 | case 0x2029: |
3256 | #endif |
3257 | if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH); |
3258 | break; |
3259 | } |
3260 | } |
3261 | break; |
3262 | |
3263 | case OP_NOT_HSPACE: |
3264 | for (i = 1; i <= Lmin; i++) |
3265 | { |
3266 | if (Feptr >= mb->end_subject) |
3267 | { |
3268 | SCHECK_PARTIAL(); |
3269 | RRETURN(MATCH_NOMATCH); |
3270 | } |
3271 | switch(*Feptr++) |
3272 | { |
3273 | default: break; |
3274 | HSPACE_BYTE_CASES: |
3275 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3276 | HSPACE_MULTIBYTE_CASES: |
3277 | #endif |
3278 | RRETURN(MATCH_NOMATCH); |
3279 | } |
3280 | } |
3281 | break; |
3282 | |
3283 | case OP_HSPACE: |
3284 | for (i = 1; i <= Lmin; i++) |
3285 | { |
3286 | if (Feptr >= mb->end_subject) |
3287 | { |
3288 | SCHECK_PARTIAL(); |
3289 | RRETURN(MATCH_NOMATCH); |
3290 | } |
3291 | switch(*Feptr++) |
3292 | { |
3293 | default: RRETURN(MATCH_NOMATCH); |
3294 | HSPACE_BYTE_CASES: |
3295 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3296 | HSPACE_MULTIBYTE_CASES: |
3297 | #endif |
3298 | break; |
3299 | } |
3300 | } |
3301 | break; |
3302 | |
3303 | case OP_NOT_VSPACE: |
3304 | for (i = 1; i <= Lmin; i++) |
3305 | { |
3306 | if (Feptr >= mb->end_subject) |
3307 | { |
3308 | SCHECK_PARTIAL(); |
3309 | RRETURN(MATCH_NOMATCH); |
3310 | } |
3311 | switch(*Feptr++) |
3312 | { |
3313 | VSPACE_BYTE_CASES: |
3314 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3315 | VSPACE_MULTIBYTE_CASES: |
3316 | #endif |
3317 | RRETURN(MATCH_NOMATCH); |
3318 | default: break; |
3319 | } |
3320 | } |
3321 | break; |
3322 | |
3323 | case OP_VSPACE: |
3324 | for (i = 1; i <= Lmin; i++) |
3325 | { |
3326 | if (Feptr >= mb->end_subject) |
3327 | { |
3328 | SCHECK_PARTIAL(); |
3329 | RRETURN(MATCH_NOMATCH); |
3330 | } |
3331 | switch(*Feptr++) |
3332 | { |
3333 | default: RRETURN(MATCH_NOMATCH); |
3334 | VSPACE_BYTE_CASES: |
3335 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3336 | VSPACE_MULTIBYTE_CASES: |
3337 | #endif |
3338 | break; |
3339 | } |
3340 | } |
3341 | break; |
3342 | |
3343 | case OP_NOT_DIGIT: |
3344 | for (i = 1; i <= Lmin; i++) |
3345 | { |
3346 | if (Feptr >= mb->end_subject) |
3347 | { |
3348 | SCHECK_PARTIAL(); |
3349 | RRETURN(MATCH_NOMATCH); |
3350 | } |
3351 | if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0) |
3352 | RRETURN(MATCH_NOMATCH); |
3353 | Feptr++; |
3354 | } |
3355 | break; |
3356 | |
3357 | case OP_DIGIT: |
3358 | for (i = 1; i <= Lmin; i++) |
3359 | { |
3360 | if (Feptr >= mb->end_subject) |
3361 | { |
3362 | SCHECK_PARTIAL(); |
3363 | RRETURN(MATCH_NOMATCH); |
3364 | } |
3365 | if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0) |
3366 | RRETURN(MATCH_NOMATCH); |
3367 | Feptr++; |
3368 | } |
3369 | break; |
3370 | |
3371 | case OP_NOT_WHITESPACE: |
3372 | for (i = 1; i <= Lmin; i++) |
3373 | { |
3374 | if (Feptr >= mb->end_subject) |
3375 | { |
3376 | SCHECK_PARTIAL(); |
3377 | RRETURN(MATCH_NOMATCH); |
3378 | } |
3379 | if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0) |
3380 | RRETURN(MATCH_NOMATCH); |
3381 | Feptr++; |
3382 | } |
3383 | break; |
3384 | |
3385 | case OP_WHITESPACE: |
3386 | for (i = 1; i <= Lmin; i++) |
3387 | { |
3388 | if (Feptr >= mb->end_subject) |
3389 | { |
3390 | SCHECK_PARTIAL(); |
3391 | RRETURN(MATCH_NOMATCH); |
3392 | } |
3393 | if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0) |
3394 | RRETURN(MATCH_NOMATCH); |
3395 | Feptr++; |
3396 | } |
3397 | break; |
3398 | |
3399 | case OP_NOT_WORDCHAR: |
3400 | for (i = 1; i <= Lmin; i++) |
3401 | { |
3402 | if (Feptr >= mb->end_subject) |
3403 | { |
3404 | SCHECK_PARTIAL(); |
3405 | RRETURN(MATCH_NOMATCH); |
3406 | } |
3407 | if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0) |
3408 | RRETURN(MATCH_NOMATCH); |
3409 | Feptr++; |
3410 | } |
3411 | break; |
3412 | |
3413 | case OP_WORDCHAR: |
3414 | for (i = 1; i <= Lmin; i++) |
3415 | { |
3416 | if (Feptr >= mb->end_subject) |
3417 | { |
3418 | SCHECK_PARTIAL(); |
3419 | RRETURN(MATCH_NOMATCH); |
3420 | } |
3421 | if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0) |
3422 | RRETURN(MATCH_NOMATCH); |
3423 | Feptr++; |
3424 | } |
3425 | break; |
3426 | |
3427 | default: |
3428 | return PCRE2_ERROR_INTERNAL; |
3429 | } |
3430 | } |
3431 | |
3432 | /* If Lmin = Lmax we are done. Continue with the main loop. */ |
3433 | |
3434 | if (Lmin == Lmax) continue; |
3435 | |
3436 | /* If minimizing, we have to test the rest of the pattern before each |
3437 | subsequent match. This means we cannot use a local "notmatch" variable as |
3438 | in the other cases. As all 4 temporary 32-bit values in the frame are |
3439 | already in use, just test the type each time. */ |
3440 | |
3441 | if (reptype == REPTYPE_MIN) |
3442 | { |
3443 | #ifdef SUPPORT_UNICODE |
3444 | if (proptype >= 0) |
3445 | { |
3446 | switch(proptype) |
3447 | { |
3448 | case PT_ANY: |
3449 | for (;;) |
3450 | { |
3451 | RMATCH(Fecode, RM208); |
3452 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3453 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3454 | if (Feptr >= mb->end_subject) |
3455 | { |
3456 | SCHECK_PARTIAL(); |
3457 | RRETURN(MATCH_NOMATCH); |
3458 | } |
3459 | GETCHARINCTEST(fc, Feptr); |
3460 | if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); |
3461 | } |
3462 | /* Control never gets here */ |
3463 | |
3464 | case PT_LAMP: |
3465 | for (;;) |
3466 | { |
3467 | int chartype; |
3468 | RMATCH(Fecode, RM209); |
3469 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3470 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3471 | if (Feptr >= mb->end_subject) |
3472 | { |
3473 | SCHECK_PARTIAL(); |
3474 | RRETURN(MATCH_NOMATCH); |
3475 | } |
3476 | GETCHARINCTEST(fc, Feptr); |
3477 | chartype = UCD_CHARTYPE(fc); |
3478 | if ((chartype == ucp_Lu || |
3479 | chartype == ucp_Ll || |
3480 | chartype == ucp_Lt) == (Lctype == OP_NOTPROP)) |
3481 | RRETURN(MATCH_NOMATCH); |
3482 | } |
3483 | /* Control never gets here */ |
3484 | |
3485 | case PT_GC: |
3486 | for (;;) |
3487 | { |
3488 | RMATCH(Fecode, RM210); |
3489 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3490 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3491 | if (Feptr >= mb->end_subject) |
3492 | { |
3493 | SCHECK_PARTIAL(); |
3494 | RRETURN(MATCH_NOMATCH); |
3495 | } |
3496 | GETCHARINCTEST(fc, Feptr); |
3497 | if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) |
3498 | RRETURN(MATCH_NOMATCH); |
3499 | } |
3500 | /* Control never gets here */ |
3501 | |
3502 | case PT_PC: |
3503 | for (;;) |
3504 | { |
3505 | RMATCH(Fecode, RM211); |
3506 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3507 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3508 | if (Feptr >= mb->end_subject) |
3509 | { |
3510 | SCHECK_PARTIAL(); |
3511 | RRETURN(MATCH_NOMATCH); |
3512 | } |
3513 | GETCHARINCTEST(fc, Feptr); |
3514 | if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) |
3515 | RRETURN(MATCH_NOMATCH); |
3516 | } |
3517 | /* Control never gets here */ |
3518 | |
3519 | case PT_SC: |
3520 | for (;;) |
3521 | { |
3522 | RMATCH(Fecode, RM212); |
3523 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3524 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3525 | if (Feptr >= mb->end_subject) |
3526 | { |
3527 | SCHECK_PARTIAL(); |
3528 | RRETURN(MATCH_NOMATCH); |
3529 | } |
3530 | GETCHARINCTEST(fc, Feptr); |
3531 | if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) |
3532 | RRETURN(MATCH_NOMATCH); |
3533 | } |
3534 | /* Control never gets here */ |
3535 | |
3536 | case PT_SCX: |
3537 | for (;;) |
3538 | { |
3539 | BOOL ok; |
3540 | const ucd_record *prop; |
3541 | RMATCH(Fecode, RM225); |
3542 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3543 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3544 | if (Feptr >= mb->end_subject) |
3545 | { |
3546 | SCHECK_PARTIAL(); |
3547 | RRETURN(MATCH_NOMATCH); |
3548 | } |
3549 | GETCHARINCTEST(fc, Feptr); |
3550 | prop = GET_UCD(fc); |
3551 | ok = (prop->script == Lpropvalue |
3552 | || MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0); |
3553 | if (ok == (Lctype == OP_NOTPROP)) |
3554 | RRETURN(MATCH_NOMATCH); |
3555 | } |
3556 | /* Control never gets here */ |
3557 | |
3558 | case PT_ALNUM: |
3559 | for (;;) |
3560 | { |
3561 | int category; |
3562 | RMATCH(Fecode, RM213); |
3563 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3564 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3565 | if (Feptr >= mb->end_subject) |
3566 | { |
3567 | SCHECK_PARTIAL(); |
3568 | RRETURN(MATCH_NOMATCH); |
3569 | } |
3570 | GETCHARINCTEST(fc, Feptr); |
3571 | category = UCD_CATEGORY(fc); |
3572 | if ((category == ucp_L || category == ucp_N) == (Lctype == OP_NOTPROP)) |
3573 | RRETURN(MATCH_NOMATCH); |
3574 | } |
3575 | /* Control never gets here */ |
3576 | |
3577 | /* Perl space used to exclude VT, but from Perl 5.18 it is included, |
3578 | which means that Perl space and POSIX space are now identical. PCRE |
3579 | was changed at release 8.34. */ |
3580 | |
3581 | case PT_SPACE: /* Perl space */ |
3582 | case PT_PXSPACE: /* POSIX space */ |
3583 | for (;;) |
3584 | { |
3585 | RMATCH(Fecode, RM214); |
3586 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3587 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3588 | if (Feptr >= mb->end_subject) |
3589 | { |
3590 | SCHECK_PARTIAL(); |
3591 | RRETURN(MATCH_NOMATCH); |
3592 | } |
3593 | GETCHARINCTEST(fc, Feptr); |
3594 | switch(fc) |
3595 | { |
3596 | HSPACE_CASES: |
3597 | VSPACE_CASES: |
3598 | if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); |
3599 | break; |
3600 | |
3601 | default: |
3602 | if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP)) |
3603 | RRETURN(MATCH_NOMATCH); |
3604 | break; |
3605 | } |
3606 | } |
3607 | /* Control never gets here */ |
3608 | |
3609 | case PT_WORD: |
3610 | for (;;) |
3611 | { |
3612 | int category; |
3613 | RMATCH(Fecode, RM215); |
3614 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3615 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3616 | if (Feptr >= mb->end_subject) |
3617 | { |
3618 | SCHECK_PARTIAL(); |
3619 | RRETURN(MATCH_NOMATCH); |
3620 | } |
3621 | GETCHARINCTEST(fc, Feptr); |
3622 | category = UCD_CATEGORY(fc); |
3623 | if ((category == ucp_L || |
3624 | category == ucp_N || |
3625 | fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP)) |
3626 | RRETURN(MATCH_NOMATCH); |
3627 | } |
3628 | /* Control never gets here */ |
3629 | |
3630 | case PT_CLIST: |
3631 | for (;;) |
3632 | { |
3633 | const uint32_t *cp; |
3634 | RMATCH(Fecode, RM216); |
3635 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3636 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3637 | if (Feptr >= mb->end_subject) |
3638 | { |
3639 | SCHECK_PARTIAL(); |
3640 | RRETURN(MATCH_NOMATCH); |
3641 | } |
3642 | GETCHARINCTEST(fc, Feptr); |
3643 | cp = PRIV(ucd_caseless_sets) + Lpropvalue; |
3644 | for (;;) |
3645 | { |
3646 | if (fc < *cp) |
3647 | { |
3648 | if (Lctype == OP_NOTPROP) break; |
3649 | RRETURN(MATCH_NOMATCH); |
3650 | } |
3651 | if (fc == *cp++) |
3652 | { |
3653 | if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH); |
3654 | break; |
3655 | } |
3656 | } |
3657 | } |
3658 | /* Control never gets here */ |
3659 | |
3660 | case PT_UCNC: |
3661 | for (;;) |
3662 | { |
3663 | RMATCH(Fecode, RM217); |
3664 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3665 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3666 | if (Feptr >= mb->end_subject) |
3667 | { |
3668 | SCHECK_PARTIAL(); |
3669 | RRETURN(MATCH_NOMATCH); |
3670 | } |
3671 | GETCHARINCTEST(fc, Feptr); |
3672 | if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || |
3673 | fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || |
3674 | fc >= 0xe000) == (Lctype == OP_NOTPROP)) |
3675 | RRETURN(MATCH_NOMATCH); |
3676 | } |
3677 | /* Control never gets here */ |
3678 | |
3679 | case PT_BIDICL: |
3680 | for (;;) |
3681 | { |
3682 | RMATCH(Fecode, RM224); |
3683 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3684 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3685 | if (Feptr >= mb->end_subject) |
3686 | { |
3687 | SCHECK_PARTIAL(); |
3688 | RRETURN(MATCH_NOMATCH); |
3689 | } |
3690 | GETCHARINCTEST(fc, Feptr); |
3691 | if ((UCD_BIDICLASS(fc) == Lpropvalue) == (Lctype == OP_NOTPROP)) |
3692 | RRETURN(MATCH_NOMATCH); |
3693 | } |
3694 | /* Control never gets here */ |
3695 | |
3696 | case PT_BOOL: |
3697 | for (;;) |
3698 | { |
3699 | BOOL ok; |
3700 | const ucd_record *prop; |
3701 | RMATCH(Fecode, RM223); |
3702 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3703 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3704 | if (Feptr >= mb->end_subject) |
3705 | { |
3706 | SCHECK_PARTIAL(); |
3707 | RRETURN(MATCH_NOMATCH); |
3708 | } |
3709 | GETCHARINCTEST(fc, Feptr); |
3710 | prop = GET_UCD(fc); |
3711 | ok = MAPBIT(PRIV(ucd_boolprop_sets) + |
3712 | UCD_BPROPS_PROP(prop), Lpropvalue) != 0; |
3713 | if (ok == (Lctype == OP_NOTPROP)) |
3714 | RRETURN(MATCH_NOMATCH); |
3715 | } |
3716 | /* Control never gets here */ |
3717 | |
3718 | /* This should never occur */ |
3719 | default: |
3720 | return PCRE2_ERROR_INTERNAL; |
3721 | } |
3722 | } |
3723 | |
3724 | /* Match extended Unicode sequences. We will get here only if the |
3725 | support is in the binary; otherwise a compile-time error occurs. */ |
3726 | |
3727 | else if (Lctype == OP_EXTUNI) |
3728 | { |
3729 | for (;;) |
3730 | { |
3731 | RMATCH(Fecode, RM218); |
3732 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3733 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3734 | if (Feptr >= mb->end_subject) |
3735 | { |
3736 | SCHECK_PARTIAL(); |
3737 | RRETURN(MATCH_NOMATCH); |
3738 | } |
3739 | else |
3740 | { |
3741 | GETCHARINCTEST(fc, Feptr); |
3742 | Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, |
3743 | utf, NULL); |
3744 | } |
3745 | CHECK_PARTIAL(); |
3746 | } |
3747 | } |
3748 | else |
3749 | #endif /* SUPPORT_UNICODE */ |
3750 | |
3751 | /* UTF mode for non-property testing character types. */ |
3752 | |
3753 | #ifdef SUPPORT_UNICODE |
3754 | if (utf) |
3755 | { |
3756 | for (;;) |
3757 | { |
3758 | RMATCH(Fecode, RM219); |
3759 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3760 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3761 | if (Feptr >= mb->end_subject) |
3762 | { |
3763 | SCHECK_PARTIAL(); |
3764 | RRETURN(MATCH_NOMATCH); |
3765 | } |
3766 | if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH); |
3767 | GETCHARINC(fc, Feptr); |
3768 | switch(Lctype) |
3769 | { |
3770 | case OP_ANY: /* This is the non-NL case */ |
3771 | if (mb->partial != 0 && /* Take care with CRLF partial */ |
3772 | Feptr >= mb->end_subject && |
3773 | NLBLOCK->nltype == NLTYPE_FIXED && |
3774 | NLBLOCK->nllen == 2 && |
3775 | fc == NLBLOCK->nl[0]) |
3776 | { |
3777 | mb->hitend = TRUE; |
3778 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
3779 | } |
3780 | break; |
3781 | |
3782 | case OP_ALLANY: |
3783 | case OP_ANYBYTE: |
3784 | break; |
3785 | |
3786 | case OP_ANYNL: |
3787 | switch(fc) |
3788 | { |
3789 | default: RRETURN(MATCH_NOMATCH); |
3790 | |
3791 | case CHAR_CR: |
3792 | if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++; |
3793 | break; |
3794 | |
3795 | case CHAR_LF: |
3796 | break; |
3797 | |
3798 | case CHAR_VT: |
3799 | case CHAR_FF: |
3800 | case CHAR_NEL: |
3801 | #ifndef EBCDIC |
3802 | case 0x2028: |
3803 | case 0x2029: |
3804 | #endif /* Not EBCDIC */ |
3805 | if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) |
3806 | RRETURN(MATCH_NOMATCH); |
3807 | break; |
3808 | } |
3809 | break; |
3810 | |
3811 | case OP_NOT_HSPACE: |
3812 | switch(fc) |
3813 | { |
3814 | HSPACE_CASES: RRETURN(MATCH_NOMATCH); |
3815 | default: break; |
3816 | } |
3817 | break; |
3818 | |
3819 | case OP_HSPACE: |
3820 | switch(fc) |
3821 | { |
3822 | HSPACE_CASES: break; |
3823 | default: RRETURN(MATCH_NOMATCH); |
3824 | } |
3825 | break; |
3826 | |
3827 | case OP_NOT_VSPACE: |
3828 | switch(fc) |
3829 | { |
3830 | VSPACE_CASES: RRETURN(MATCH_NOMATCH); |
3831 | default: break; |
3832 | } |
3833 | break; |
3834 | |
3835 | case OP_VSPACE: |
3836 | switch(fc) |
3837 | { |
3838 | VSPACE_CASES: break; |
3839 | default: RRETURN(MATCH_NOMATCH); |
3840 | } |
3841 | break; |
3842 | |
3843 | case OP_NOT_DIGIT: |
3844 | if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) |
3845 | RRETURN(MATCH_NOMATCH); |
3846 | break; |
3847 | |
3848 | case OP_DIGIT: |
3849 | if (fc >= 256 || (mb->ctypes[fc] & ctype_digit) == 0) |
3850 | RRETURN(MATCH_NOMATCH); |
3851 | break; |
3852 | |
3853 | case OP_NOT_WHITESPACE: |
3854 | if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) |
3855 | RRETURN(MATCH_NOMATCH); |
3856 | break; |
3857 | |
3858 | case OP_WHITESPACE: |
3859 | if (fc >= 256 || (mb->ctypes[fc] & ctype_space) == 0) |
3860 | RRETURN(MATCH_NOMATCH); |
3861 | break; |
3862 | |
3863 | case OP_NOT_WORDCHAR: |
3864 | if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) |
3865 | RRETURN(MATCH_NOMATCH); |
3866 | break; |
3867 | |
3868 | case OP_WORDCHAR: |
3869 | if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) |
3870 | RRETURN(MATCH_NOMATCH); |
3871 | break; |
3872 | |
3873 | default: |
3874 | return PCRE2_ERROR_INTERNAL; |
3875 | } |
3876 | } |
3877 | } |
3878 | else |
3879 | #endif /* SUPPORT_UNICODE */ |
3880 | |
3881 | /* Not UTF mode */ |
3882 | { |
3883 | for (;;) |
3884 | { |
3885 | RMATCH(Fecode, RM33); |
3886 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
3887 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); |
3888 | if (Feptr >= mb->end_subject) |
3889 | { |
3890 | SCHECK_PARTIAL(); |
3891 | RRETURN(MATCH_NOMATCH); |
3892 | } |
3893 | if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) |
3894 | RRETURN(MATCH_NOMATCH); |
3895 | fc = *Feptr++; |
3896 | switch(Lctype) |
3897 | { |
3898 | case OP_ANY: /* This is the non-NL case */ |
3899 | if (mb->partial != 0 && /* Take care with CRLF partial */ |
3900 | Feptr >= mb->end_subject && |
3901 | NLBLOCK->nltype == NLTYPE_FIXED && |
3902 | NLBLOCK->nllen == 2 && |
3903 | fc == NLBLOCK->nl[0]) |
3904 | { |
3905 | mb->hitend = TRUE; |
3906 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
3907 | } |
3908 | break; |
3909 | |
3910 | case OP_ALLANY: |
3911 | case OP_ANYBYTE: |
3912 | break; |
3913 | |
3914 | case OP_ANYNL: |
3915 | switch(fc) |
3916 | { |
3917 | default: RRETURN(MATCH_NOMATCH); |
3918 | |
3919 | case CHAR_CR: |
3920 | if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++; |
3921 | break; |
3922 | |
3923 | case CHAR_LF: |
3924 | break; |
3925 | |
3926 | case CHAR_VT: |
3927 | case CHAR_FF: |
3928 | case CHAR_NEL: |
3929 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3930 | case 0x2028: |
3931 | case 0x2029: |
3932 | #endif |
3933 | if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) |
3934 | RRETURN(MATCH_NOMATCH); |
3935 | break; |
3936 | } |
3937 | break; |
3938 | |
3939 | case OP_NOT_HSPACE: |
3940 | switch(fc) |
3941 | { |
3942 | default: break; |
3943 | HSPACE_BYTE_CASES: |
3944 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3945 | HSPACE_MULTIBYTE_CASES: |
3946 | #endif |
3947 | RRETURN(MATCH_NOMATCH); |
3948 | } |
3949 | break; |
3950 | |
3951 | case OP_HSPACE: |
3952 | switch(fc) |
3953 | { |
3954 | default: RRETURN(MATCH_NOMATCH); |
3955 | HSPACE_BYTE_CASES: |
3956 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3957 | HSPACE_MULTIBYTE_CASES: |
3958 | #endif |
3959 | break; |
3960 | } |
3961 | break; |
3962 | |
3963 | case OP_NOT_VSPACE: |
3964 | switch(fc) |
3965 | { |
3966 | default: break; |
3967 | VSPACE_BYTE_CASES: |
3968 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3969 | VSPACE_MULTIBYTE_CASES: |
3970 | #endif |
3971 | RRETURN(MATCH_NOMATCH); |
3972 | } |
3973 | break; |
3974 | |
3975 | case OP_VSPACE: |
3976 | switch(fc) |
3977 | { |
3978 | default: RRETURN(MATCH_NOMATCH); |
3979 | VSPACE_BYTE_CASES: |
3980 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
3981 | VSPACE_MULTIBYTE_CASES: |
3982 | #endif |
3983 | break; |
3984 | } |
3985 | break; |
3986 | |
3987 | case OP_NOT_DIGIT: |
3988 | if (MAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0) |
3989 | RRETURN(MATCH_NOMATCH); |
3990 | break; |
3991 | |
3992 | case OP_DIGIT: |
3993 | if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0) |
3994 | RRETURN(MATCH_NOMATCH); |
3995 | break; |
3996 | |
3997 | case OP_NOT_WHITESPACE: |
3998 | if (MAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0) |
3999 | RRETURN(MATCH_NOMATCH); |
4000 | break; |
4001 | |
4002 | case OP_WHITESPACE: |
4003 | if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0) |
4004 | RRETURN(MATCH_NOMATCH); |
4005 | break; |
4006 | |
4007 | case OP_NOT_WORDCHAR: |
4008 | if (MAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0) |
4009 | RRETURN(MATCH_NOMATCH); |
4010 | break; |
4011 | |
4012 | case OP_WORDCHAR: |
4013 | if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0) |
4014 | RRETURN(MATCH_NOMATCH); |
4015 | break; |
4016 | |
4017 | default: |
4018 | return PCRE2_ERROR_INTERNAL; |
4019 | } |
4020 | } |
4021 | } |
4022 | /* Control never gets here */ |
4023 | } |
4024 | |
4025 | /* If maximizing, it is worth using inline code for speed, doing the type |
4026 | test once at the start (i.e. keep it out of the loops). Once again, |
4027 | "notmatch" can be an ordinary local variable because the loops do not call |
4028 | RMATCH. */ |
4029 | |
4030 | else |
4031 | { |
4032 | Lstart_eptr = Feptr; /* Remember where we started */ |
4033 | |
4034 | #ifdef SUPPORT_UNICODE |
4035 | if (proptype >= 0) |
4036 | { |
4037 | BOOL notmatch = Lctype == OP_NOTPROP; |
4038 | switch(proptype) |
4039 | { |
4040 | case PT_ANY: |
4041 | for (i = Lmin; i < Lmax; i++) |
4042 | { |
4043 | int len = 1; |
4044 | if (Feptr >= mb->end_subject) |
4045 | { |
4046 | SCHECK_PARTIAL(); |
4047 | break; |
4048 | } |
4049 | GETCHARLENTEST(fc, Feptr, len); |
4050 | if (notmatch) break; |
4051 | Feptr+= len; |
4052 | } |
4053 | break; |
4054 | |
4055 | case PT_LAMP: |
4056 | for (i = Lmin; i < Lmax; i++) |
4057 | { |
4058 | int chartype; |
4059 | int len = 1; |
4060 | if (Feptr >= mb->end_subject) |
4061 | { |
4062 | SCHECK_PARTIAL(); |
4063 | break; |
4064 | } |
4065 | GETCHARLENTEST(fc, Feptr, len); |
4066 | chartype = UCD_CHARTYPE(fc); |
4067 | if ((chartype == ucp_Lu || |
4068 | chartype == ucp_Ll || |
4069 | chartype == ucp_Lt) == notmatch) |
4070 | break; |
4071 | Feptr+= len; |
4072 | } |
4073 | break; |
4074 | |
4075 | case PT_GC: |
4076 | for (i = Lmin; i < Lmax; i++) |
4077 | { |
4078 | int len = 1; |
4079 | if (Feptr >= mb->end_subject) |
4080 | { |
4081 | SCHECK_PARTIAL(); |
4082 | break; |
4083 | } |
4084 | GETCHARLENTEST(fc, Feptr, len); |
4085 | if ((UCD_CATEGORY(fc) == Lpropvalue) == notmatch) break; |
4086 | Feptr+= len; |
4087 | } |
4088 | break; |
4089 | |
4090 | case PT_PC: |
4091 | for (i = Lmin; i < Lmax; i++) |
4092 | { |
4093 | int len = 1; |
4094 | if (Feptr >= mb->end_subject) |
4095 | { |
4096 | SCHECK_PARTIAL(); |
4097 | break; |
4098 | } |
4099 | GETCHARLENTEST(fc, Feptr, len); |
4100 | if ((UCD_CHARTYPE(fc) == Lpropvalue) == notmatch) break; |
4101 | Feptr+= len; |
4102 | } |
4103 | break; |
4104 | |
4105 | case PT_SC: |
4106 | for (i = Lmin; i < Lmax; i++) |
4107 | { |
4108 | int len = 1; |
4109 | if (Feptr >= mb->end_subject) |
4110 | { |
4111 | SCHECK_PARTIAL(); |
4112 | break; |
4113 | } |
4114 | GETCHARLENTEST(fc, Feptr, len); |
4115 | if ((UCD_SCRIPT(fc) == Lpropvalue) == notmatch) break; |
4116 | Feptr+= len; |
4117 | } |
4118 | break; |
4119 | |
4120 | case PT_SCX: |
4121 | for (i = Lmin; i < Lmax; i++) |
4122 | { |
4123 | BOOL ok; |
4124 | const ucd_record *prop; |
4125 | int len = 1; |
4126 | if (Feptr >= mb->end_subject) |
4127 | { |
4128 | SCHECK_PARTIAL(); |
4129 | break; |
4130 | } |
4131 | GETCHARLENTEST(fc, Feptr, len); |
4132 | prop = GET_UCD(fc); |
4133 | ok = (prop->script == Lpropvalue || |
4134 | MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0); |
4135 | if (ok == notmatch) break; |
4136 | Feptr+= len; |
4137 | } |
4138 | break; |
4139 | |
4140 | case PT_ALNUM: |
4141 | for (i = Lmin; i < Lmax; i++) |
4142 | { |
4143 | int category; |
4144 | int len = 1; |
4145 | if (Feptr >= mb->end_subject) |
4146 | { |
4147 | SCHECK_PARTIAL(); |
4148 | break; |
4149 | } |
4150 | GETCHARLENTEST(fc, Feptr, len); |
4151 | category = UCD_CATEGORY(fc); |
4152 | if ((category == ucp_L || category == ucp_N) == notmatch) |
4153 | break; |
4154 | Feptr+= len; |
4155 | } |
4156 | break; |
4157 | |
4158 | /* Perl space used to exclude VT, but from Perl 5.18 it is included, |
4159 | which means that Perl space and POSIX space are now identical. PCRE |
4160 | was changed at release 8.34. */ |
4161 | |
4162 | case PT_SPACE: /* Perl space */ |
4163 | case PT_PXSPACE: /* POSIX space */ |
4164 | for (i = Lmin; i < Lmax; i++) |
4165 | { |
4166 | int len = 1; |
4167 | if (Feptr >= mb->end_subject) |
4168 | { |
4169 | SCHECK_PARTIAL(); |
4170 | break; |
4171 | } |
4172 | GETCHARLENTEST(fc, Feptr, len); |
4173 | switch(fc) |
4174 | { |
4175 | HSPACE_CASES: |
4176 | VSPACE_CASES: |
4177 | if (notmatch) goto ENDLOOP99; /* Break the loop */ |
4178 | break; |
4179 | |
4180 | default: |
4181 | if ((UCD_CATEGORY(fc) == ucp_Z) == notmatch) |
4182 | goto ENDLOOP99; /* Break the loop */ |
4183 | break; |
4184 | } |
4185 | Feptr+= len; |
4186 | } |
4187 | ENDLOOP99: |
4188 | break; |
4189 | |
4190 | case PT_WORD: |
4191 | for (i = Lmin; i < Lmax; i++) |
4192 | { |
4193 | int category; |
4194 | int len = 1; |
4195 | if (Feptr >= mb->end_subject) |
4196 | { |
4197 | SCHECK_PARTIAL(); |
4198 | break; |
4199 | } |
4200 | GETCHARLENTEST(fc, Feptr, len); |
4201 | category = UCD_CATEGORY(fc); |
4202 | if ((category == ucp_L || category == ucp_N || |
4203 | fc == CHAR_UNDERSCORE) == notmatch) |
4204 | break; |
4205 | Feptr+= len; |
4206 | } |
4207 | break; |
4208 | |
4209 | case PT_CLIST: /* Greedy scan for a property given as a list of code points */ |
4210 | for (i = Lmin; i < Lmax; i++) |
4211 | { |
4212 | const uint32_t *cp; /* Cursor into the code point list */ |
4213 | int len = 1; |
4214 | if (Feptr >= mb->end_subject) /* Subject ran out before Lmax repeats */ |
4215 | { |
4216 | SCHECK_PARTIAL(); |
4217 | break; |
4218 | } |
4219 | GETCHARLENTEST(fc, Feptr, len); |
4220 | cp = PRIV(ucd_caseless_sets) + Lpropvalue; /* Lpropvalue is the list's offset within the table */ |
4221 | for (;;) /* NOTE(review): no explicit bound; presumably each list is ascending and ends with a value no character exceeds - confirm */ |
4222 | { |
4223 | if (fc < *cp) /* Treated as "fc is not in the list" */ |
4224 | { if (notmatch) break; else goto GOT_MAX; } |
4225 | if (fc == *cp++) /* fc is in the list */ |
4226 | { if (notmatch) goto GOT_MAX; else break; } |
4227 | } |
4228 | Feptr += len; /* Character accepted; advance over it */ |
4229 | } |
4230 | GOT_MAX: /* Jump target for a character that fails the test; also reached by falling out of the loop */ |
4231 | break; |
4232 | |
4233 | case PT_UCNC: |
4234 | for (i = Lmin; i < Lmax; i++) |
4235 | { |
4236 | int len = 1; |
4237 | if (Feptr >= mb->end_subject) |
4238 | { |
4239 | SCHECK_PARTIAL(); |
4240 | break; |
4241 | } |
4242 | GETCHARLENTEST(fc, Feptr, len); |
4243 | if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT || |
4244 | fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) || |
4245 | fc >= 0xe000) == notmatch) |
4246 | break; |
4247 | Feptr += len; |
4248 | } |
4249 | break; |
4250 | |
4251 | case PT_BIDICL: |
4252 | for (i = Lmin; i < Lmax; i++) |
4253 | { |
4254 | int len = 1; |
4255 | if (Feptr >= mb->end_subject) |
4256 | { |
4257 | SCHECK_PARTIAL(); |
4258 | break; |
4259 | } |
4260 | GETCHARLENTEST(fc, Feptr, len); |
4261 | if ((UCD_BIDICLASS(fc) == Lpropvalue) == notmatch) break; |
4262 | Feptr+= len; |
4263 | } |
4264 | break; |
4265 | |
4266 | case PT_BOOL: |
4267 | for (i = Lmin; i < Lmax; i++) |
4268 | { |
4269 | BOOL ok; |
4270 | const ucd_record *prop; |
4271 | int len = 1; |
4272 | if (Feptr >= mb->end_subject) |
4273 | { |
4274 | SCHECK_PARTIAL(); |
4275 | break; |
4276 | } |
4277 | GETCHARLENTEST(fc, Feptr, len); |
4278 | prop = GET_UCD(fc); |
4279 | ok = MAPBIT(PRIV(ucd_boolprop_sets) + |
4280 | UCD_BPROPS_PROP(prop), Lpropvalue) != 0; |
4281 | if (ok == notmatch) break; |
4282 | Feptr+= len; |
4283 | } |
4284 | break; |
4285 | |
4286 | default: |
4287 | return PCRE2_ERROR_INTERNAL; |
4288 | } |
4289 | |
4290 | /* Feptr is now past the end of the maximum run */ |
4291 | |
4292 | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
4293 | |
4294 | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
4295 | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't |
4296 | go too far. */ |
4297 | |
4298 | for(;;) |
4299 | { |
4300 | if (Feptr <= Lstart_eptr) break; |
4301 | RMATCH(Fecode, RM222); |
4302 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4303 | Feptr--; |
4304 | if (utf) BACKCHAR(Feptr); |
4305 | } |
4306 | } |
4307 | |
4308 | /* Match extended Unicode grapheme clusters. We will get here only if the |
4309 | support is in the binary; otherwise a compile-time error occurs. */ |
4310 | |
4311 | else if (Lctype == OP_EXTUNI) /* Greedy repeat of extended grapheme clusters */ |
4312 | { |
4313 | for (i = Lmin; i < Lmax; i++) /* Forward scan: take up to Lmax - Lmin further clusters */ |
4314 | { |
4315 | if (Feptr >= mb->end_subject) |
4316 | { |
4317 | SCHECK_PARTIAL(); |
4318 | break; |
4319 | } |
4320 | else |
4321 | { |
4322 | GETCHARINCTEST(fc, Feptr); |
4323 | Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, |
4324 | utf, NULL); /* Feptr becomes whatever position the helper returns (presumably the end of the cluster - confirm) */ |
4325 | } |
4326 | CHECK_PARTIAL(); |
4327 | } |
4328 | |
4329 | /* Feptr is now past the end of the maximum run */ |
4330 | |
4331 | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
4332 | |
4333 | /* We use <= Lstart_eptr rather than == Lstart_eptr to detect the start |
4334 | of the run while backtracking because the use of \C in UTF mode can |
4335 | cause BACKCHAR to move back past Lstart_eptr. This is just palliative; |
4336 | the use of \C in UTF mode is fraught with danger. */ |
4337 | |
4338 | for(;;) /* Backtracking: try the rest of the pattern, then give back one cluster */ |
4339 | { |
4340 | int lgb, rgb; /* Grapheme-break property values of the left and right character of a pair */ |
4341 | PCRE2_SPTR fptr; /* Candidate position one character before Feptr */ |
4342 | |
4343 | if (Feptr <= Lstart_eptr) break; /* At start of char run */ |
4344 | RMATCH(Fecode, RM220); /* Result is left in rrc */ |
4345 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); /* Anything other than "no match" is passed straight back */ |
4346 | |
4347 | /* Backtracking over an extended grapheme cluster involves inspecting |
4348 | the previous two characters (if present) to see if a break is |
4349 | permitted between them. */ |
4350 | |
4351 | Feptr--; |
4352 | if (!utf) fc = *Feptr; else |
4353 | { |
4354 | BACKCHAR(Feptr); |
4355 | GETCHAR(fc, Feptr); |
4356 | } |
4357 | rgb = UCD_GRAPHBREAK(fc); /* Property of the character Feptr now points at */ |
4358 | |
4359 | for (;;) /* Keep stepping back while the table says to */ |
4360 | { |
4361 | if (Feptr <= Lstart_eptr) break; /* At start of char run */ |
4362 | fptr = Feptr - 1; |
4363 | if (!utf) fc = *fptr; else |
4364 | { |
4365 | BACKCHAR(fptr); |
4366 | GETCHAR(fc, fptr); |
4367 | } |
4368 | lgb = UCD_GRAPHBREAK(fc); /* Property of the character before Feptr */ |
4369 | if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; /* Bit clear: stop here, leaving Feptr on the right-hand character */ |
4370 | Feptr = fptr; /* Bit set: the pair stays together, so move back over the left character too */ |
4371 | rgb = lgb; /* The left character becomes the right one of the next pair */ |
4372 | } |
4373 | } |
4374 | } |
4375 | |
4376 | else |
4377 | #endif /* SUPPORT_UNICODE */ |
4378 | |
4379 | #ifdef SUPPORT_UNICODE |
4380 | if (utf) |
4381 | { |
4382 | switch(Lctype) |
4383 | { |
4384 | case OP_ANY: |
4385 | for (i = Lmin; i < Lmax; i++) |
4386 | { |
4387 | if (Feptr >= mb->end_subject) |
4388 | { |
4389 | SCHECK_PARTIAL(); |
4390 | break; |
4391 | } |
4392 | if (IS_NEWLINE(Feptr)) break; |
4393 | if (mb->partial != 0 && /* Take care with CRLF partial */ |
4394 | Feptr + 1 >= mb->end_subject && |
4395 | NLBLOCK->nltype == NLTYPE_FIXED && |
4396 | NLBLOCK->nllen == 2 && |
4397 | UCHAR21(Feptr) == NLBLOCK->nl[0]) |
4398 | { |
4399 | mb->hitend = TRUE; |
4400 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
4401 | } |
4402 | Feptr++; |
4403 | ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); |
4404 | } |
4405 | break; |
4406 | |
4407 | case OP_ALLANY: /* Greedy repeat of "any character", UTF case */ |
4408 | if (Lmax < UINT32_MAX) /* Bounded maximum: characters must be counted one at a time */ |
4409 | { |
4410 | for (i = Lmin; i < Lmax; i++) |
4411 | { |
4412 | if (Feptr >= mb->end_subject) |
4413 | { |
4414 | SCHECK_PARTIAL(); |
4415 | break; |
4416 | } |
4417 | Feptr++; /* First code unit of the character */ |
4418 | ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++); /* Presumably steps over the character's remaining code units - confirm against the macro */ |
4419 | } |
4420 | } |
4421 | else /* Lmax == UINT32_MAX: no counting needed, take the rest of the subject */ |
4422 | { |
4423 | Feptr = mb->end_subject; /* Unlimited UTF-8 repeat */ |
4424 | SCHECK_PARTIAL(); |
4425 | } |
4426 | break; |
4427 | |
4428 | /* The "byte" (i.e. "code unit") case is the same as non-UTF */ |
4429 | |
4430 | case OP_ANYBYTE: /* Greedy repeat of single code units; no per-unit test, so no loop */ |
4431 | fc = Lmax - Lmin; /* Number of further code units wanted beyond the minimum */ |
4432 | if (fc > (uint32_t)(mb->end_subject - Feptr)) /* NOTE(review): the remaining length is narrowed to uint32_t; confirm a remainder of 2^32 or more code units cannot reach here */ |
4433 | { |
4434 | Feptr = mb->end_subject; /* Fewer than fc units remain: take them all */ |
4435 | SCHECK_PARTIAL(); |
4436 | } |
4437 | else Feptr += fc; /* Enough remain: jump straight to the maximum */ |
4438 | break; |
4439 | |
4440 | case OP_ANYNL: |
4441 | for (i = Lmin; i < Lmax; i++) |
4442 | { |
4443 | int len = 1; |
4444 | if (Feptr >= mb->end_subject) |
4445 | { |
4446 | SCHECK_PARTIAL(); |
4447 | break; |
4448 | } |
4449 | GETCHARLEN(fc, Feptr, len); |
4450 | if (fc == CHAR_CR) |
4451 | { |
4452 | if (++Feptr >= mb->end_subject) break; |
4453 | if (UCHAR21(Feptr) == CHAR_LF) Feptr++; |
4454 | } |
4455 | else |
4456 | { |
4457 | if (fc != CHAR_LF && |
4458 | (mb->bsr_convention == PCRE2_BSR_ANYCRLF || |
4459 | (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL |
4460 | #ifndef EBCDIC |
4461 | && fc != 0x2028 && fc != 0x2029 |
4462 | #endif /* Not EBCDIC */ |
4463 | ))) |
4464 | break; |
4465 | Feptr += len; |
4466 | } |
4467 | } |
4468 | break; |
4469 | |
4470 | case OP_NOT_HSPACE: |
4471 | case OP_HSPACE: |
4472 | for (i = Lmin; i < Lmax; i++) |
4473 | { |
4474 | BOOL gotspace; |
4475 | int len = 1; |
4476 | if (Feptr >= mb->end_subject) |
4477 | { |
4478 | SCHECK_PARTIAL(); |
4479 | break; |
4480 | } |
4481 | GETCHARLEN(fc, Feptr, len); |
4482 | switch(fc) |
4483 | { |
4484 | HSPACE_CASES: gotspace = TRUE; break; |
4485 | default: gotspace = FALSE; break; |
4486 | } |
4487 | if (gotspace == (Lctype == OP_NOT_HSPACE)) break; |
4488 | Feptr += len; |
4489 | } |
4490 | break; |
4491 | |
4492 | case OP_NOT_VSPACE: |
4493 | case OP_VSPACE: |
4494 | for (i = Lmin; i < Lmax; i++) |
4495 | { |
4496 | BOOL gotspace; |
4497 | int len = 1; |
4498 | if (Feptr >= mb->end_subject) |
4499 | { |
4500 | SCHECK_PARTIAL(); |
4501 | break; |
4502 | } |
4503 | GETCHARLEN(fc, Feptr, len); |
4504 | switch(fc) |
4505 | { |
4506 | VSPACE_CASES: gotspace = TRUE; break; |
4507 | default: gotspace = FALSE; break; |
4508 | } |
4509 | if (gotspace == (Lctype == OP_NOT_VSPACE)) break; |
4510 | Feptr += len; |
4511 | } |
4512 | break; |
4513 | |
4514 | case OP_NOT_DIGIT: |
4515 | for (i = Lmin; i < Lmax; i++) |
4516 | { |
4517 | int len = 1; |
4518 | if (Feptr >= mb->end_subject) |
4519 | { |
4520 | SCHECK_PARTIAL(); |
4521 | break; |
4522 | } |
4523 | GETCHARLEN(fc, Feptr, len); |
4524 | if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) break; |
4525 | Feptr+= len; |
4526 | } |
4527 | break; |
4528 | |
4529 | case OP_DIGIT: |
4530 | for (i = Lmin; i < Lmax; i++) |
4531 | { |
4532 | int len = 1; |
4533 | if (Feptr >= mb->end_subject) |
4534 | { |
4535 | SCHECK_PARTIAL(); |
4536 | break; |
4537 | } |
4538 | GETCHARLEN(fc, Feptr, len); |
4539 | if (fc >= 256 ||(mb->ctypes[fc] & ctype_digit) == 0) break; |
4540 | Feptr+= len; |
4541 | } |
4542 | break; |
4543 | |
4544 | case OP_NOT_WHITESPACE: |
4545 | for (i = Lmin; i < Lmax; i++) |
4546 | { |
4547 | int len = 1; |
4548 | if (Feptr >= mb->end_subject) |
4549 | { |
4550 | SCHECK_PARTIAL(); |
4551 | break; |
4552 | } |
4553 | GETCHARLEN(fc, Feptr, len); |
4554 | if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) break; |
4555 | Feptr+= len; |
4556 | } |
4557 | break; |
4558 | |
4559 | case OP_WHITESPACE: |
4560 | for (i = Lmin; i < Lmax; i++) |
4561 | { |
4562 | int len = 1; |
4563 | if (Feptr >= mb->end_subject) |
4564 | { |
4565 | SCHECK_PARTIAL(); |
4566 | break; |
4567 | } |
4568 | GETCHARLEN(fc, Feptr, len); |
4569 | if (fc >= 256 ||(mb->ctypes[fc] & ctype_space) == 0) break; |
4570 | Feptr+= len; |
4571 | } |
4572 | break; |
4573 | |
4574 | case OP_NOT_WORDCHAR: |
4575 | for (i = Lmin; i < Lmax; i++) |
4576 | { |
4577 | int len = 1; |
4578 | if (Feptr >= mb->end_subject) |
4579 | { |
4580 | SCHECK_PARTIAL(); |
4581 | break; |
4582 | } |
4583 | GETCHARLEN(fc, Feptr, len); |
4584 | if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) break; |
4585 | Feptr+= len; |
4586 | } |
4587 | break; |
4588 | |
4589 | case OP_WORDCHAR: |
4590 | for (i = Lmin; i < Lmax; i++) |
4591 | { |
4592 | int len = 1; |
4593 | if (Feptr >= mb->end_subject) |
4594 | { |
4595 | SCHECK_PARTIAL(); |
4596 | break; |
4597 | } |
4598 | GETCHARLEN(fc, Feptr, len); |
4599 | if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) break; |
4600 | Feptr+= len; |
4601 | } |
4602 | break; |
4603 | |
4604 | default: |
4605 | return PCRE2_ERROR_INTERNAL; |
4606 | } |
4607 | |
4608 | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
4609 | |
4610 | /* After \C in UTF mode, Lstart_eptr might be in the middle of a |
4611 | Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't go |
4612 | too far. */ |
4613 | |
4614 | for(;;) |
4615 | { |
4616 | if (Feptr <= Lstart_eptr) break; |
4617 | RMATCH(Fecode, RM221); |
4618 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4619 | Feptr--; |
4620 | BACKCHAR(Feptr); |
4621 | if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && |
4622 | UCHAR21(Feptr) == CHAR_NL && UCHAR21(Feptr - 1) == CHAR_CR) |
4623 | Feptr--; |
4624 | } |
4625 | } |
4626 | else |
4627 | #endif /* SUPPORT_UNICODE */ |
4628 | |
4629 | /* Not UTF mode */ |
4630 | { |
4631 | switch(Lctype) |
4632 | { |
4633 | case OP_ANY: |
4634 | for (i = Lmin; i < Lmax; i++) |
4635 | { |
4636 | if (Feptr >= mb->end_subject) |
4637 | { |
4638 | SCHECK_PARTIAL(); |
4639 | break; |
4640 | } |
4641 | if (IS_NEWLINE(Feptr)) break; |
4642 | if (mb->partial != 0 && /* Take care with CRLF partial */ |
4643 | Feptr + 1 >= mb->end_subject && |
4644 | NLBLOCK->nltype == NLTYPE_FIXED && |
4645 | NLBLOCK->nllen == 2 && |
4646 | *Feptr == NLBLOCK->nl[0]) |
4647 | { |
4648 | mb->hitend = TRUE; |
4649 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
4650 | } |
4651 | Feptr++; |
4652 | } |
4653 | break; |
4654 | |
4655 | case OP_ALLANY: /* Outside UTF mode "any character" and "any code unit" share one arm */ |
4656 | case OP_ANYBYTE: |
4657 | fc = Lmax - Lmin; /* Number of further code units wanted beyond the minimum */ |
4658 | if (fc > (uint32_t)(mb->end_subject - Feptr)) /* NOTE(review): the remaining length is narrowed to uint32_t; confirm a remainder of 2^32 or more code units cannot reach here */ |
4659 | { |
4660 | Feptr = mb->end_subject; /* Fewer than fc units remain: take them all */ |
4661 | SCHECK_PARTIAL(); |
4662 | } |
4663 | else Feptr += fc; /* Enough remain: jump straight to the maximum */ |
4664 | break; |
4665 | |
4666 | case OP_ANYNL: |
4667 | for (i = Lmin; i < Lmax; i++) |
4668 | { |
4669 | if (Feptr >= mb->end_subject) |
4670 | { |
4671 | SCHECK_PARTIAL(); |
4672 | break; |
4673 | } |
4674 | fc = *Feptr; |
4675 | if (fc == CHAR_CR) |
4676 | { |
4677 | if (++Feptr >= mb->end_subject) break; |
4678 | if (*Feptr == CHAR_LF) Feptr++; |
4679 | } |
4680 | else |
4681 | { |
4682 | if (fc != CHAR_LF && (mb->bsr_convention == PCRE2_BSR_ANYCRLF || |
4683 | (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL |
4684 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
4685 | && fc != 0x2028 && fc != 0x2029 |
4686 | #endif |
4687 | ))) break; |
4688 | Feptr++; |
4689 | } |
4690 | } |
4691 | break; |
4692 | |
4693 | case OP_NOT_HSPACE: |
4694 | for (i = Lmin; i < Lmax; i++) |
4695 | { |
4696 | if (Feptr >= mb->end_subject) |
4697 | { |
4698 | SCHECK_PARTIAL(); |
4699 | break; |
4700 | } |
4701 | switch(*Feptr) |
4702 | { |
4703 | default: Feptr++; break; |
4704 | HSPACE_BYTE_CASES: |
4705 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
4706 | HSPACE_MULTIBYTE_CASES: |
4707 | #endif |
4708 | goto ENDLOOP00; |
4709 | } |
4710 | } |
4711 | ENDLOOP00: |
4712 | break; |
4713 | |
4714 | case OP_HSPACE: |
4715 | for (i = Lmin; i < Lmax; i++) |
4716 | { |
4717 | if (Feptr >= mb->end_subject) |
4718 | { |
4719 | SCHECK_PARTIAL(); |
4720 | break; |
4721 | } |
4722 | switch(*Feptr) |
4723 | { |
4724 | default: goto ENDLOOP01; |
4725 | HSPACE_BYTE_CASES: |
4726 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
4727 | HSPACE_MULTIBYTE_CASES: |
4728 | #endif |
4729 | Feptr++; break; |
4730 | } |
4731 | } |
4732 | ENDLOOP01: |
4733 | break; |
4734 | |
4735 | case OP_NOT_VSPACE: |
4736 | for (i = Lmin; i < Lmax; i++) |
4737 | { |
4738 | if (Feptr >= mb->end_subject) |
4739 | { |
4740 | SCHECK_PARTIAL(); |
4741 | break; |
4742 | } |
4743 | switch(*Feptr) |
4744 | { |
4745 | default: Feptr++; break; |
4746 | VSPACE_BYTE_CASES: |
4747 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
4748 | VSPACE_MULTIBYTE_CASES: |
4749 | #endif |
4750 | goto ENDLOOP02; |
4751 | } |
4752 | } |
4753 | ENDLOOP02: |
4754 | break; |
4755 | |
4756 | case OP_VSPACE: |
4757 | for (i = Lmin; i < Lmax; i++) |
4758 | { |
4759 | if (Feptr >= mb->end_subject) |
4760 | { |
4761 | SCHECK_PARTIAL(); |
4762 | break; |
4763 | } |
4764 | switch(*Feptr) |
4765 | { |
4766 | default: goto ENDLOOP03; |
4767 | VSPACE_BYTE_CASES: |
4768 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
4769 | VSPACE_MULTIBYTE_CASES: |
4770 | #endif |
4771 | Feptr++; break; |
4772 | } |
4773 | } |
4774 | ENDLOOP03: |
4775 | break; |
4776 | |
4777 | case OP_NOT_DIGIT: |
4778 | for (i = Lmin; i < Lmax; i++) |
4779 | { |
4780 | if (Feptr >= mb->end_subject) |
4781 | { |
4782 | SCHECK_PARTIAL(); |
4783 | break; |
4784 | } |
4785 | if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0) |
4786 | break; |
4787 | Feptr++; |
4788 | } |
4789 | break; |
4790 | |
4791 | case OP_DIGIT: |
4792 | for (i = Lmin; i < Lmax; i++) |
4793 | { |
4794 | if (Feptr >= mb->end_subject) |
4795 | { |
4796 | SCHECK_PARTIAL(); |
4797 | break; |
4798 | } |
4799 | if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0) |
4800 | break; |
4801 | Feptr++; |
4802 | } |
4803 | break; |
4804 | |
4805 | case OP_NOT_WHITESPACE: |
4806 | for (i = Lmin; i < Lmax; i++) |
4807 | { |
4808 | if (Feptr >= mb->end_subject) |
4809 | { |
4810 | SCHECK_PARTIAL(); |
4811 | break; |
4812 | } |
4813 | if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0) |
4814 | break; |
4815 | Feptr++; |
4816 | } |
4817 | break; |
4818 | |
4819 | case OP_WHITESPACE: |
4820 | for (i = Lmin; i < Lmax; i++) |
4821 | { |
4822 | if (Feptr >= mb->end_subject) |
4823 | { |
4824 | SCHECK_PARTIAL(); |
4825 | break; |
4826 | } |
4827 | if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0) |
4828 | break; |
4829 | Feptr++; |
4830 | } |
4831 | break; |
4832 | |
4833 | case OP_NOT_WORDCHAR: |
4834 | for (i = Lmin; i < Lmax; i++) |
4835 | { |
4836 | if (Feptr >= mb->end_subject) |
4837 | { |
4838 | SCHECK_PARTIAL(); |
4839 | break; |
4840 | } |
4841 | if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0) |
4842 | break; |
4843 | Feptr++; |
4844 | } |
4845 | break; |
4846 | |
4847 | case OP_WORDCHAR: |
4848 | for (i = Lmin; i < Lmax; i++) |
4849 | { |
4850 | if (Feptr >= mb->end_subject) |
4851 | { |
4852 | SCHECK_PARTIAL(); |
4853 | break; |
4854 | } |
4855 | if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0) |
4856 | break; |
4857 | Feptr++; |
4858 | } |
4859 | break; |
4860 | |
4861 | default: |
4862 | return PCRE2_ERROR_INTERNAL; |
4863 | } |
4864 | |
4865 | if (reptype == REPTYPE_POS) continue; /* No backtracking */ |
4866 | |
4867 | for (;;) |
4868 | { |
4869 | if (Feptr == Lstart_eptr) break; |
4870 | RMATCH(Fecode, RM34); |
4871 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
4872 | Feptr--; |
4873 | if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && *Feptr == CHAR_LF && |
4874 | Feptr[-1] == CHAR_CR) Feptr--; |
4875 | } |
4876 | } |
4877 | } |
4878 | break; /* End of repeat character type processing */ |
4879 | |
4880 | #undef Lstart_eptr |
4881 | #undef Lmin |
4882 | #undef Lmax |
4883 | #undef Lctype |
4884 | #undef Lpropvalue |
4885 | |
4886 | |
4887 | /* ===================================================================== */ |
4888 | /* Match a back reference, possibly repeatedly. Look past the end of the |
4889 | item to see if there is repeat information following. The OP_REF and |
4890 | OP_REFI opcodes are used for a reference to a numbered group or to a |
4891 | non-duplicated named group. For a duplicated named group, OP_DNREF and |
4892 | OP_DNREFI are used. In this case we must scan the list of groups to which |
4893 | the name refers, and use the first one that is set. */ |
4894 | |
4895 | #define Lmin F->temp_32[0] |
4896 | #define Lmax F->temp_32[1] |
4897 | #define Lcaseless F->temp_32[2] |
4898 | #define Lstart F->temp_sptr[0] |
4899 | #define Loffset F->temp_size |
4900 | |
4901 | case OP_DNREF: /* Back reference by a name shared by several groups */ |
4902 | case OP_DNREFI: |
4903 | Lcaseless = (Fop == OP_DNREFI); /* Remember whether this is the caseless opcode */ |
4904 | { |
4905 | int count = GET2(Fecode, 1+IMM2_SIZE); /* Second operand: how many name table entries to scan */ |
4906 | PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size; /* First operand: index of the first entry */ |
4907 | Fecode += 1 + 2*IMM2_SIZE; /* Step past the opcode and both operands */ |
4908 | |
4909 | while (count-- > 0) |
4910 | { |
4911 | Loffset = (GET2(slot, 0) << 1) - 2; /* Group number n from the entry -> ovector index 2n-2 */ |
4912 | if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) break; /* Use the first group that is set */ |
4913 | slot += mb->name_entry_size; |
4914 | } |
4915 | } |
4916 | goto REF_REPEAT; /* If no group was set, Loffset holds whatever the last iteration assigned */ |
4917 | |
4918 | case OP_REF: /* Back reference to one specific group */ |
4919 | case OP_REFI: |
4920 | Lcaseless = (Fop == OP_REFI); /* Remember whether this is the caseless opcode */ |
4921 | Loffset = (GET2(Fecode, 1) << 1) - 2; /* Group number n -> ovector index 2n-2 */ |
4922 | Fecode += 1 + IMM2_SIZE; /* Step past the opcode and its operand */ |
4923 | |
4924 | /* Set up for repetition, or handle the non-repeated case. The maximum and |
4925 | minimum must be in the heap frame, but as they are short-term values, we |
4926 | use temporary fields. On arrival here Loffset and Lcaseless are set and |
4927 | Fecode points at whatever follows the reference. */ |
4928 | REF_REPEAT: |
4929 | switch (*Fecode) |
4930 | { |
4931 | case OP_CRSTAR: /* Repeat opcodes whose limits come from lookup tables */ |
4932 | case OP_CRMINSTAR: |
4933 | case OP_CRPLUS: |
4934 | case OP_CRMINPLUS: |
4935 | case OP_CRQUERY: |
4936 | case OP_CRMINQUERY: |
4937 | fc = *Fecode++ - OP_CRSTAR; /* Table index; the ++ also consumes the repeat opcode */ |
4938 | Lmin = rep_min[fc]; |
4939 | Lmax = rep_max[fc]; |
4940 | reptype = rep_typ[fc]; |
4941 | break; |
4942 | |
4943 | case OP_CRRANGE: /* Repeat opcodes that carry explicit limits */ |
4944 | case OP_CRMINRANGE: |
4945 | Lmin = GET2(Fecode, 1); /* First operand: minimum */ |
4946 | Lmax = GET2(Fecode, 1 + IMM2_SIZE); /* Second operand: maximum */ |
4947 | reptype = rep_typ[*Fecode - OP_CRSTAR]; |
4948 | if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */ |
4949 | Fecode += 1 + 2 * IMM2_SIZE; /* Step past the opcode and both operands */ |
4950 | break; |
4951 | |
4952 | default: /* No repeat follows */ |
4953 | { |
4954 | rrc = match_ref(Loffset, Lcaseless, F, mb, &length); /* 0 means matched, with length set to the distance to advance */ |
4955 | if (rrc != 0) |
4956 | { |
4957 | if (rrc > 0) Feptr = mb->end_subject; /* Partial match */ |
4958 | CHECK_PARTIAL(); |
4959 | RRETURN(MATCH_NOMATCH); |
4960 | } |
4961 | } |
4962 | Feptr += length; |
4963 | continue; /* With the main loop */ |
4964 | } |
4965 | |
4966 | /* Handle repeated back references. If a set group has length zero, just |
4967 | continue with the main loop, because it matches however many times. For an |
4968 | unset reference, if the minimum is zero, we can also just continue. We can |
4969 | also continue if PCRE2_MATCH_UNSET_BACKREF is set, because this makes an unset |
4970 | group behave as a zero-length group. For any other unset cases, carrying |
4971 | on will result in NOMATCH. */ |
4972 | |
4973 | if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) /* Group is set */ |
4974 | { |
4975 | if (Fovector[Loffset] == Fovector[Loffset + 1]) continue; /* Start == end: the group captured an empty string */ |
4976 | } |
4977 | else /* Group is not set */ |
4978 | { |
4979 | if (Lmin == 0 || (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0) |
4980 | continue; /* Nothing has to be matched; carry on with the main loop */ |
4981 | } |
4982 | |
4983 | /* First, ensure the minimum number of matches are present. */ |
4984 | |
4985 | for (i = 1; i <= Lmin; i++) /* Runs exactly Lmin times unless a repeat fails */ |
4986 | { |
4987 | PCRE2_SIZE slength; /* Distance matched by this one repeat */ |
4988 | rrc = match_ref(Loffset, Lcaseless, F, mb, &slength); |
4989 | if (rrc != 0) /* Any non-zero result ends this path with MATCH_NOMATCH */ |
4990 | { |
4991 | if (rrc > 0) Feptr = mb->end_subject; /* Partial match */ |
4992 | CHECK_PARTIAL(); |
4993 | RRETURN(MATCH_NOMATCH); |
4994 | } |
4995 | Feptr += slength; |
4996 | } |
4997 | |
4998 | /* If min = max, we are done. They are not both allowed to be zero. */ |
4999 | |
5000 | if (Lmin == Lmax) continue; |
5001 | |
5002 | /* If minimizing, keep trying and advancing the pointer. */ |
5003 | |
5004 | if (reptype == REPTYPE_MIN) /* Minimizing: try the continuation before each extra repeat */ |
5005 | { |
5006 | for (;;) |
5007 | { |
5008 | PCRE2_SIZE slength; |
5009 | RMATCH(Fecode, RM20); /* Result is left in rrc */ |
5010 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5011 | if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); /* Lmin doubles as the count of repeats taken so far */ |
5012 | rrc = match_ref(Loffset, Lcaseless, F, mb, &slength); |
5013 | if (rrc != 0) |
5014 | { |
5015 | if (rrc > 0) Feptr = mb->end_subject; /* Partial match */ |
5016 | CHECK_PARTIAL(); |
5017 | RRETURN(MATCH_NOMATCH); |
5018 | } |
5019 | Feptr += slength; |
5020 | } |
5021 | /* Control never gets here */ |
5022 | } |
5023 | |
5024 | /* If maximizing, find the longest string and work backwards, as long as |
5025 | the matched lengths for each iteration are the same. */ |
5026 | |
5027 | else |
5028 | { |
5029 | BOOL samelengths = TRUE; /* Cleared if any repeat matches a length other than Flength */ |
5030 | Lstart = Feptr; /* Starting position */ |
5031 | Flength = Fovector[Loffset+1] - Fovector[Loffset]; /* Length of the captured group itself */ |
5032 | |
5033 | for (i = Lmin; i < Lmax; i++) /* Greedy scan: take repeats until one fails or Lmax is reached */ |
5034 | { |
5035 | PCRE2_SIZE slength; |
5036 | rrc = match_ref(Loffset, Lcaseless, F, mb, &slength); |
5037 | if (rrc != 0) |
5038 | { |
5039 | /* Can't use CHECK_PARTIAL because we don't want to update Feptr in |
5040 | the soft partial matching case. */ |
5041 | |
5042 | if (rrc > 0 && mb->partial != 0 && |
5043 | mb->end_subject > mb->start_used_ptr) |
5044 | { |
5045 | mb->hitend = TRUE; |
5046 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
5047 | } |
5048 | break; /* Stop scanning; i is left at the number of repeats taken */ |
5049 | } |
5050 | |
5051 | if (slength != Flength) samelengths = FALSE; |
5052 | Feptr += slength; |
5053 | } |
5054 | |
5055 | /* If the length matched for each repetition is the same as the length of |
5056 | the captured group, we can easily work backwards. This is the normal |
5057 | case. In caseless UTF-8 mode, though, there are pairs of case-equivalent |
5058 | characters whose lengths (in terms of code units) differ. This is very |
5059 | rare, so we handle it by re-matching fewer and fewer times. */ |
5060 | |
5061 | if (samelengths) |
5062 | { |
5063 | while (Feptr >= Lstart) /* Every repeat was Flength long, so each step back undoes exactly one */ |
5064 | { |
5065 | RMATCH(Fecode, RM21); |
5066 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5067 | Feptr -= Flength; |
5068 | } |
5069 | } |
5070 | |
5071 | /* The rare case of non-matching lengths. Re-scan the repetition for each |
5072 | iteration. We know that match_ref() will succeed every time. */ |
5073 | |
5074 | else |
5075 | { |
5076 | Lmax = i; /* Lmax now records how far the greedy scan got */ |
5077 | for (;;) |
5078 | { |
5079 | RMATCH(Fecode, RM22); |
5080 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5081 | if (Feptr == Lstart) break; /* Failed after minimal repetition */ |
5082 | Feptr = Lstart; /* Go back to the start and re-match one repeat fewer */ |
5083 | Lmax--; |
5084 | for (i = Lmin; i < Lmax; i++) |
5085 | { |
5086 | PCRE2_SIZE slength; |
5087 | (void)match_ref(Loffset, Lcaseless, F, mb, &slength); /* Result ignored: these repeats matched during the scan above */ |
5088 | Feptr += slength; |
5089 | } |
5090 | } |
5091 | } |
5092 | |
5093 | RRETURN(MATCH_NOMATCH); /* Every repeat count was tried without success */ |
5094 | } |
5095 | /* Control never gets here */ |
5096 | |
5097 | #undef Lcaseless |
5098 | #undef Lmin |
5099 | #undef Lmax |
5100 | #undef Lstart |
5101 | #undef Loffset |
5102 | |
5103 | |
5104 | |
5105 | /* ========================================================================= */ |
5106 | /* Opcodes for the start of various parenthesized items */ |
5107 | /* ========================================================================= */ |
5108 | |
5109 | /* In all cases, if the result of RMATCH() is MATCH_THEN, check whether the |
5110 | (*THEN) is within the current branch by comparing the address of OP_THEN |
5111 | that is passed back with the end of the branch. If (*THEN) is within the |
5112 | current branch, and the branch is one of two or more alternatives (it |
5113 | either starts or ends with OP_ALT), we have reached the limit of THEN's |
5114 | action, so convert the return code to NOMATCH, which will cause normal |
5115 | backtracking to happen from now on. Otherwise, THEN is passed back to an |
5116 | outer alternative. This implements Perl's treatment of parenthesized |
5117 | groups, where a group not containing | does not affect the current |
5118 | alternative, that is, (X) is NOT the same as (X|(*F)). */ |
5119 | |
5120 | |
5121 | /* ===================================================================== */ |
5122 | /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a non-possessive |
5123 | bracket group, indicating that it may occur zero times. It may repeat |
5124 | infinitely, or not at all - i.e. it could be ()* or ()? or even (){0} in |
5125 | the pattern. Brackets with fixed upper repeat limits are compiled as a |
5126 | number of copies, with the optional ones preceded by BRAZERO or BRAMINZERO. |
5127 | Possessive groups with possible zero repeats are preceded by BRAPOSZERO. */ |
5128 | |
5129 | #define Lnext_ecode F->temp_sptr[0] |
5130 | |
5131 | case OP_BRAZERO: /* Optional group, greedy: try with the group first, then without it */ |
5132 | Lnext_ecode = Fecode + 1; /* The group itself starts right after this opcode */ |
5133 | RMATCH(Lnext_ecode, RM9); /* Attempt with the group present; result is left in rrc */ |
5134 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5135 | do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT); /* Follow the branch links to the first item that is not OP_ALT */ |
5136 | Fecode = Lnext_ecode + 1 + LINK_SIZE; /* Resume after that item (an opcode plus a link) */ |
5137 | break; |
5138 | |
5139 | case OP_BRAMINZERO: /* Optional group, minimizing: try without the group first, then with it */ |
5140 | Lnext_ecode = Fecode + 1; /* The group itself starts right after this opcode */ |
5141 | do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT); /* Follow the branch links to the first item that is not OP_ALT */ |
5142 | RMATCH(Lnext_ecode + 1 + LINK_SIZE, RM10); /* Attempt from just after the group; result is left in rrc */ |
5143 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5144 | Fecode++; /* Skipping the group failed: step onto it and match it normally */ |
5145 | break; |
5146 | |
5147 | #undef Lnext_ecode |
5148 | |
5149 | case OP_SKIPZERO: /* Unconditionally step over the following group; no match attempt is made */ |
5150 | Fecode++; /* Onto the start of the group */ |
5151 | do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT); /* Follow the branch links to the first item that is not OP_ALT */ |
5152 | Fecode += 1 + LINK_SIZE; /* Past that item (an opcode plus a link) */ |
5153 | break; |
5154 | |
5155 | |
5156 | /* ===================================================================== */ |
5157 | /* Handle possessive brackets with an unlimited repeat. The end of these |
5158 | brackets will always be OP_KETRPOS, which returns MATCH_KETRPOS without |
5159 | going further in the pattern. */ |
5160 | |
5161 | #define Lframe_type F->temp_32[0] |
5162 | #define Lmatched_once F->temp_32[1] |
5163 | #define Lzero_allowed F->temp_32[2] |
5164 | #define Lstart_eptr F->temp_sptr[0] |
5165 | #define Lstart_group F->temp_sptr[1] |
5166 | |
5167 | case OP_BRAPOSZERO: /* Possessive group that may match zero times */ |
5168 | Lzero_allowed = TRUE; /* Zero repeat is allowed */ |
5169 | Fecode += 1; /* Step onto the group opcode that follows */ |
5170 | if (*Fecode == OP_CBRAPOS || *Fecode == OP_SCBRAPOS) |
5171 | goto POSSESSIVE_CAPTURE; |
5172 | goto POSSESSIVE_NON_CAPTURE; |
5173 | |
5174 | case OP_BRAPOS: /* Non-capturing possessive group */ |
5175 | case OP_SBRAPOS: |
5176 | Lzero_allowed = FALSE; /* Zero repeat not allowed */ |
5177 | |
5178 | POSSESSIVE_NON_CAPTURE: |
5179 | Lframe_type = GF_NOCAPTURE; /* Remembered frame type */ |
5180 | goto POSSESSIVE_GROUP; |
5181 | |
5182 | case OP_CBRAPOS: /* Capturing possessive group */ |
5183 | case OP_SCBRAPOS: |
5184 | Lzero_allowed = FALSE; /* Zero repeat not allowed */ |
5185 | |
5186 | POSSESSIVE_CAPTURE: |
5187 | number = GET2(Fecode, 1+LINK_SIZE); /* Group number is stored after the opcode and its link */ |
5188 | Lframe_type = GF_CAPTURE | number; /* Remembered frame type */ |
5189 | |
5190 | POSSESSIVE_GROUP: |
5191 | Lmatched_once = FALSE; /* Never matched */ |
5192 | Lstart_group = Fecode; /* Start of this group */ |
5193 | |
5194 | for (;;) /* One pass per branch attempt; a successful pass restarts at the first branch */ |
5195 | { |
5196 | Lstart_eptr = Feptr; /* Position at group start */ |
5197 | group_frame_type = Lframe_type; /* Set before every RMATCH, since it is re-assigned on each pass */ |
5198 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM8); /* Run the current branch; result is left in rrc */ |
5199 | if (rrc == MATCH_KETRPOS) /* The branch ran through to the group's OP_KETRPOS */ |
5200 | { |
5201 | Lmatched_once = TRUE; /* Matched at least once */ |
5202 | if (Feptr == Lstart_eptr) /* Empty match; skip to end */ |
5203 | { |
5204 | do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); |
5205 | break; |
5206 | } |
5207 | |
5208 | Fecode = Lstart_group; /* Progress was made: go round again from the first branch */ |
5209 | continue; |
5210 | } |
5211 | |
5212 | /* See comment above about handling THEN. */ |
5213 | |
5214 | if (rrc == MATCH_THEN) |
5215 | { |
5216 | PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1); /* End of the current branch */ |
5217 | if (mb->verb_ecode_ptr < next_ecode && |
5218 | (*Fecode == OP_ALT || *next_ecode == OP_ALT)) |
5219 | rrc = MATCH_NOMATCH; /* THEN was inside this branch of a real alternation: backtrack normally */ |
5220 | } |
5221 | |
5222 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5223 | Fecode += GET(Fecode, 1); /* This branch failed: move to the next one */ |
5224 | if (*Fecode != OP_ALT) break; /* No more branches */ |
5225 | } |
5226 | |
5227 | /* Success if matched something or zero repeat allowed */ |
5228 | |
5229 | if (Lmatched_once || Lzero_allowed) |
5230 | { |
5231 | Fecode += 1 + LINK_SIZE; /* Continue after the item that ends the group (an opcode plus a link) */ |
5232 | break; |
5233 | } |
5234 | |
5235 | RRETURN(MATCH_NOMATCH); |
5236 | |
5237 | #undef Lmatched_once |
5238 | #undef Lzero_allowed |
5239 | #undef Lframe_type |
5240 | #undef Lstart_eptr |
5241 | #undef Lstart_group |
5242 | |
5243 | |
5244 | /* ===================================================================== */ |
5245 | /* Handle non-capturing brackets that cannot match an empty string. When we |
5246 | get to the final alternative within the brackets, as long as there are no |
5247 | THEN's in the pattern, we can optimize by not recording a new backtracking |
5248 | point. (Ideally we should test for a THEN within this group, but we don't |
5249 | have that information.) Don't do this if we are at the very top level, |
5250 | however, because that would make handling assertions and once-only brackets |
5251 | messier when there is nothing to go back to. */ |
5252 | |
5253 | #define Lframe_type F->temp_32[0] /* Set for all that use GROUPLOOP */ |
5254 | #define Lnext_branch F->temp_sptr[0] /* Used only in OP_BRA handling */ |
5255 | |
5256 | case OP_BRA: /* Non-capturing group */ |
5257 | if (mb->hasthen || Frdepth == 0) /* Pattern contains THEN, or we are at the top level: no shortcut */ |
5258 | { |
5259 | Lframe_type = 0; |
5260 | goto GROUPLOOP; |
5261 | } |
5262 | |
5263 | for (;;) /* Shortcut path: record a backtracking point for every branch except the last */ |
5264 | { |
5265 | Lnext_branch = Fecode + GET(Fecode, 1); /* Where the current branch ends */ |
5266 | if (*Lnext_branch != OP_ALT) break; /* No OP_ALT follows, so this is the final branch */ |
5267 | |
5268 | /* This is never the final branch. We do not need to test for MATCH_THEN |
5269 | here because this code is not used when there is a THEN in the pattern. */ |
5270 | |
5271 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM1); /* Run this branch; result is left in rrc */ |
5272 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5273 | Fecode = Lnext_branch; /* Branch failed: on to the next alternative */ |
5274 | } |
5275 | |
5276 | /* Hit the start of the final branch. Continue at this level. */ |
5277 | |
5278 | Fecode += PRIV(OP_lengths)[*Fecode]; /* Step over the opcode that starts the final branch; no RMATCH is issued for it */ |
5279 | break; |
5280 | |
5281 | #undef Lnext_branch |
5282 | |
5283 | |
5284 | /* ===================================================================== */ |
5285 | /* Handle a capturing bracket, other than those that are possessive with an |
5286 | unlimited repeat. */ |
5287 | |
5288 | case OP_CBRA: /* Capturing group: tag the frame with the group number, then use the shared loop */ |
5289 | case OP_SCBRA: |
5290 | Lframe_type = GF_CAPTURE | GET2(Fecode, 1+LINK_SIZE); /* Group number is stored after the opcode and its link */ |
5291 | goto GROUPLOOP; |
5292 | |
5293 | |
5294 | /* ===================================================================== */ |
5295 | /* Atomic groups and non-capturing brackets that can match an empty string |
5296 | must record a backtracking point and also set up a chained frame. */ |
5297 | |
5298 | case OP_ONCE: /* These three record the opcode itself in the frame type */ |
5299 | case OP_SCRIPT_RUN: |
5300 | case OP_SBRA: |
5301 | Lframe_type = GF_NOCAPTURE | Fop; |
5302 | |
5303 | GROUPLOOP: /* Shared branch loop; Lframe_type must be set before jumping here */ |
5304 | for (;;) |
5305 | { |
5306 | group_frame_type = Lframe_type; /* Set before every RMATCH, since it is re-assigned on each pass */ |
5307 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM2); /* Run the current branch; result is left in rrc */ |
5308 | if (rrc == MATCH_THEN) |
5309 | { |
5310 | PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1); /* End of the current branch */ |
5311 | if (mb->verb_ecode_ptr < next_ecode && |
5312 | (*Fecode == OP_ALT || *next_ecode == OP_ALT)) |
5313 | rrc = MATCH_NOMATCH; /* THEN was inside this branch of a real alternation: backtrack normally */ |
5314 | } |
5315 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5316 | Fecode += GET(Fecode, 1); /* This branch failed: move to the next one */ |
5317 | if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH); /* No alternatives left: the whole group fails */ |
5318 | } |
5319 | /* Control never reaches here. */ |
5320 | |
5321 | #undef Lframe_type |
5322 | |
5323 | |
5324 | /* ===================================================================== */ |
5325 | /* Recursion either matches the current regex, or some subexpression. The |
5326 | offset data is the offset to the starting bracket from the start of the |
5327 | whole pattern. (This is so that it works from duplicated subpatterns.) */ |
5328 | |
5329 | #define Lframe_type F->temp_32[0] |
5330 | #define Lstart_branch F->temp_sptr[0] |
5331 | |
5332 | case OP_RECURSE: |
5333 | bracode = mb->start_code + GET(Fecode, 1); |
5334 | number = (bracode == mb->start_code)? 0 : GET2(bracode, 1 + LINK_SIZE); |
5335 | |
5336 | /* If we are already in a recursion, check for repeating the same one |
5337 | without advancing the subject pointer. This should catch convoluted mutual |
5338 | recursions. (Some simple cases are caught at compile time.) */ |
5339 | |
5340 | if (Fcurrent_recurse != RECURSE_UNSET) |
5341 | { |
5342 | offset = Flast_group_offset; |
5343 | while (offset != PCRE2_UNSET) |
5344 | { |
5345 | N = (heapframe *)((char *)match_data->heapframes + offset); |
5346 | P = (heapframe *)((char *)N - frame_size); |
5347 | if (N->group_frame_type == (GF_RECURSE | number)) |
5348 | { |
5349 | if (Feptr == P->eptr) return PCRE2_ERROR_RECURSELOOP; |
5350 | break; |
5351 | } |
5352 | offset = P->last_group_offset; |
5353 | } |
5354 | } |
5355 | |
5356 | /* Now run the recursion, branch by branch. */ |
5357 | |
5358 | Lstart_branch = bracode; |
5359 | Lframe_type = GF_RECURSE | number; |
5360 | |
5361 | for (;;) |
5362 | { |
5363 | PCRE2_SPTR next_ecode; |
5364 | |
5365 | group_frame_type = Lframe_type; |
5366 | RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM11); |
5367 | next_ecode = Lstart_branch + GET(Lstart_branch,1); |
5368 | |
5369 | /* Handle backtracking verbs, which are defined in a range that can |
5370 | easily be tested for. PCRE does not allow THEN, SKIP, PRUNE or COMMIT to |
5371 | escape beyond a recursion; they cause a NOMATCH for the entire recursion. |
5372 | |
5373 | When one of these verbs triggers, the current recursion group number is |
5374 | recorded. If it matches the recursion we are processing, the verb |
5375 | happened within the recursion and we must deal with it. Otherwise it must |
5376 | have happened after the recursion completed, and so has to be passed |
5377 | back. See comment above about handling THEN. */ |
5378 | |
5379 | if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX && |
5380 | mb->verb_current_recurse == (Lframe_type ^ GF_RECURSE)) |
5381 | { |
5382 | if (rrc == MATCH_THEN && mb->verb_ecode_ptr < next_ecode && |
5383 | (*Lstart_branch == OP_ALT || *next_ecode == OP_ALT)) |
5384 | rrc = MATCH_NOMATCH; |
5385 | else RRETURN(MATCH_NOMATCH); |
5386 | } |
5387 | |
5388 | /* Note that carrying on after (*ACCEPT) in a recursion is handled in the |
5389 | OP_ACCEPT code. Nothing needs to be done here. */ |
5390 | |
5391 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5392 | Lstart_branch = next_ecode; |
5393 | if (*Lstart_branch != OP_ALT) RRETURN(MATCH_NOMATCH); |
5394 | } |
5395 | /* Control never reaches here. */ |
5396 | |
5397 | #undef Lframe_type |
5398 | #undef Lstart_branch |
5399 | |
5400 | |
5401 | /* ===================================================================== */ |
5402 | /* Positive assertions are like other groups except that PCRE doesn't allow |
5403 | the effect of (*THEN) to escape beyond an assertion; it is therefore |
5404 | treated as NOMATCH. (*ACCEPT) is treated as successful assertion, with its |
5405 | captures and mark retained. Any other return is an error. */ |
5406 | |
5407 | #define Lframe_type F->temp_32[0] |
5408 | |
5409 | case OP_ASSERT: |
5410 | case OP_ASSERTBACK: |
5411 | case OP_ASSERT_NA: |
5412 | case OP_ASSERTBACK_NA: |
5413 | Lframe_type = GF_NOCAPTURE | Fop; |
5414 | for (;;) |
5415 | { |
5416 | group_frame_type = Lframe_type; |
5417 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM3); |
5418 | if (rrc == MATCH_ACCEPT) |
5419 | { |
5420 | memcpy(Fovector, |
5421 | (char *)assert_accept_frame + offsetof(heapframe, ovector), |
5422 | assert_accept_frame->offset_top * sizeof(PCRE2_SIZE)); |
5423 | Foffset_top = assert_accept_frame->offset_top; |
5424 | Fmark = assert_accept_frame->mark; |
5425 | break; |
5426 | } |
5427 | if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); |
5428 | Fecode += GET(Fecode, 1); |
5429 | if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH); |
5430 | } |
5431 | |
5432 | do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); |
5433 | Fecode += 1 + LINK_SIZE; |
5434 | break; |
5435 | |
5436 | #undef Lframe_type |
5437 | |
5438 | |
5439 | /* ===================================================================== */ |
5440 | /* Handle negative assertions. Loop for each non-matching branch as for |
5441 | positive assertions. */ |
5442 | |
5443 | #define Lframe_type F->temp_32[0] |
5444 | |
5445 | case OP_ASSERT_NOT: |
5446 | case OP_ASSERTBACK_NOT: |
5447 | Lframe_type = GF_NOCAPTURE | Fop; |
5448 | |
5449 | for (;;) |
5450 | { |
5451 | group_frame_type = Lframe_type; |
5452 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM4); |
5453 | switch(rrc) |
5454 | { |
5455 | case MATCH_ACCEPT: /* Assertion matched, therefore it fails. */ |
5456 | case MATCH_MATCH: |
5457 | RRETURN (MATCH_NOMATCH); |
5458 | |
5459 | case MATCH_NOMATCH: /* Branch failed, try next if present. */ |
5460 | case MATCH_THEN: |
5461 | Fecode += GET(Fecode, 1); |
5462 | if (*Fecode != OP_ALT) goto ASSERT_NOT_FAILED; |
5463 | break; |
5464 | |
5465 | case MATCH_COMMIT: /* Assertion forced to fail, therefore continue. */ |
5466 | case MATCH_SKIP: |
5467 | case MATCH_PRUNE: |
5468 | do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); |
5469 | goto ASSERT_NOT_FAILED; |
5470 | |
5471 | default: /* Pass back any other return */ |
5472 | RRETURN(rrc); |
5473 | } |
5474 | } |
5475 | |
5476 | /* None of the branches have matched or there was a backtrack to (*COMMIT), |
5477 | (*SKIP), (*PRUNE), or (*THEN) in the last branch. This is success for a |
5478 | negative assertion, so carry on. */ |
5479 | |
5480 | ASSERT_NOT_FAILED: |
5481 | Fecode += 1 + LINK_SIZE; |
5482 | break; |
5483 | |
5484 | #undef Lframe_type |
5485 | |
5486 | |
5487 | /* ===================================================================== */ |
5488 | /* The callout item calls an external function, if one is provided, passing |
5489 | details of the match so far. This is mainly for debugging, though the |
5490 | function is able to force a failure. */ |
5491 | |
case OP_CALLOUT:
case OP_CALLOUT_STR:

/* do_callout() returns its result in rrc and stores in "length" the amount
by which Fecode must be advanced to step over the callout item. */

rrc = do_callout(F, mb, &length);
if (rrc > 0) RRETURN(MATCH_NOMATCH);   /* Positive: fail at this point */
if (rrc < 0) RRETURN(rrc);             /* Negative: pass back unchanged */
Fecode += length;                      /* Zero: carry on after the callout */
break;
5499 | |
5500 | |
5501 | /* ===================================================================== */ |
5502 | /* Conditional group: compilation checked that there are no more than two |
5503 | branches. If the condition is false, skipping the first branch takes us |
5504 | past the end of the item if there is only one branch, but that's exactly |
5505 | what we want. */ |
5506 | |
5507 | case OP_COND: |
5508 | case OP_SCOND: |
5509 | |
5510 | /* The variable Flength will be added to Fecode when the condition is |
5511 | false, to get to the second branch. Setting it to the offset to the ALT or |
5512 | KET, then incrementing Fecode achieves this effect. However, if the second |
5513 | branch is non-existent, we must point to the KET so that the end of the |
5514 | group is correctly processed. We now have Fecode pointing to the condition |
5515 | or callout. */ |
5516 | |
5517 | Flength = GET(Fecode, 1); /* Offset to the second branch */ |
5518 | if (Fecode[Flength] != OP_ALT) Flength -= 1 + LINK_SIZE; |
5519 | Fecode += 1 + LINK_SIZE; /* From this opcode */ |
5520 | |
5521 | /* Because of the way auto-callout works during compile, a callout item is |
5522 | inserted between OP_COND and an assertion condition. Such a callout can |
5523 | also be inserted manually. */ |
5524 | |
5525 | if (*Fecode == OP_CALLOUT || *Fecode == OP_CALLOUT_STR) |
5526 | { |
5527 | rrc = do_callout(F, mb, &length); |
5528 | if (rrc > 0) RRETURN(MATCH_NOMATCH); |
5529 | if (rrc < 0) RRETURN(rrc); |
5530 | |
5531 | /* Advance Fecode past the callout, so it now points to the condition. We |
5532 | must adjust Flength so that the value of Fecode+Flength is unchanged. */ |
5533 | |
5534 | Fecode += length; |
5535 | Flength -= length; |
5536 | } |
5537 | |
5538 | /* Test the various possible conditions */ |
5539 | |
5540 | condition = FALSE; |
5541 | switch(*Fecode) |
5542 | { |
5543 | case OP_RREF: /* Group recursion test */ |
5544 | if (Fcurrent_recurse != RECURSE_UNSET) |
5545 | { |
5546 | number = GET2(Fecode, 1); |
5547 | condition = (number == RREF_ANY || number == Fcurrent_recurse); |
5548 | } |
5549 | break; |
5550 | |
5551 | case OP_DNRREF: /* Duplicate named group recursion test */ |
5552 | if (Fcurrent_recurse != RECURSE_UNSET) |
5553 | { |
5554 | int count = GET2(Fecode, 1 + IMM2_SIZE); |
5555 | PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size; |
5556 | while (count-- > 0) |
5557 | { |
5558 | number = GET2(slot, 0); |
5559 | condition = number == Fcurrent_recurse; |
5560 | if (condition) break; |
5561 | slot += mb->name_entry_size; |
5562 | } |
5563 | } |
5564 | break; |
5565 | |
5566 | case OP_CREF: /* Numbered group used test */ |
5567 | offset = (GET2(Fecode, 1) << 1) - 2; /* Doubled ref number */ |
5568 | condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET; |
5569 | break; |
5570 | |
5571 | case OP_DNCREF: /* Duplicate named group used test */ |
5572 | { |
5573 | int count = GET2(Fecode, 1 + IMM2_SIZE); |
5574 | PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size; |
5575 | while (count-- > 0) |
5576 | { |
5577 | offset = (GET2(slot, 0) << 1) - 2; |
5578 | condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET; |
5579 | if (condition) break; |
5580 | slot += mb->name_entry_size; |
5581 | } |
5582 | } |
5583 | break; |
5584 | |
5585 | case OP_FALSE: |
5586 | case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */ |
5587 | break; |
5588 | |
5589 | case OP_TRUE: |
5590 | condition = TRUE; |
5591 | break; |
5592 | |
5593 | /* The condition is an assertion. Run code similar to the assertion code |
5594 | above. */ |
5595 | |
5596 | #define Lpositive F->temp_32[0] |
5597 | #define Lstart_branch F->temp_sptr[0] |
5598 | |
5599 | default: |
5600 | Lpositive = (*Fecode == OP_ASSERT || *Fecode == OP_ASSERTBACK); |
5601 | Lstart_branch = Fecode; |
5602 | |
5603 | for (;;) |
5604 | { |
5605 | group_frame_type = GF_CONDASSERT | *Fecode; |
5606 | RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM5); |
5607 | |
5608 | switch(rrc) |
5609 | { |
5610 | case MATCH_ACCEPT: /* Save captures */ |
5611 | memcpy(Fovector, |
5612 | (char *)assert_accept_frame + offsetof(heapframe, ovector), |
5613 | assert_accept_frame->offset_top * sizeof(PCRE2_SIZE)); |
5614 | Foffset_top = assert_accept_frame->offset_top; |
5615 | |
5616 | /* Fall through */ |
5617 | /* In the case of a match, the captures have already been put into |
5618 | the current frame. */ |
5619 | |
5620 | case MATCH_MATCH: |
5621 | condition = Lpositive; /* TRUE for positive assertion */ |
5622 | break; |
5623 | |
5624 | /* PCRE doesn't allow the effect of (*THEN) to escape beyond an |
5625 | assertion; it is therefore always treated as NOMATCH. */ |
5626 | |
5627 | case MATCH_NOMATCH: |
5628 | case MATCH_THEN: |
5629 | Lstart_branch += GET(Lstart_branch, 1); |
5630 | if (*Lstart_branch == OP_ALT) continue; /* Try next branch */ |
5631 | condition = !Lpositive; /* TRUE for negative assertion */ |
5632 | break; |
5633 | |
5634 | /* These force no match without checking other branches. */ |
5635 | |
5636 | case MATCH_COMMIT: |
5637 | case MATCH_SKIP: |
5638 | case MATCH_PRUNE: |
5639 | condition = !Lpositive; |
5640 | break; |
5641 | |
5642 | default: |
5643 | RRETURN(rrc); |
5644 | } |
5645 | break; /* Out of the branch loop */ |
5646 | } |
5647 | |
5648 | /* If the condition is true, find the end of the assertion so that |
5649 | advancing past it gets us to the start of the first branch. */ |
5650 | |
5651 | if (condition) |
5652 | { |
5653 | do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); |
5654 | } |
5655 | break; /* End of assertion condition */ |
5656 | } |
5657 | |
5658 | #undef Lpositive |
5659 | #undef Lstart_branch |
5660 | |
5661 | /* Choose branch according to the condition. */ |
5662 | |
5663 | Fecode += condition? PRIV(OP_lengths)[*Fecode] : Flength; |
5664 | |
5665 | /* If the opcode is OP_SCOND it means we are at a repeated conditional |
5666 | group that might match an empty string. We must therefore descend a level |
5667 | so that the start is remembered for checking. For OP_COND we can just |
5668 | continue at this level. */ |
5669 | |
5670 | if (Fop == OP_SCOND) |
5671 | { |
5672 | group_frame_type = GF_NOCAPTURE | Fop; |
5673 | RMATCH(Fecode, RM35); |
5674 | RRETURN(rrc); |
5675 | } |
5676 | break; |
5677 | |
5678 | |
5679 | |
5680 | /* ========================================================================= */ |
5681 | /* End of start of parenthesis opcodes */ |
5682 | /* ========================================================================= */ |
5683 | |
5684 | |
5685 | /* ===================================================================== */ |
5686 | /* Move the subject pointer back. This occurs only at the start of each |
5687 | branch of a lookbehind assertion. If we are too close to the start to move |
5688 | back, fail. When working with UTF-8 we move back a number of characters, |
5689 | not bytes. */ |
5690 | |
case OP_REVERSE:
number = GET(Fecode, 1);   /* How far to move back */
#ifdef SUPPORT_UNICODE
if (utf)
  {
  /* Step back one character per iteration: one code unit unconditionally,
  then BACKCHAR() (presumably continues back to the first code unit of the
  character - confirm against the macro definition). Fail if the lower limit
  is reached before the count is exhausted. */

  while (number-- > 0)
    {
    if (Feptr <= mb->check_subject) RRETURN(MATCH_NOMATCH);
    Feptr--;
    BACKCHAR(Feptr);
    }
  }
else
#endif

/* No UTF-8 support, or not in UTF-8 mode: count is code unit count */

  {
  /* NOTE(review): this path measures against mb->start_subject, whereas the
  UTF path above stops at mb->check_subject - confirm the difference is
  intended. */

  if ((ptrdiff_t)number > Feptr - mb->start_subject) RRETURN(MATCH_NOMATCH);
  Feptr -= number;
  }

/* Save the earliest consulted character, then skip to next opcode */

if (Feptr < mb->start_used_ptr) mb->start_used_ptr = Feptr;
Fecode += 1 + LINK_SIZE;
break;
5718 | |
5719 | |
5720 | /* ===================================================================== */ |
5721 | /* An alternation is the end of a branch; scan along to find the end of the |
5722 | bracketed group. */ |
5723 | |
5724 | case OP_ALT: |
5725 | do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT); |
5726 | break; |
5727 | |
5728 | |
5729 | /* ===================================================================== */ |
5730 | /* The end of a parenthesized group. For all but OP_BRA and OP_COND, the |
5731 | starting frame was added to the chained frames in order to remember the |
5732 | starting subject position for the group. */ |
5733 | |
5734 | case OP_KET: |
5735 | case OP_KETRMIN: |
5736 | case OP_KETRMAX: |
5737 | case OP_KETRPOS: |
5738 | |
5739 | bracode = Fecode - GET(Fecode, 1); |
5740 | |
5741 | /* Point N to the frame at the start of the most recent group. |
5742 | Remember the subject pointer at the start of the group. */ |
5743 | |
5744 | if (*bracode != OP_BRA && *bracode != OP_COND) |
5745 | { |
5746 | N = (heapframe *)((char *)match_data->heapframes + Flast_group_offset); |
5747 | P = (heapframe *)((char *)N - frame_size); |
5748 | Flast_group_offset = P->last_group_offset; |
5749 | |
5750 | #ifdef DEBUG_SHOW_RMATCH |
5751 | fprintf(stderr, "++ KET for frame=%d type=%x prev char offset=%lu\n" , |
5752 | N->rdepth, N->group_frame_type, |
5753 | (char *)P->eptr - (char *)mb->start_subject); |
5754 | #endif |
5755 | |
5756 | /* If we are at the end of an assertion that is a condition, return a |
5757 | match, discarding any intermediate backtracking points. Copy back the |
5758 | mark setting and the captures into the frame before N so that they are |
5759 | set on return. Doing this for all assertions, both positive and negative, |
5760 | seems to match what Perl does. */ |
5761 | |
5762 | if (GF_IDMASK(N->group_frame_type) == GF_CONDASSERT) |
5763 | { |
5764 | memcpy((char *)P + offsetof(heapframe, ovector), Fovector, |
5765 | Foffset_top * sizeof(PCRE2_SIZE)); |
5766 | P->offset_top = Foffset_top; |
5767 | P->mark = Fmark; |
5768 | Fback_frame = (char *)F - (char *)P; |
5769 | RRETURN(MATCH_MATCH); |
5770 | } |
5771 | } |
5772 | else P = NULL; /* Indicates starting frame not recorded */ |
5773 | |
5774 | /* The group was not a conditional assertion. */ |
5775 | |
5776 | switch (*bracode) |
5777 | { |
5778 | case OP_BRA: /* No need to do anything for these */ |
5779 | case OP_COND: |
5780 | case OP_SCOND: |
5781 | break; |
5782 | |
5783 | /* Non-atomic positive assertions are like OP_BRA, except that the |
5784 | subject pointer must be put back to where it was at the start of the |
5785 | assertion. */ |
5786 | |
5787 | case OP_ASSERT_NA: |
5788 | case OP_ASSERTBACK_NA: |
5789 | if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; |
5790 | Feptr = P->eptr; |
5791 | break; |
5792 | |
5793 | /* Atomic positive assertions are like OP_ONCE, except that in addition |
5794 | the subject pointer must be put back to where it was at the start of the |
5795 | assertion. */ |
5796 | |
5797 | case OP_ASSERT: |
5798 | case OP_ASSERTBACK: |
5799 | if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; |
5800 | Feptr = P->eptr; |
5801 | /* Fall through */ |
5802 | |
5803 | /* For an atomic group, discard internal backtracking points. We must |
5804 | also ensure that any remaining branches within the top-level of the group |
5805 | are not tried. Do this by adjusting the code pointer within the backtrack |
5806 | frame so that it points to the final branch. */ |
5807 | |
5808 | case OP_ONCE: |
5809 | Fback_frame = ((char *)F - (char *)P); |
5810 | for (;;) |
5811 | { |
5812 | uint32_t y = GET(P->ecode,1); |
5813 | if ((P->ecode)[y] != OP_ALT) break; |
5814 | P->ecode += y; |
5815 | } |
5816 | break; |
5817 | |
5818 | /* A matching negative assertion returns MATCH, which is turned into |
5819 | NOMATCH at the assertion level. */ |
5820 | |
5821 | case OP_ASSERT_NOT: |
5822 | case OP_ASSERTBACK_NOT: |
5823 | RRETURN(MATCH_MATCH); |
5824 | |
/* At the end of a script run, apply the script-checking rules. This code
will never be exercised if Unicode support is not compiled, because in
that environment script runs cause an error at compile time. */
5828 | |
5829 | case OP_SCRIPT_RUN: |
5830 | if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH); |
5831 | break; |
5832 | |
5833 | /* Whole-pattern recursion is coded as a recurse into group 0, so it |
5834 | won't be picked up here. Instead, we catch it when the OP_END is reached. |
5835 | Other recursion is handled here. */ |
5836 | |
5837 | case OP_CBRA: |
5838 | case OP_CBRAPOS: |
5839 | case OP_SCBRA: |
5840 | case OP_SCBRAPOS: |
5841 | number = GET2(bracode, 1+LINK_SIZE); |
5842 | |
5843 | /* Handle a recursively called group. We reinstate the previous set of |
5844 | captures and then carry on after the recursion call. */ |
5845 | |
5846 | if (Fcurrent_recurse == number) |
5847 | { |
5848 | P = (heapframe *)((char *)N - frame_size); |
5849 | memcpy((char *)F + offsetof(heapframe, ovector), P->ovector, |
5850 | P->offset_top * sizeof(PCRE2_SIZE)); |
5851 | Foffset_top = P->offset_top; |
5852 | Fcapture_last = P->capture_last; |
5853 | Fcurrent_recurse = P->current_recurse; |
5854 | Fecode = P->ecode + 1 + LINK_SIZE; |
5855 | continue; /* With next opcode */ |
5856 | } |
5857 | |
5858 | /* Deal with actual capturing. */ |
5859 | |
5860 | offset = (number << 1) - 2; |
5861 | Fcapture_last = number; |
5862 | Fovector[offset] = P->eptr - mb->start_subject; |
5863 | Fovector[offset+1] = Feptr - mb->start_subject; |
5864 | if (offset >= Foffset_top) Foffset_top = offset + 2; |
5865 | break; |
5866 | } /* End actions relating to the starting opcode */ |
5867 | |
5868 | /* OP_KETRPOS is a possessive repeating ket. Remember the current position, |
5869 | and return the MATCH_KETRPOS. This makes it possible to do the repeats one |
5870 | at a time from the outer level. This must precede the empty string test - |
5871 | in this case that test is done at the outer level. */ |
5872 | |
5873 | if (*Fecode == OP_KETRPOS) |
5874 | { |
5875 | memcpy((char *)P + offsetof(heapframe, eptr), |
5876 | (char *)F + offsetof(heapframe, eptr), |
5877 | frame_copy_size); |
5878 | RRETURN(MATCH_KETRPOS); |
5879 | } |
5880 | |
5881 | /* Handle the different kinds of closing brackets. A non-repeating ket |
5882 | needs no special action, just continuing at this level. This also happens |
5883 | for the repeating kets if the group matched no characters, in order to |
5884 | forcibly break infinite loops. Otherwise, the repeating kets try the rest |
5885 | of the pattern or restart from the preceding bracket, in the appropriate |
5886 | order. */ |
5887 | |
5888 | if (Fop != OP_KET && (P == NULL || Feptr != P->eptr)) |
5889 | { |
5890 | if (Fop == OP_KETRMIN) |
5891 | { |
5892 | RMATCH(Fecode + 1 + LINK_SIZE, RM6); |
5893 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5894 | Fecode -= GET(Fecode, 1); |
5895 | break; /* End of ket processing */ |
5896 | } |
5897 | |
5898 | /* Repeat the maximum number of times (KETRMAX) */ |
5899 | |
5900 | RMATCH(bracode, RM7); |
5901 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
5902 | } |
5903 | |
5904 | /* Carry on at this level for a non-repeating ket, or after matching an |
5905 | empty string, or after repeating for a maximum number of times. */ |
5906 | |
5907 | Fecode += 1 + LINK_SIZE; |
5908 | break; |
5909 | |
5910 | |
5911 | /* ===================================================================== */ |
5912 | /* Start and end of line assertions, not multiline mode. */ |
5913 | |
5914 | case OP_CIRC: /* Start of line, unless PCRE2_NOTBOL is set. */ |
5915 | if (Feptr != mb->start_subject || (mb->moptions & PCRE2_NOTBOL) != 0) |
5916 | RRETURN(MATCH_NOMATCH); |
5917 | Fecode++; |
5918 | break; |
5919 | |
5920 | case OP_SOD: /* Unconditional start of subject */ |
5921 | if (Feptr != mb->start_subject) RRETURN(MATCH_NOMATCH); |
5922 | Fecode++; |
5923 | break; |
5924 | |
5925 | /* When PCRE2_NOTEOL is unset, assert before the subject end, or a |
5926 | terminating newline unless PCRE2_DOLLAR_ENDONLY is set. */ |
5927 | |
5928 | case OP_DOLL: |
5929 | if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH); |
5930 | if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS; |
5931 | |
5932 | /* Fall through */ |
5933 | /* Unconditional end of subject assertion (\z) */ |
5934 | |
5935 | case OP_EOD: |
5936 | if (Feptr < mb->end_subject) RRETURN(MATCH_NOMATCH); |
5937 | if (mb->partial != 0) |
5938 | { |
5939 | mb->hitend = TRUE; |
5940 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
5941 | } |
5942 | Fecode++; |
5943 | break; |
5944 | |
5945 | /* End of subject or ending \n assertion (\Z) */ |
5946 | |
5947 | case OP_EODN: |
5948 | ASSERT_NL_OR_EOS: |
5949 | if (Feptr < mb->end_subject && |
5950 | (!IS_NEWLINE(Feptr) || Feptr != mb->end_subject - mb->nllen)) |
5951 | { |
5952 | if (mb->partial != 0 && |
5953 | Feptr + 1 >= mb->end_subject && |
5954 | NLBLOCK->nltype == NLTYPE_FIXED && |
5955 | NLBLOCK->nllen == 2 && |
5956 | UCHAR21TEST(Feptr) == NLBLOCK->nl[0]) |
5957 | { |
5958 | mb->hitend = TRUE; |
5959 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
5960 | } |
5961 | RRETURN(MATCH_NOMATCH); |
5962 | } |
5963 | |
5964 | /* Either at end of string or \n before end. */ |
5965 | |
5966 | if (mb->partial != 0) |
5967 | { |
5968 | mb->hitend = TRUE; |
5969 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
5970 | } |
5971 | Fecode++; |
5972 | break; |
5973 | |
5974 | |
5975 | /* ===================================================================== */ |
5976 | /* Start and end of line assertions, multiline mode. */ |
5977 | |
5978 | /* Start of subject unless notbol, or after any newline except for one at |
5979 | the very end, unless PCRE2_ALT_CIRCUMFLEX is set. */ |
5980 | |
5981 | case OP_CIRCM: |
5982 | if ((mb->moptions & PCRE2_NOTBOL) != 0 && Feptr == mb->start_subject) |
5983 | RRETURN(MATCH_NOMATCH); |
5984 | if (Feptr != mb->start_subject && |
5985 | ((Feptr == mb->end_subject && |
5986 | (mb->poptions & PCRE2_ALT_CIRCUMFLEX) == 0) || |
5987 | !WAS_NEWLINE(Feptr))) |
5988 | RRETURN(MATCH_NOMATCH); |
5989 | Fecode++; |
5990 | break; |
5991 | |
5992 | /* Assert before any newline, or before end of subject unless noteol is |
5993 | set. */ |
5994 | |
5995 | case OP_DOLLM: |
5996 | if (Feptr < mb->end_subject) |
5997 | { |
5998 | if (!IS_NEWLINE(Feptr)) |
5999 | { |
6000 | if (mb->partial != 0 && |
6001 | Feptr + 1 >= mb->end_subject && |
6002 | NLBLOCK->nltype == NLTYPE_FIXED && |
6003 | NLBLOCK->nllen == 2 && |
6004 | UCHAR21TEST(Feptr) == NLBLOCK->nl[0]) |
6005 | { |
6006 | mb->hitend = TRUE; |
6007 | if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; |
6008 | } |
6009 | RRETURN(MATCH_NOMATCH); |
6010 | } |
6011 | } |
6012 | else |
6013 | { |
6014 | if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH); |
6015 | SCHECK_PARTIAL(); |
6016 | } |
6017 | Fecode++; |
6018 | break; |
6019 | |
6020 | |
6021 | /* ===================================================================== */ |
6022 | /* Start of match assertion */ |
6023 | |
6024 | case OP_SOM: |
6025 | if (Feptr != mb->start_subject + mb->start_offset) RRETURN(MATCH_NOMATCH); |
6026 | Fecode++; |
6027 | break; |
6028 | |
6029 | |
6030 | /* ===================================================================== */ |
6031 | /* Reset the start of match point */ |
6032 | |
6033 | case OP_SET_SOM: |
6034 | Fstart_match = Feptr; |
6035 | Fecode++; |
6036 | break; |
6037 | |
6038 | |
6039 | /* ===================================================================== */ |
6040 | /* Word boundary assertions. Find out if the previous and current |
6041 | characters are "word" characters. It takes a bit more work in UTF mode. |
6042 | Characters > 255 are assumed to be "non-word" characters when PCRE2_UCP is |
6043 | not set. When it is set, use Unicode properties if available, even when not |
6044 | in UTF mode. Remember the earliest and latest consulted characters. */ |
6045 | |
6046 | case OP_NOT_WORD_BOUNDARY: |
6047 | case OP_WORD_BOUNDARY: |
6048 | if (Feptr == mb->check_subject) prev_is_word = FALSE; else |
6049 | { |
6050 | PCRE2_SPTR lastptr = Feptr - 1; |
6051 | #ifdef SUPPORT_UNICODE |
6052 | if (utf) |
6053 | { |
6054 | BACKCHAR(lastptr); |
6055 | GETCHAR(fc, lastptr); |
6056 | } |
6057 | else |
6058 | #endif /* SUPPORT_UNICODE */ |
6059 | fc = *lastptr; |
6060 | if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr; |
6061 | #ifdef SUPPORT_UNICODE |
6062 | if ((mb->poptions & PCRE2_UCP) != 0) |
6063 | { |
6064 | if (fc == '_') prev_is_word = TRUE; else |
6065 | { |
6066 | int cat = UCD_CATEGORY(fc); |
6067 | prev_is_word = (cat == ucp_L || cat == ucp_N); |
6068 | } |
6069 | } |
6070 | else |
6071 | #endif /* SUPPORT_UNICODE */ |
6072 | prev_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0; |
6073 | } |
6074 | |
6075 | /* Get status of next character */ |
6076 | |
6077 | if (Feptr >= mb->end_subject) |
6078 | { |
6079 | SCHECK_PARTIAL(); |
6080 | cur_is_word = FALSE; |
6081 | } |
6082 | else |
6083 | { |
6084 | PCRE2_SPTR nextptr = Feptr + 1; |
6085 | #ifdef SUPPORT_UNICODE |
6086 | if (utf) |
6087 | { |
6088 | FORWARDCHARTEST(nextptr, mb->end_subject); |
6089 | GETCHAR(fc, Feptr); |
6090 | } |
6091 | else |
6092 | #endif /* SUPPORT_UNICODE */ |
6093 | fc = *Feptr; |
6094 | if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr; |
6095 | #ifdef SUPPORT_UNICODE |
6096 | if ((mb->poptions & PCRE2_UCP) != 0) |
6097 | { |
6098 | if (fc == '_') cur_is_word = TRUE; else |
6099 | { |
6100 | int cat = UCD_CATEGORY(fc); |
6101 | cur_is_word = (cat == ucp_L || cat == ucp_N); |
6102 | } |
6103 | } |
6104 | else |
6105 | #endif /* SUPPORT_UNICODE */ |
6106 | cur_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0; |
6107 | } |
6108 | |
6109 | /* Now see if the situation is what we want */ |
6110 | |
6111 | if ((*Fecode++ == OP_WORD_BOUNDARY)? |
6112 | cur_is_word == prev_is_word : cur_is_word != prev_is_word) |
6113 | RRETURN(MATCH_NOMATCH); |
6114 | break; |
6115 | |
6116 | |
6117 | /* ===================================================================== */ |
6118 | /* Backtracking (*VERB)s, with and without arguments. Note that if the |
6119 | pattern is successfully matched, we do not come back from RMATCH. */ |
6120 | |
6121 | case OP_MARK: |
6122 | Fmark = mb->nomatch_mark = Fecode + 2; |
6123 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM12); |
6124 | |
6125 | /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an |
6126 | argument, and we must check whether that argument matches this MARK's |
6127 | argument. It is passed back in mb->verb_skip_ptr. If it does match, we |
6128 | return MATCH_SKIP with mb->verb_skip_ptr now pointing to the subject |
6129 | position that corresponds to this mark. Otherwise, pass back the return |
6130 | code unaltered. */ |
6131 | |
6132 | if (rrc == MATCH_SKIP_ARG && |
6133 | PRIV(strcmp)(Fecode + 2, mb->verb_skip_ptr) == 0) |
6134 | { |
6135 | mb->verb_skip_ptr = Feptr; /* Pass back current position */ |
6136 | RRETURN(MATCH_SKIP); |
6137 | } |
6138 | RRETURN(rrc); |
6139 | |
6140 | case OP_FAIL: |
6141 | RRETURN(MATCH_NOMATCH); |
6142 | |
6143 | /* Record the current recursing group number in mb->verb_current_recurse |
6144 | when a backtracking return such as MATCH_COMMIT is given. This enables the |
6145 | recurse processing to catch verbs from within the recursion. */ |
6146 | |
6147 | case OP_COMMIT: |
6148 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM13); |
6149 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6150 | mb->verb_current_recurse = Fcurrent_recurse; |
6151 | RRETURN(MATCH_COMMIT); |
6152 | |
6153 | case OP_COMMIT_ARG: |
6154 | Fmark = mb->nomatch_mark = Fecode + 2; |
6155 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM36); |
6156 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6157 | mb->verb_current_recurse = Fcurrent_recurse; |
6158 | RRETURN(MATCH_COMMIT); |
6159 | |
6160 | case OP_PRUNE: |
6161 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM14); |
6162 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6163 | mb->verb_current_recurse = Fcurrent_recurse; |
6164 | RRETURN(MATCH_PRUNE); |
6165 | |
6166 | case OP_PRUNE_ARG: |
6167 | Fmark = mb->nomatch_mark = Fecode + 2; |
6168 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM15); |
6169 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6170 | mb->verb_current_recurse = Fcurrent_recurse; |
6171 | RRETURN(MATCH_PRUNE); |
6172 | |
6173 | case OP_SKIP: |
6174 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM16); |
6175 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6176 | mb->verb_skip_ptr = Feptr; /* Pass back current position */ |
6177 | mb->verb_current_recurse = Fcurrent_recurse; |
6178 | RRETURN(MATCH_SKIP); |
6179 | |
6180 | /* Note that, for Perl compatibility, SKIP with an argument does NOT set |
6181 | nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was |
6182 | not a matching mark, we have to re-run the match, ignoring the SKIP_ARG |
6183 | that failed and any that precede it (either they also failed, or were not |
6184 | triggered). To do this, we maintain a count of executed SKIP_ARGs. If a |
6185 | SKIP_ARG gets to top level, the match is re-run with mb->ignore_skip_arg |
6186 | set to the count of the one that failed. */ |
6187 | |
6188 | case OP_SKIP_ARG: |
6189 | mb->skip_arg_count++; |
6190 | if (mb->skip_arg_count <= mb->ignore_skip_arg) |
6191 | { |
6192 | Fecode += PRIV(OP_lengths)[*Fecode] + Fecode[1]; |
6193 | break; |
6194 | } |
6195 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM17); |
6196 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6197 | |
6198 | /* Pass back the current skip name and return the special MATCH_SKIP_ARG |
6199 | return code. This will either be caught by a matching MARK, or get to the |
6200 | top, where it causes a rematch with mb->ignore_skip_arg set to the value of |
6201 | mb->skip_arg_count. */ |
6202 | |
6203 | mb->verb_skip_ptr = Fecode + 2; |
6204 | mb->verb_current_recurse = Fcurrent_recurse; |
6205 | RRETURN(MATCH_SKIP_ARG); |
6206 | |
6207 | /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that |
6208 | the branch in which it occurs can be determined. */ |
6209 | |
6210 | case OP_THEN: |
6211 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM18); |
6212 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6213 | mb->verb_ecode_ptr = Fecode; |
6214 | mb->verb_current_recurse = Fcurrent_recurse; |
6215 | RRETURN(MATCH_THEN); |
6216 | |
6217 | case OP_THEN_ARG: |
6218 | Fmark = mb->nomatch_mark = Fecode + 2; |
6219 | RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM19); |
6220 | if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
6221 | mb->verb_ecode_ptr = Fecode; |
6222 | mb->verb_current_recurse = Fcurrent_recurse; |
6223 | RRETURN(MATCH_THEN); |
6224 | |
6225 | |
6226 | /* ===================================================================== */ |
6227 | /* There's been some horrible disaster. Arrival here can only mean there is |
6228 | something seriously wrong in the code above or the OP_xxx definitions. */ |
6229 | |
6230 | default: |
6231 | return PCRE2_ERROR_INTERNAL; |
6232 | } |
6233 | |
6234 | /* Do not insert any code in here without much thought; it is assumed |
6235 | that "continue" in the code above comes out to here to repeat the main |
6236 | loop. */ |
6237 | |
6238 | } /* End of main loop */ |
6239 | /* Control never reaches here */ |
6240 | |
6241 | |
6242 | /* ========================================================================= */ |
6243 | /* The RRETURN() macro jumps here. The number that is saved in Freturn_id |
6244 | indicates which label we actually want to return to. The value in Frdepth is |
6245 | the index number of the frame in the vector. The return value has been placed |
6246 | in rrc. */ |
6247 | |
6248 | #define LBL(val) case val: goto L_RM##val; |
6249 | |
6250 | RETURN_SWITCH: |
6251 | if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; |
6252 | if (Frdepth == 0) return rrc; /* Exit from the top level */ |
6253 | F = (heapframe *)((char *)F - Fback_frame); /* Backtrack */ |
6254 | mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */ |
6255 | |
6256 | #ifdef DEBUG_SHOW_RMATCH |
6257 | fprintf(stderr, "++ RETURN %d to %d\n" , rrc, Freturn_id); |
6258 | #endif |
6259 | |
6260 | switch (Freturn_id) |
6261 | { |
6262 | LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8) |
6263 | LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16) |
6264 | LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24) |
6265 | LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32) |
6266 | LBL(33) LBL(34) LBL(35) LBL(36) |
6267 | |
6268 | #ifdef SUPPORT_WIDE_CHARS |
6269 | LBL(100) LBL(101) |
6270 | #endif |
6271 | |
6272 | #ifdef SUPPORT_UNICODE |
6273 | LBL(200) LBL(201) LBL(202) LBL(203) LBL(204) LBL(205) LBL(206) |
6274 | LBL(207) LBL(208) LBL(209) LBL(210) LBL(211) LBL(212) LBL(213) |
6275 | LBL(214) LBL(215) LBL(216) LBL(217) LBL(218) LBL(219) LBL(220) |
6276 | LBL(221) LBL(222) LBL(223) LBL(224) LBL(225) |
6277 | #endif |
6278 | |
6279 | default: |
6280 | return PCRE2_ERROR_INTERNAL; |
6281 | } |
6282 | #undef LBL |
6283 | } |
6284 | |
6285 | |
6286 | /************************************************* |
6287 | * Match a Regular Expression * |
6288 | *************************************************/ |
6289 | |
6290 | /* This function applies a compiled pattern to a subject string and picks out |
6291 | portions of the string if it matches. Two elements in the vector are set for |
6292 | each substring: the offsets to the start and end of the substring. |
6293 | |
6294 | Arguments: |
6295 | code points to the compiled expression |
6296 | subject points to the subject string |
6297 | length length of subject string (may contain binary zeros) |
6298 | start_offset where to start in the subject string |
6299 | options option bits |
6300 | match_data points to a match_data block |
6301 | mcontext points to a PCRE2 match context |
6302 | |
6303 | Returns: > 0 => success; value is the number of ovector pairs filled |
6304 | = 0 => success, but ovector is not big enough |
6305 | = -1 => failed to match (PCRE2_ERROR_NOMATCH) |
6306 | = -2 => partial match (PCRE2_ERROR_PARTIAL) |
6307 | < -2 => some kind of unexpected problem |
6308 | */ |
6309 | |
6310 | PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION |
6311 | pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, |
6312 | PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, |
6313 | pcre2_match_context *mcontext) |
6314 | { |
6315 | int rc; |
6316 | int was_zero_terminated = 0; |
6317 | const uint8_t *start_bits = NULL; |
6318 | const pcre2_real_code *re = (const pcre2_real_code *)code; |
6319 | |
6320 | BOOL anchored; |
6321 | BOOL firstline; |
6322 | BOOL has_first_cu = FALSE; |
6323 | BOOL has_req_cu = FALSE; |
6324 | BOOL startline; |
6325 | |
6326 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
6327 | PCRE2_SPTR memchr_found_first_cu; |
6328 | PCRE2_SPTR memchr_found_first_cu2; |
6329 | #endif |
6330 | |
6331 | PCRE2_UCHAR first_cu = 0; |
6332 | PCRE2_UCHAR first_cu2 = 0; |
6333 | PCRE2_UCHAR req_cu = 0; |
6334 | PCRE2_UCHAR req_cu2 = 0; |
6335 | |
6336 | PCRE2_SPTR bumpalong_limit; |
6337 | PCRE2_SPTR end_subject; |
6338 | PCRE2_SPTR true_end_subject; |
6339 | PCRE2_SPTR start_match; |
6340 | PCRE2_SPTR req_cu_ptr; |
6341 | PCRE2_SPTR start_partial; |
6342 | PCRE2_SPTR match_partial; |
6343 | |
6344 | #ifdef SUPPORT_JIT |
6345 | BOOL use_jit; |
6346 | #endif |
6347 | |
6348 | /* This flag is needed even when Unicode is not supported for convenience |
6349 | (it is used by the IS_NEWLINE macro). */ |
6350 | |
6351 | BOOL utf = FALSE; |
6352 | |
6353 | #ifdef SUPPORT_UNICODE |
6354 | BOOL ucp = FALSE; |
6355 | BOOL allow_invalid; |
6356 | uint32_t fragment_options = 0; |
6357 | #ifdef SUPPORT_JIT |
6358 | BOOL jit_checked_utf = FALSE; |
6359 | #endif |
6360 | #endif /* SUPPORT_UNICODE */ |
6361 | |
6362 | PCRE2_SIZE frame_size; |
6363 | PCRE2_SIZE heapframes_size; |
6364 | |
6365 | /* We need to have mb as a pointer to a match block, because the IS_NEWLINE |
6366 | macro is used below, and it expects NLBLOCK to be defined as a pointer. */ |
6367 | |
6368 | pcre2_callout_block cb; |
6369 | match_block actual_match_block; |
6370 | match_block *mb = &actual_match_block; |
6371 | |
6372 | /* Recognize NULL, length 0 as an empty string. */ |
6373 | |
6374 | if (subject == NULL && length == 0) subject = (PCRE2_SPTR)"" ; |
6375 | |
6376 | /* Plausibility checks */ |
6377 | |
6378 | if ((options & ~PUBLIC_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION; |
6379 | if (code == NULL || subject == NULL || match_data == NULL) |
6380 | return PCRE2_ERROR_NULL; |
6381 | |
6382 | start_match = subject + start_offset; |
6383 | req_cu_ptr = start_match - 1; |
6384 | if (length == PCRE2_ZERO_TERMINATED) |
6385 | { |
6386 | length = PRIV(strlen)(subject); |
6387 | was_zero_terminated = 1; |
6388 | } |
6389 | true_end_subject = end_subject = subject + length; |
6390 | |
6391 | if (start_offset > length) return PCRE2_ERROR_BADOFFSET; |
6392 | |
6393 | /* Check that the first field in the block is the magic number. */ |
6394 | |
6395 | if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC; |
6396 | |
6397 | /* Check the code unit width. */ |
6398 | |
6399 | if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8) |
6400 | return PCRE2_ERROR_BADMODE; |
6401 | |
6402 | /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the |
6403 | options variable for this function. Users of PCRE2 who are not calling the |
6404 | function directly would like to have a way of setting these flags, in the same |
6405 | way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with |
6406 | constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and |
6407 | (*NOTEMPTY_ATSTART) set bits in the pattern's "flags" field, which we now |
6408 | transfer to the options for this function. The bits are guaranteed to be |
6409 | adjacent, but do not have the same values. This bit of Boolean trickery assumes |
6410 | that the match-time bits are not more significant than the flag bits. If by |
6411 | accident this is not the case, a compile-time division by zero error will |
6412 | occur. */ |
6413 | |
6414 | #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET) |
6415 | #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART) |
6416 | options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1))); |
6417 | #undef FF |
6418 | #undef OO |
6419 | |
6420 | /* If the pattern was successfully studied with JIT support, we will run the |
6421 | JIT executable instead of the rest of this function. Most options must be set |
6422 | at compile time for the JIT code to be usable. */ |
6423 | |
6424 | #ifdef SUPPORT_JIT |
6425 | use_jit = (re->executable_jit != NULL && |
6426 | (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0); |
6427 | #endif |
6428 | |
6429 | /* Initialize UTF/UCP parameters. */ |
6430 | |
6431 | #ifdef SUPPORT_UNICODE |
6432 | utf = (re->overall_options & PCRE2_UTF) != 0; |
6433 | allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0; |
6434 | ucp = (re->overall_options & PCRE2_UCP) != 0; |
6435 | #endif /* SUPPORT_UNICODE */ |
6436 | |
6437 | /* Convert the partial matching flags into an integer. */ |
6438 | |
6439 | mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 : |
6440 | ((options & PCRE2_PARTIAL_SOFT) != 0)? 1 : 0; |
6441 | |
6442 | /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same |
6443 | time. */ |
6444 | |
6445 | if (mb->partial != 0 && |
6446 | ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0) |
6447 | return PCRE2_ERROR_BADOPTION; |
6448 | |
6449 | /* It is an error to set an offset limit without setting the flag at compile |
6450 | time. */ |
6451 | |
6452 | if (mcontext != NULL && mcontext->offset_limit != PCRE2_UNSET && |
6453 | (re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0) |
6454 | return PCRE2_ERROR_BADOFFSETLIMIT; |
6455 | |
6456 | /* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT, |
6457 | free the memory that was obtained. Set the field to NULL for no match cases. */ |
6458 | |
6459 | if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0) |
6460 | { |
6461 | match_data->memctl.free((void *)match_data->subject, |
6462 | match_data->memctl.memory_data); |
6463 | match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT; |
6464 | } |
6465 | match_data->subject = NULL; |
6466 | |
6467 | /* Zero the error offset in case the first code unit is invalid UTF. */ |
6468 | |
6469 | match_data->startchar = 0; |
6470 | |
6471 | |
6472 | /* ============================= JIT matching ============================== */ |
6473 | |
6474 | /* Prepare for JIT matching. Check a UTF string for validity unless no check is |
6475 | requested or invalid UTF can be handled. We check only the portion of the |
6476 | subject that might be inspected during matching - from the offset minus the |
6477 | maximum lookbehind to the given length. This saves time when a small part of a |
6478 | large subject is being matched by the use of a starting offset. Note that the |
6479 | maximum lookbehind is a number of characters, not code units. */ |
6480 | |
6481 | #ifdef SUPPORT_JIT |
6482 | if (use_jit) |
6483 | { |
6484 | #ifdef SUPPORT_UNICODE |
6485 | if (utf && (options & PCRE2_NO_UTF_CHECK) == 0 && !allow_invalid) |
6486 | { |
6487 | #if PCRE2_CODE_UNIT_WIDTH != 32 |
6488 | unsigned int i; |
6489 | #endif |
6490 | |
6491 | /* For 8-bit and 16-bit UTF, check that the first code unit is a valid |
6492 | character start. */ |
6493 | |
6494 | #if PCRE2_CODE_UNIT_WIDTH != 32 |
6495 | if (start_match < end_subject && NOT_FIRSTCU(*start_match)) |
6496 | { |
6497 | if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET; |
6498 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
6499 | return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */ |
6500 | #else |
6501 | return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */ |
6502 | #endif |
6503 | } |
6504 | #endif /* WIDTH != 32 */ |
6505 | |
6506 | /* Move back by the maximum lookbehind, just in case it happens at the very |
6507 | start of matching. */ |
6508 | |
6509 | #if PCRE2_CODE_UNIT_WIDTH != 32 |
6510 | for (i = re->max_lookbehind; i > 0 && start_match > subject; i--) |
6511 | { |
6512 | start_match--; |
6513 | while (start_match > subject && |
6514 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
6515 | (*start_match & 0xc0) == 0x80) |
6516 | #else /* 16-bit */ |
6517 | (*start_match & 0xfc00) == 0xdc00) |
6518 | #endif |
6519 | start_match--; |
6520 | } |
6521 | #else /* PCRE2_CODE_UNIT_WIDTH != 32 */ |
6522 | |
6523 | /* In the 32-bit library, one code unit equals one character. However, |
6524 | we cannot just subtract the lookbehind and then compare pointers, because |
6525 | a very large lookbehind could create an invalid pointer. */ |
6526 | |
6527 | if (start_offset >= re->max_lookbehind) |
6528 | start_match -= re->max_lookbehind; |
6529 | else |
6530 | start_match = subject; |
6531 | #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ |
6532 | |
6533 | /* Validate the relevant portion of the subject. Adjust the offset of an |
6534 | invalid code point to be an absolute offset in the whole string. */ |
6535 | |
6536 | match_data->rc = PRIV(valid_utf)(start_match, |
6537 | length - (start_match - subject), &(match_data->startchar)); |
6538 | if (match_data->rc != 0) |
6539 | { |
6540 | match_data->startchar += start_match - subject; |
6541 | return match_data->rc; |
6542 | } |
6543 | jit_checked_utf = TRUE; |
6544 | } |
6545 | #endif /* SUPPORT_UNICODE */ |
6546 | |
6547 | /* If JIT returns BADOPTION, which means that the selected complete or |
6548 | partial matching mode was not compiled, fall through to the interpreter. */ |
6549 | |
6550 | rc = pcre2_jit_match(code, subject, length, start_offset, options, |
6551 | match_data, mcontext); |
6552 | if (rc != PCRE2_ERROR_JIT_BADOPTION) |
6553 | { |
6554 | if (rc >= 0 && (options & PCRE2_COPY_MATCHED_SUBJECT) != 0) |
6555 | { |
6556 | length = CU2BYTES(length + was_zero_terminated); |
6557 | match_data->subject = match_data->memctl.malloc(length, |
6558 | match_data->memctl.memory_data); |
6559 | if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY; |
6560 | memcpy((void *)match_data->subject, subject, length); |
6561 | match_data->flags |= PCRE2_MD_COPIED_SUBJECT; |
6562 | } |
6563 | return rc; |
6564 | } |
6565 | } |
6566 | #endif /* SUPPORT_JIT */ |
6567 | |
6568 | /* ========================= End of JIT matching ========================== */ |
6569 | |
6570 | |
6571 | /* Proceed with non-JIT matching. The default is to allow lookbehinds to the |
6572 | start of the subject. A UTF check when there is a non-zero offset may change |
6573 | this. */ |
6574 | |
6575 | mb->check_subject = subject; |
6576 | |
6577 | /* If a UTF subject string was not checked for validity in the JIT code above, |
6578 | check it here, and handle support for invalid UTF strings. The check above |
6579 | happens only when invalid UTF is not supported and PCRE2_NO_UTF_CHECK is unset. |
6580 | If we get here in those circumstances, it means the subject string is valid, |
6581 | but for some reason JIT matching was not successful. There is no need to check |
6582 | the subject again. |
6583 | |
6584 | We check only the portion of the subject that might be inspected during |
6585 | matching - from the offset minus the maximum lookbehind to the given length. |
6586 | This saves time when a small part of a large subject is being matched by the |
6587 | use of a starting offset. Note that the maximum lookbehind is a number of |
6588 | characters, not code units. |
6589 | |
6590 | Note also that support for invalid UTF forces a check, overriding the setting |
6591 | of PCRE2_NO_UTF_CHECK. */ |
6592 | |
6593 | #ifdef SUPPORT_UNICODE |
6594 | if (utf && |
6595 | #ifdef SUPPORT_JIT |
6596 | !jit_checked_utf && |
6597 | #endif |
6598 | ((options & PCRE2_NO_UTF_CHECK) == 0 || allow_invalid)) |
6599 | { |
6600 | #if PCRE2_CODE_UNIT_WIDTH != 32 |
6601 | BOOL skipped_bad_start = FALSE; |
6602 | #endif |
6603 | |
6604 | /* For 8-bit and 16-bit UTF, check that the first code unit is a valid |
6605 | character start. If we are handling invalid UTF, just skip over such code |
6606 | units. Otherwise, give an appropriate error. */ |
6607 | |
6608 | #if PCRE2_CODE_UNIT_WIDTH != 32 |
6609 | if (allow_invalid) |
6610 | { |
6611 | while (start_match < end_subject && NOT_FIRSTCU(*start_match)) |
6612 | { |
6613 | start_match++; |
6614 | skipped_bad_start = TRUE; |
6615 | } |
6616 | } |
6617 | else if (start_match < end_subject && NOT_FIRSTCU(*start_match)) |
6618 | { |
6619 | if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET; |
6620 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
6621 | return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */ |
6622 | #else |
6623 | return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */ |
6624 | #endif |
6625 | } |
6626 | #endif /* WIDTH != 32 */ |
6627 | |
6628 | /* The mb->check_subject field points to the start of UTF checking; |
6629 | lookbehinds can go back no further than this. */ |
6630 | |
6631 | mb->check_subject = start_match; |
6632 | |
6633 | /* Move back by the maximum lookbehind, just in case it happens at the very |
6634 | start of matching, but don't do this if we skipped bad 8-bit or 16-bit code |
6635 | units above. */ |
6636 | |
6637 | #if PCRE2_CODE_UNIT_WIDTH != 32 |
6638 | if (!skipped_bad_start) |
6639 | { |
6640 | unsigned int i; |
6641 | for (i = re->max_lookbehind; i > 0 && mb->check_subject > subject; i--) |
6642 | { |
6643 | mb->check_subject--; |
6644 | while (mb->check_subject > subject && |
6645 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
6646 | (*mb->check_subject & 0xc0) == 0x80) |
6647 | #else /* 16-bit */ |
6648 | (*mb->check_subject & 0xfc00) == 0xdc00) |
6649 | #endif |
6650 | mb->check_subject--; |
6651 | } |
6652 | } |
6653 | #else /* PCRE2_CODE_UNIT_WIDTH != 32 */ |
6654 | |
6655 | /* In the 32-bit library, one code unit equals one character. However, |
6656 | we cannot just subtract the lookbehind and then compare pointers, because |
6657 | a very large lookbehind could create an invalid pointer. */ |
6658 | |
6659 | if (start_offset >= re->max_lookbehind) |
6660 | mb->check_subject -= re->max_lookbehind; |
6661 | else |
6662 | mb->check_subject = subject; |
6663 | #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ |
6664 | |
6665 | /* Validate the relevant portion of the subject. There's a loop in case we |
6666 | encounter bad UTF in the characters preceding start_match which we are |
6667 | scanning because of a lookbehind. */ |
6668 | |
6669 | for (;;) |
6670 | { |
6671 | match_data->rc = PRIV(valid_utf)(mb->check_subject, |
6672 | length - (mb->check_subject - subject), &(match_data->startchar)); |
6673 | |
6674 | if (match_data->rc == 0) break; /* Valid UTF string */ |
6675 | |
6676 | /* Invalid UTF string. Adjust the offset to be an absolute offset in the |
6677 | whole string. If we are handling invalid UTF strings, set end_subject to |
6678 | stop before the bad code unit, and set the options to "not end of line". |
6679 | Otherwise return the error. */ |
6680 | |
6681 | match_data->startchar += mb->check_subject - subject; |
6682 | if (!allow_invalid || match_data->rc > 0) return match_data->rc; |
6683 | end_subject = subject + match_data->startchar; |
6684 | |
6685 | /* If the end precedes start_match, it means there is invalid UTF in the |
6686 | extra code units we reversed over because of a lookbehind. Advance past the |
6687 | first bad code unit, and then skip invalid character starting code units in |
6688 | 8-bit and 16-bit modes, and try again with the original end point. */ |
6689 | |
6690 | if (end_subject < start_match) |
6691 | { |
6692 | mb->check_subject = end_subject + 1; |
6693 | #if PCRE2_CODE_UNIT_WIDTH != 32 |
6694 | while (mb->check_subject < start_match && NOT_FIRSTCU(*mb->check_subject)) |
6695 | mb->check_subject++; |
6696 | #endif |
6697 | end_subject = true_end_subject; |
6698 | } |
6699 | |
6700 | /* Otherwise, set the not end of line option, and do the match. */ |
6701 | |
6702 | else |
6703 | { |
6704 | fragment_options = PCRE2_NOTEOL; |
6705 | break; |
6706 | } |
6707 | } |
6708 | } |
6709 | #endif /* SUPPORT_UNICODE */ |
6710 | |
6711 | /* A NULL match context means "use a default context", but we take the memory |
6712 | control functions from the pattern. */ |
6713 | |
6714 | if (mcontext == NULL) |
6715 | { |
6716 | mcontext = (pcre2_match_context *)(&PRIV(default_match_context)); |
6717 | mb->memctl = re->memctl; |
6718 | } |
6719 | else mb->memctl = mcontext->memctl; |
6720 | |
6721 | anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0; |
6722 | firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0; |
6723 | startline = (re->flags & PCRE2_STARTLINE) != 0; |
6724 | bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)? |
6725 | true_end_subject : subject + mcontext->offset_limit; |
6726 | |
6727 | /* Initialize and set up the fixed fields in the callout block, with a pointer |
6728 | in the match block. */ |
6729 | |
6730 | mb->cb = &cb; |
6731 | cb.version = 2; |
6732 | cb.subject = subject; |
6733 | cb.subject_length = (PCRE2_SIZE)(end_subject - subject); |
6734 | cb.callout_flags = 0; |
6735 | |
6736 | /* Fill in the remaining fields in the match block, except for moptions, which |
6737 | gets set later. */ |
6738 | |
6739 | mb->callout = mcontext->callout; |
6740 | mb->callout_data = mcontext->callout_data; |
6741 | |
6742 | mb->start_subject = subject; |
6743 | mb->start_offset = start_offset; |
6744 | mb->end_subject = end_subject; |
6745 | mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0; |
6746 | mb->allowemptypartial = (re->max_lookbehind > 0) || |
6747 | (re->flags & PCRE2_MATCH_EMPTY) != 0; |
6748 | mb->poptions = re->overall_options; /* Pattern options */ |
6749 | mb->ignore_skip_arg = 0; |
6750 | mb->mark = mb->nomatch_mark = NULL; /* In case never set */ |
6751 | |
6752 | /* The name table is needed for finding all the numbers associated with a |
6753 | given name, for condition testing. The code follows the name table. */ |
6754 | |
6755 | mb->name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)); |
6756 | mb->name_count = re->name_count; |
6757 | mb->name_entry_size = re->name_entry_size; |
6758 | mb->start_code = mb->name_table + re->name_count * re->name_entry_size; |
6759 | |
6760 | /* Process the \R and newline settings. */ |
6761 | |
6762 | mb->bsr_convention = re->bsr_convention; |
6763 | mb->nltype = NLTYPE_FIXED; |
6764 | switch(re->newline_convention) |
6765 | { |
6766 | case PCRE2_NEWLINE_CR: |
6767 | mb->nllen = 1; |
6768 | mb->nl[0] = CHAR_CR; |
6769 | break; |
6770 | |
6771 | case PCRE2_NEWLINE_LF: |
6772 | mb->nllen = 1; |
6773 | mb->nl[0] = CHAR_NL; |
6774 | break; |
6775 | |
6776 | case PCRE2_NEWLINE_NUL: |
6777 | mb->nllen = 1; |
6778 | mb->nl[0] = CHAR_NUL; |
6779 | break; |
6780 | |
6781 | case PCRE2_NEWLINE_CRLF: |
6782 | mb->nllen = 2; |
6783 | mb->nl[0] = CHAR_CR; |
6784 | mb->nl[1] = CHAR_NL; |
6785 | break; |
6786 | |
6787 | case PCRE2_NEWLINE_ANY: |
6788 | mb->nltype = NLTYPE_ANY; |
6789 | break; |
6790 | |
6791 | case PCRE2_NEWLINE_ANYCRLF: |
6792 | mb->nltype = NLTYPE_ANYCRLF; |
6793 | break; |
6794 | |
6795 | default: return PCRE2_ERROR_INTERNAL; |
6796 | } |
6797 | |
6798 | /* The backtracking frames have fixed data at the front, and a PCRE2_SIZE |
6799 | vector at the end, whose size depends on the number of capturing parentheses in |
6800 | the pattern. It is not used at all if there are no capturing parentheses. |
6801 | |
6802 | frame_size is the total size of each frame |
6803 | match_data->heapframes is the pointer to the frames vector |
6804 | match_data->heapframes_size is the total size of the vector |
6805 | |
6806 | We must pad the frame_size for alignment to ensure subsequent frames are as |
6807 | aligned as heapframe. Whilst ovector is word-aligned due to being a PCRE2_SIZE |
6808 | array, that does not guarantee it is suitably aligned for pointers, as some |
6809 | architectures have pointers that are larger than a size_t. */ |
6810 | |
6811 | frame_size = (offsetof(heapframe, ovector) + |
6812 | re->top_bracket * 2 * sizeof(PCRE2_SIZE) + HEAPFRAME_ALIGNMENT - 1) & |
6813 | ~(HEAPFRAME_ALIGNMENT - 1); |
6814 | |
6815 | /* Limits set in the pattern override the match context only if they are |
6816 | smaller. */ |
6817 | |
6818 | mb->heap_limit = ((mcontext->heap_limit < re->limit_heap)? |
6819 | mcontext->heap_limit : re->limit_heap) * 1024; |
6820 | |
6821 | mb->match_limit = (mcontext->match_limit < re->limit_match)? |
6822 | mcontext->match_limit : re->limit_match; |
6823 | |
6824 | mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)? |
6825 | mcontext->depth_limit : re->limit_depth; |
6826 | |
6827 | /* If a pattern has very many capturing parentheses, the frame size may be very |
6828 | large. Set the initial frame vector size to ensure that there are at least 10 |
6829 | available frames, but enforce a minimum of START_FRAMES_SIZE. If this is |
6830 | greater than the heap limit, get as large a vector as possible. Always round |
6831 | the size to a multiple of the frame size. */ |
6832 | |
6833 | heapframes_size = frame_size * 10; |
6834 | if (heapframes_size < START_FRAMES_SIZE) heapframes_size = START_FRAMES_SIZE; |
6835 | if (heapframes_size > mb->heap_limit) |
6836 | { |
6837 | if (frame_size > mb->heap_limit ) return PCRE2_ERROR_HEAPLIMIT; |
6838 | heapframes_size = mb->heap_limit; |
6839 | } |
6840 | |
6841 | /* If an existing frame vector in the match_data block is large enough, we can |
6842 | use it. Otherwise, free any pre-existing vector and get a new one. */ |
6843 | |
6844 | if (match_data->heapframes_size < heapframes_size) |
6845 | { |
6846 | match_data->memctl.free(match_data->heapframes, |
6847 | match_data->memctl.memory_data); |
6848 | match_data->heapframes = match_data->memctl.malloc(heapframes_size, |
6849 | match_data->memctl.memory_data); |
6850 | if (match_data->heapframes == NULL) |
6851 | { |
6852 | match_data->heapframes_size = 0; |
6853 | return PCRE2_ERROR_NOMEMORY; |
6854 | } |
6855 | match_data->heapframes_size = heapframes_size; |
6856 | } |
6857 | |
6858 | /* Write to the ovector within the first frame to mark every capture unset and |
6859 | to avoid uninitialized memory read errors when it is copied to a new frame. */ |
6860 | |
6861 | memset((char *)(match_data->heapframes) + offsetof(heapframe, ovector), 0xff, |
6862 | frame_size - offsetof(heapframe, ovector)); |
6863 | |
6864 | /* Pointers to the individual character tables */ |
6865 | |
6866 | mb->lcc = re->tables + lcc_offset; |
6867 | mb->fcc = re->tables + fcc_offset; |
6868 | mb->ctypes = re->tables + ctypes_offset; |
6869 | |
6870 | /* Set up the first code unit to match, if available. If there's no first code |
6871 | unit there may be a bitmap of possible first characters. */ |
6872 | |
6873 | if ((re->flags & PCRE2_FIRSTSET) != 0) |
6874 | { |
6875 | has_first_cu = TRUE; |
6876 | first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit); |
6877 | if ((re->flags & PCRE2_FIRSTCASELESS) != 0) |
6878 | { |
6879 | first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu); |
6880 | #ifdef SUPPORT_UNICODE |
6881 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
6882 | if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu); |
6883 | #else |
6884 | if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu); |
6885 | #endif |
6886 | #endif /* SUPPORT_UNICODE */ |
6887 | } |
6888 | } |
6889 | else |
6890 | if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0) |
6891 | start_bits = re->start_bitmap; |
6892 | |
6893 | /* There may also be a "last known required character" set. */ |
6894 | |
6895 | if ((re->flags & PCRE2_LASTSET) != 0) |
6896 | { |
6897 | has_req_cu = TRUE; |
6898 | req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit); |
6899 | if ((re->flags & PCRE2_LASTCASELESS) != 0) |
6900 | { |
6901 | req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu); |
6902 | #ifdef SUPPORT_UNICODE |
6903 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
6904 | if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu); |
6905 | #else |
6906 | if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu); |
6907 | #endif |
6908 | #endif /* SUPPORT_UNICODE */ |
6909 | } |
6910 | } |
6911 | |
6912 | |
6913 | /* ==========================================================================*/ |
6914 | |
6915 | /* Loop for handling unanchored repeated matching attempts; for anchored regexes |
6916 | the loop runs just once. */ |
6917 | |
6918 | #ifdef SUPPORT_UNICODE |
6919 | FRAGMENT_RESTART: |
6920 | #endif |
6921 | |
6922 | start_partial = match_partial = NULL; |
6923 | mb->hitend = FALSE; |
6924 | |
6925 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
6926 | memchr_found_first_cu = NULL; |
6927 | memchr_found_first_cu2 = NULL; |
6928 | #endif |
6929 | |
6930 | for(;;) |
6931 | { |
6932 | PCRE2_SPTR new_start_match; |
6933 | |
6934 | /* ----------------- Start of match optimizations ---------------- */ |
6935 | |
6936 | /* There are some optimizations that avoid running the match if a known |
6937 | starting point is not found, or if a known later code unit is not present. |
6938 | However, there is an option (settable at compile time) that disables these, |
6939 | for testing and for ensuring that all callouts do actually occur. */ |
6940 | |
6941 | if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) |
6942 | { |
6943 | /* If firstline is TRUE, the start of the match is constrained to the first |
6944 | line of a multiline string. That is, the match must be before or at the |
6945 | first newline following the start of matching. Temporarily adjust |
6946 | end_subject so that we stop the scans for a first code unit at a newline. |
6947 | If the match fails at the newline, later code breaks the loop. */ |
6948 | |
6949 | if (firstline) |
6950 | { |
6951 | PCRE2_SPTR t = start_match; |
6952 | #ifdef SUPPORT_UNICODE |
6953 | if (utf) |
6954 | { |
6955 | while (t < end_subject && !IS_NEWLINE(t)) |
6956 | { |
6957 | t++; |
6958 | ACROSSCHAR(t < end_subject, t, t++); |
6959 | } |
6960 | } |
6961 | else |
6962 | #endif |
6963 | while (t < end_subject && !IS_NEWLINE(t)) t++; |
6964 | end_subject = t; |
6965 | } |
6966 | |
6967 | /* Anchored: check the first code unit if one is recorded. This may seem |
6968 | pointless but it can help in detecting a no match case without scanning for |
6969 | the required code unit. */ |
6970 | |
6971 | if (anchored) |
6972 | { |
6973 | if (has_first_cu || start_bits != NULL) |
6974 | { |
6975 | BOOL ok = start_match < end_subject; |
6976 | if (ok) |
6977 | { |
6978 | PCRE2_UCHAR c = UCHAR21TEST(start_match); |
6979 | ok = has_first_cu && (c == first_cu || c == first_cu2); |
6980 | if (!ok && start_bits != NULL) |
6981 | { |
6982 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
6983 | if (c > 255) c = 255; |
6984 | #endif |
6985 | ok = (start_bits[c/8] & (1u << (c&7))) != 0; |
6986 | } |
6987 | } |
6988 | if (!ok) |
6989 | { |
6990 | rc = MATCH_NOMATCH; |
6991 | break; |
6992 | } |
6993 | } |
6994 | } |
6995 | |
6996 | /* Not anchored. Advance to a unique first code unit if there is one. */ |
6997 | |
6998 | else |
6999 | { |
7000 | if (has_first_cu) |
7001 | { |
7002 | if (first_cu != first_cu2) /* Caseless */ |
7003 | { |
7004 | /* In 16-bit and 32-bit modes we have to do our own search, so can |
7005 | look for both cases at once. */ |
7006 | |
7007 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
7008 | PCRE2_UCHAR smc; |
7009 | while (start_match < end_subject && |
7010 | (smc = UCHAR21TEST(start_match)) != first_cu && |
7011 | smc != first_cu2) |
7012 | start_match++; |
7013 | #else |
7014 | /* In 8-bit mode, the use of memchr() gives a big speed up, even |
7015 | though we have to call it twice in order to find the earliest |
7016 | occurrence of the code unit in either of its cases. Caching is used |
7017 | to remember the positions of previously found code units. This can |
7018 | make a huge difference when the strings are very long and only one |
7019 | case is actually present. */ |
7020 | |
7021 | PCRE2_SPTR pp1 = NULL; |
7022 | PCRE2_SPTR pp2 = NULL; |
7023 | PCRE2_SIZE searchlength = end_subject - start_match; |
7024 | |
7025 | /* If we haven't got a previously found position for first_cu, or if |
7026 | the current starting position is later, we need to do a search. If |
7027 | the code unit is not found, set it to the end. */ |
7028 | |
7029 | if (memchr_found_first_cu == NULL || |
7030 | start_match > memchr_found_first_cu) |
7031 | { |
7032 | pp1 = memchr(start_match, first_cu, searchlength); |
7033 | memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1; |
7034 | } |
7035 | |
7036 | /* If the start is before a previously found position, use the |
7037 | previous position, or NULL if a previous search failed. */ |
7038 | |
7039 | else pp1 = (memchr_found_first_cu == end_subject)? NULL : |
7040 | memchr_found_first_cu; |
7041 | |
7042 | /* Do the same thing for the other case. */ |
7043 | |
7044 | if (memchr_found_first_cu2 == NULL || |
7045 | start_match > memchr_found_first_cu2) |
7046 | { |
7047 | pp2 = memchr(start_match, first_cu2, searchlength); |
7048 | memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2; |
7049 | } |
7050 | |
7051 | else pp2 = (memchr_found_first_cu2 == end_subject)? NULL : |
7052 | memchr_found_first_cu2; |
7053 | |
7054 | /* Set the start to the end of the subject if neither case was found. |
7055 | Otherwise, use the earlier found point. */ |
7056 | |
7057 | if (pp1 == NULL) |
7058 | start_match = (pp2 == NULL)? end_subject : pp2; |
7059 | else |
7060 | start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2; |
7061 | |
7062 | #endif /* 8-bit handling */ |
7063 | } |
7064 | |
7065 | /* The caseful case is much simpler. */ |
7066 | |
7067 | else |
7068 | { |
7069 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
7070 | while (start_match < end_subject && UCHAR21TEST(start_match) != |
7071 | first_cu) |
7072 | start_match++; |
7073 | #else |
7074 | start_match = memchr(start_match, first_cu, end_subject - start_match); |
7075 | if (start_match == NULL) start_match = end_subject; |
7076 | #endif |
7077 | } |
7078 | |
7079 | /* If we can't find the required first code unit, having reached the |
7080 | true end of the subject, break the bumpalong loop, to force a match |
7081 | failure, except when doing partial matching, when we let the next cycle |
7082 | run at the end of the subject. To see why, consider the pattern |
7083 | /(?<=abc)def/, which partially matches "abc", even though the string |
7084 | does not contain the starting character "d". If we have not reached the |
7085 | true end of the subject (PCRE2_FIRSTLINE caused end_subject to be |
7086 | temporarily modified) we also let the cycle run, because the matching |
7087 | string is legitimately allowed to start with the first code unit of a |
7088 | newline. */ |
7089 | |
7090 | if (mb->partial == 0 && start_match >= mb->end_subject) |
7091 | { |
7092 | rc = MATCH_NOMATCH; |
7093 | break; |
7094 | } |
7095 | } |
7096 | |
7097 | /* If there's no first code unit, advance to just after a linebreak for a |
7098 | multiline match if required. */ |
7099 | |
7100 | else if (startline) |
7101 | { |
7102 | if (start_match > mb->start_subject + start_offset) |
7103 | { |
7104 | #ifdef SUPPORT_UNICODE |
7105 | if (utf) |
7106 | { |
7107 | while (start_match < end_subject && !WAS_NEWLINE(start_match)) |
7108 | { |
7109 | start_match++; |
7110 | ACROSSCHAR(start_match < end_subject, start_match, start_match++); |
7111 | } |
7112 | } |
7113 | else |
7114 | #endif |
7115 | while (start_match < end_subject && !WAS_NEWLINE(start_match)) |
7116 | start_match++; |
7117 | |
7118 | /* If we have just passed a CR and the newline option is ANY or |
7119 | ANYCRLF, and we are now at a LF, advance the match position by one |
7120 | more code unit. */ |
7121 | |
7122 | if (start_match[-1] == CHAR_CR && |
7123 | (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) && |
7124 | start_match < end_subject && |
7125 | UCHAR21TEST(start_match) == CHAR_NL) |
7126 | start_match++; |
7127 | } |
7128 | } |
7129 | |
7130 | /* If there's no first code unit or a requirement for a multiline line |
7131 | start, advance to a non-unique first code unit if any have been |
7132 | identified. The bitmap contains only 256 bits. When code units are 16 or |
7133 | 32 bits wide, all code units greater than 254 set the 255 bit. */ |
7134 | |
7135 | else if (start_bits != NULL) |
7136 | { |
7137 | while (start_match < end_subject) |
7138 | { |
7139 | uint32_t c = UCHAR21TEST(start_match); |
7140 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
7141 | if (c > 255) c = 255; |
7142 | #endif |
7143 | if ((start_bits[c/8] & (1u << (c&7))) != 0) break; |
7144 | start_match++; |
7145 | } |
7146 | |
7147 | /* See comment above in first_cu checking about the next few lines. */ |
7148 | |
7149 | if (mb->partial == 0 && start_match >= mb->end_subject) |
7150 | { |
7151 | rc = MATCH_NOMATCH; |
7152 | break; |
7153 | } |
7154 | } |
7155 | } /* End first code unit handling */ |
7156 | |
7157 | /* Restore fudged end_subject */ |
7158 | |
7159 | end_subject = mb->end_subject; |
7160 | |
7161 | /* The following two optimizations must be disabled for partial matching. */ |
7162 | |
7163 | if (mb->partial == 0) |
7164 | { |
7165 | PCRE2_SPTR p; |
7166 | |
7167 | /* The minimum matching length is a lower bound; no string of that length |
7168 | may actually match the pattern. Although the value is, strictly, in |
7169 | characters, we treat it as code units to avoid spending too much time in |
7170 | this optimization. */ |
7171 | |
7172 | if (end_subject - start_match < re->minlength) |
7173 | { |
7174 | rc = MATCH_NOMATCH; |
7175 | break; |
7176 | } |
7177 | |
7178 | /* If req_cu is set, we know that that code unit must appear in the |
7179 | subject for the (non-partial) match to succeed. If the first code unit is |
7180 | set, req_cu must be later in the subject; otherwise the test starts at |
7181 | the match point. This optimization can save a huge amount of backtracking |
7182 | in patterns with nested unlimited repeats that aren't going to match. |
7183 | Writing separate code for caseful/caseless versions makes it go faster, |
7184 | as does using an autoincrement and backing off on a match. As in the case |
7185 | of the first code unit, using memchr() in the 8-bit library gives a big |
7186 | speed up. Unlike the first_cu check above, we do not need to call |
7187 | memchr() twice in the caseless case because we only need to check for the |
7188 | presence of the character in either case, not find the first occurrence. |
7189 | |
7190 | The search can be skipped if the code unit was found later than the |
7191 | current starting point in a previous iteration of the bumpalong loop. |
7192 | |
7193 | HOWEVER: when the subject string is very, very long, searching to its end |
7194 | can take a long time, and give bad performance on quite ordinary |
7195 | anchored patterns. This showed up when somebody was matching something |
7196 | like /^\d+C/ on a 32-megabyte string... so we don't do this when the |
7197 | string is sufficiently long, but it's worth searching a lot more for |
7198 | unanchored patterns. */ |
7199 | |
7200 | p = start_match + (has_first_cu? 1:0); |
7201 | if (has_req_cu && p > req_cu_ptr) |
7202 | { |
7203 | PCRE2_SIZE check_length = end_subject - start_match; |
7204 | |
7205 | if (check_length < REQ_CU_MAX || |
7206 | (!anchored && check_length < REQ_CU_MAX * 1000)) |
7207 | { |
7208 | if (req_cu != req_cu2) /* Caseless */ |
7209 | { |
7210 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
7211 | while (p < end_subject) |
7212 | { |
7213 | uint32_t pp = UCHAR21INCTEST(p); |
7214 | if (pp == req_cu || pp == req_cu2) { p--; break; } |
7215 | } |
7216 | #else /* 8-bit code units */ |
7217 | PCRE2_SPTR pp = p; |
7218 | p = memchr(pp, req_cu, end_subject - pp); |
7219 | if (p == NULL) |
7220 | { |
7221 | p = memchr(pp, req_cu2, end_subject - pp); |
7222 | if (p == NULL) p = end_subject; |
7223 | } |
7224 | #endif /* PCRE2_CODE_UNIT_WIDTH != 8 */ |
7225 | } |
7226 | |
7227 | /* The caseful case */ |
7228 | |
7229 | else |
7230 | { |
7231 | #if PCRE2_CODE_UNIT_WIDTH != 8 |
7232 | while (p < end_subject) |
7233 | { |
7234 | if (UCHAR21INCTEST(p) == req_cu) { p--; break; } |
7235 | } |
7236 | |
7237 | #else /* 8-bit code units */ |
7238 | p = memchr(p, req_cu, end_subject - p); |
7239 | if (p == NULL) p = end_subject; |
7240 | #endif |
7241 | } |
7242 | |
7243 | /* If we can't find the required code unit, break the bumpalong loop, |
7244 | forcing a match failure. */ |
7245 | |
7246 | if (p >= end_subject) |
7247 | { |
7248 | rc = MATCH_NOMATCH; |
7249 | break; |
7250 | } |
7251 | |
7252 | /* If we have found the required code unit, save the point where we |
7253 | found it, so that we don't search again next time round the bumpalong |
7254 | loop if the start hasn't yet passed this code unit. */ |
7255 | |
7256 | req_cu_ptr = p; |
7257 | } |
7258 | } |
7259 | } |
7260 | } |
7261 | |
7262 | /* ------------ End of start of match optimizations ------------ */ |
7263 | |
7264 | /* Give no match if we have passed the bumpalong limit. */ |
7265 | |
7266 | if (start_match > bumpalong_limit) |
7267 | { |
7268 | rc = MATCH_NOMATCH; |
7269 | break; |
7270 | } |
7271 | |
7272 | /* OK, we can now run the match. If "hitend" is set afterwards, remember the |
7273 | first starting point for which a partial match was found. */ |
7274 | |
7275 | cb.start_match = (PCRE2_SIZE)(start_match - subject); |
7276 | cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH; |
7277 | |
7278 | mb->start_used_ptr = start_match; |
7279 | mb->last_used_ptr = start_match; |
7280 | #ifdef SUPPORT_UNICODE |
7281 | mb->moptions = options | fragment_options; |
7282 | #else |
7283 | mb->moptions = options; |
7284 | #endif |
7285 | mb->match_call_count = 0; |
7286 | mb->end_offset_top = 0; |
7287 | mb->skip_arg_count = 0; |
7288 | |
7289 | rc = match(start_match, mb->start_code, re->top_bracket, frame_size, |
7290 | match_data, mb); |
7291 | |
7292 | if (mb->hitend && start_partial == NULL) |
7293 | { |
7294 | start_partial = mb->start_used_ptr; |
7295 | match_partial = start_match; |
7296 | } |
7297 | |
7298 | switch(rc) |
7299 | { |
7300 | /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched |
7301 | the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP |
7302 | entirely. The only way we can do that is to re-do the match at the same |
7303 | point, with a flag to force SKIP with an argument to be ignored. Just |
7304 | treating this case as NOMATCH does not work because it does not check other |
7305 | alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */ |
7306 | |
7307 | case MATCH_SKIP_ARG: |
7308 | new_start_match = start_match; |
7309 | mb->ignore_skip_arg = mb->skip_arg_count; |
7310 | break; |
7311 | |
7312 | /* SKIP passes back the next starting point explicitly, but if it is no |
7313 | greater than the match we have just done, treat it as NOMATCH. */ |
7314 | |
7315 | case MATCH_SKIP: |
7316 | if (mb->verb_skip_ptr > start_match) |
7317 | { |
7318 | new_start_match = mb->verb_skip_ptr; |
7319 | break; |
7320 | } |
7321 | /* Fall through */ |
7322 | |
7323 | /* NOMATCH and PRUNE advance by one character. THEN at this level acts |
7324 | exactly like PRUNE. Unset ignore SKIP-with-argument. */ |
7325 | |
7326 | case MATCH_NOMATCH: |
7327 | case MATCH_PRUNE: |
7328 | case MATCH_THEN: |
7329 | mb->ignore_skip_arg = 0; |
7330 | new_start_match = start_match + 1; |
7331 | #ifdef SUPPORT_UNICODE |
7332 | if (utf) |
7333 | ACROSSCHAR(new_start_match < end_subject, new_start_match, |
7334 | new_start_match++); |
7335 | #endif |
7336 | break; |
7337 | |
7338 | /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */ |
7339 | |
7340 | case MATCH_COMMIT: |
7341 | rc = MATCH_NOMATCH; |
7342 | goto ENDLOOP; |
7343 | |
7344 | /* Any other return is either a match, or some kind of error. */ |
7345 | |
7346 | default: |
7347 | goto ENDLOOP; |
7348 | } |
7349 | |
7350 | /* Control reaches here for the various types of "no match at this point" |
7351 | result. Reset the code to MATCH_NOMATCH for subsequent checking. */ |
7352 | |
7353 | rc = MATCH_NOMATCH; |
7354 | |
7355 | /* If PCRE2_FIRSTLINE is set, the match must happen before or at the first |
7356 | newline in the subject (though it may continue over the newline). Therefore, |
7357 | if we have just failed to match, starting at a newline, do not continue. */ |
7358 | |
7359 | if (firstline && IS_NEWLINE(start_match)) break; |
7360 | |
7361 | /* Advance to new matching position */ |
7362 | |
7363 | start_match = new_start_match; |
7364 | |
7365 | /* Break the loop if the pattern is anchored or if we have passed the end of |
7366 | the subject. */ |
7367 | |
7368 | if (anchored || start_match > end_subject) break; |
7369 | |
7370 | /* If we have just passed a CR and we are now at a LF, and the pattern does |
7371 | not contain any explicit matches for \r or \n, and the newline option is CRLF |
7372 | or ANY or ANYCRLF, advance the match position by one more code unit. In |
7373 | normal matching start_match will always be greater than the first position at |
7374 | this stage, but a failed *SKIP can cause a return at the same point, which is |
7375 | why the first test exists. */ |
7376 | |
7377 | if (start_match > subject + start_offset && |
7378 | start_match[-1] == CHAR_CR && |
7379 | start_match < end_subject && |
7380 | *start_match == CHAR_NL && |
7381 | (re->flags & PCRE2_HASCRORLF) == 0 && |
7382 | (mb->nltype == NLTYPE_ANY || |
7383 | mb->nltype == NLTYPE_ANYCRLF || |
7384 | mb->nllen == 2)) |
7385 | start_match++; |
7386 | |
7387 | mb->mark = NULL; /* Reset for start of next match attempt */ |
7388 | } /* End of for(;;) "bumpalong" loop */ |
7389 | |
7390 | /* ==========================================================================*/ |
7391 | |
7392 | /* When we reach here, one of the following stopping conditions is true: |
7393 | |
7394 | (1) The match succeeded, either completely, or partially; |
7395 | |
7396 | (2) The pattern is anchored or the match was failed after (*COMMIT); |
7397 | |
7398 | (3) We are past the end of the subject or the bumpalong limit; |
7399 | |
7400 | (4) PCRE2_FIRSTLINE is set and we have failed to match at a newline, because |
7401 | this option requests that a match occur at or before the first newline in |
7402 | the subject. |
7403 | |
7404 | (5) Some kind of error occurred. |
7405 | |
7406 | */ |
7407 | |
7408 | ENDLOOP: |
7409 | |
7410 | /* If end_subject != true_end_subject, it means we are handling invalid UTF, |
7411 | and have just processed a non-terminal fragment. If this resulted in no match |
7412 | or a partial match we must carry on to the next fragment (a partial match is |
7413 | returned to the caller only at the very end of the subject). A loop is used to |
7414 | avoid trying to match against empty fragments; if the pattern can match an |
7415 | empty string it would have done so already. */ |
7416 | |
7417 | #ifdef SUPPORT_UNICODE |
7418 | if (utf && end_subject != true_end_subject && |
7419 | (rc == MATCH_NOMATCH || rc == PCRE2_ERROR_PARTIAL)) |
7420 | { |
7421 | for (;;) |
7422 | { |
7423 | /* Advance past the first bad code unit, and then skip invalid character |
7424 | starting code units in 8-bit and 16-bit modes. */ |
7425 | |
7426 | start_match = end_subject + 1; |
7427 | |
7428 | #if PCRE2_CODE_UNIT_WIDTH != 32 |
7429 | while (start_match < true_end_subject && NOT_FIRSTCU(*start_match)) |
7430 | start_match++; |
7431 | #endif |
7432 | |
7433 | /* If we have hit the end of the subject, there isn't another non-empty |
7434 | fragment, so give up. */ |
7435 | |
7436 | if (start_match >= true_end_subject) |
7437 | { |
7438 | rc = MATCH_NOMATCH; /* In case it was partial */ |
7439 | break; |
7440 | } |
7441 | |
7442 | /* Check the rest of the subject */ |
7443 | |
7444 | mb->check_subject = start_match; |
7445 | rc = PRIV(valid_utf)(start_match, length - (start_match - subject), |
7446 | &(match_data->startchar)); |
7447 | |
7448 | /* The rest of the subject is valid UTF. */ |
7449 | |
7450 | if (rc == 0) |
7451 | { |
7452 | mb->end_subject = end_subject = true_end_subject; |
7453 | fragment_options = PCRE2_NOTBOL; |
7454 | goto FRAGMENT_RESTART; |
7455 | } |
7456 | |
7457 | /* A subsequent UTF error has been found; if the next fragment is |
7458 | non-empty, set up to process it. Otherwise, let the loop advance. */ |
7459 | |
7460 | else if (rc < 0) |
7461 | { |
7462 | mb->end_subject = end_subject = start_match + match_data->startchar; |
7463 | if (end_subject > start_match) |
7464 | { |
7465 | fragment_options = PCRE2_NOTBOL|PCRE2_NOTEOL; |
7466 | goto FRAGMENT_RESTART; |
7467 | } |
7468 | } |
7469 | } |
7470 | } |
7471 | #endif /* SUPPORT_UNICODE */ |
7472 | |
7473 | /* Fill in fields that are always returned in the match data. */ |
7474 | |
7475 | match_data->code = re; |
7476 | match_data->mark = mb->mark; |
7477 | match_data->matchedby = PCRE2_MATCHEDBY_INTERPRETER; |
7478 | |
7479 | /* Handle a fully successful match. Set the return code to the number of |
7480 | captured strings, or 0 if there were too many to fit into the ovector, and then |
7481 | set the remaining returned values before returning. Make a copy of the subject |
7482 | string if requested. */ |
7483 | |
7484 | if (rc == MATCH_MATCH) |
7485 | { |
7486 | match_data->rc = ((int)mb->end_offset_top >= 2 * match_data->oveccount)? |
7487 | 0 : (int)mb->end_offset_top/2 + 1; |
7488 | match_data->startchar = start_match - subject; |
7489 | match_data->leftchar = mb->start_used_ptr - subject; |
7490 | match_data->rightchar = ((mb->last_used_ptr > mb->end_match_ptr)? |
7491 | mb->last_used_ptr : mb->end_match_ptr) - subject; |
7492 | if ((options & PCRE2_COPY_MATCHED_SUBJECT) != 0) |
7493 | { |
7494 | length = CU2BYTES(length + was_zero_terminated); |
7495 | match_data->subject = match_data->memctl.malloc(length, |
7496 | match_data->memctl.memory_data); |
7497 | if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY; |
7498 | memcpy((void *)match_data->subject, subject, length); |
7499 | match_data->flags |= PCRE2_MD_COPIED_SUBJECT; |
7500 | } |
7501 | else match_data->subject = subject; |
7502 | return match_data->rc; |
7503 | } |
7504 | |
7505 | /* Control gets here if there has been a partial match, an error, or if the |
7506 | overall match attempt has failed at all permitted starting positions. Any mark |
7507 | data is in the nomatch_mark field. */ |
7508 | |
7509 | match_data->mark = mb->nomatch_mark; |
7510 | |
7511 | /* For anything other than nomatch or partial match, just return the code. */ |
7512 | |
7513 | if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL) match_data->rc = rc; |
7514 | |
7515 | /* Handle a partial match. If a "soft" partial match was requested, searching |
7516 | for a complete match will have continued, and the value of rc at this point |
7517 | will be MATCH_NOMATCH. For a "hard" partial match, it will already be |
7518 | PCRE2_ERROR_PARTIAL. */ |
7519 | |
7520 | else if (match_partial != NULL) |
7521 | { |
7522 | match_data->subject = subject; |
7523 | match_data->ovector[0] = match_partial - subject; |
7524 | match_data->ovector[1] = end_subject - subject; |
7525 | match_data->startchar = match_partial - subject; |
7526 | match_data->leftchar = start_partial - subject; |
7527 | match_data->rightchar = end_subject - subject; |
7528 | match_data->rc = PCRE2_ERROR_PARTIAL; |
7529 | } |
7530 | |
7531 | /* Else this is the classic nomatch case. */ |
7532 | |
7533 | else match_data->rc = PCRE2_ERROR_NOMATCH; |
7534 | |
7535 | return match_data->rc; |
7536 | } |
7537 | |
7538 | /* These #undefs are here to enable unity builds with CMake: in a unity build several source files are compiled as a single translation unit, so macros left defined at the end of this file would still be visible in the files that follow. */ |
7539 | |
7540 | #undef NLBLOCK /* Block containing newline information */ |
7541 | #undef PSSTART /* Field containing processed string start */ |
7542 | #undef PSEND /* Field containing processed string end */ |
7543 | |
7544 | /* End of pcre2_match.c */ |
7545 | |