1/*************************************************
2* Perl-Compatible Regular Expressions *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2015-2022 University of Cambridge
11
12-----------------------------------------------------------------------------
13Redistribution and use in source and binary forms, with or without
14modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE.
38-----------------------------------------------------------------------------
39*/
40
41
42#ifdef HAVE_CONFIG_H
43#include "config.h"
44#endif
45
46/* These defines enable debugging code */
47
48/* #define DEBUG_FRAMES_DISPLAY */
49/* #define DEBUG_SHOW_OPS */
50/* #define DEBUG_SHOW_RMATCH */
51
52#ifdef DEBUG_FRAMES_DISPLAY
53#include <stdarg.h>
54#endif
55
/* Name the block that holds the "static" information in this file, and the
fields within it that mark the two ends of the processed string. */

#define NLBLOCK  mb              /* The block with the newline information */
#define PSSTART  start_subject   /* Its field for the processed string start */
#define PSEND    end_subject     /* Its field for the processed string end */
62
63#include "pcre2_internal.h"
64
/* Value that marks "not inside a recursion"; it is bigger than the maximum
group number. */

#define RECURSE_UNSET 0xFFFFFFFFu

/* Mask of the public options that are permitted at match time. */

#define PUBLIC_MATCH_OPTIONS ( \
  PCRE2_ANCHORED | \
  PCRE2_ENDANCHORED | \
  PCRE2_NOTBOL | \
  PCRE2_NOTEOL | \
  PCRE2_NOTEMPTY | \
  PCRE2_NOTEMPTY_ATSTART | \
  PCRE2_NO_UTF_CHECK | \
  PCRE2_PARTIAL_HARD | \
  PCRE2_PARTIAL_SOFT | \
  PCRE2_NO_JIT | \
  PCRE2_COPY_MATCHED_SUBJECT)

/* The corresponding mask for JIT matching (going by its name; its use is not
in this part of the file). */

#define PUBLIC_JIT_MATCH_OPTIONS ( \
  PCRE2_NO_UTF_CHECK | \
  PCRE2_NOTBOL | \
  PCRE2_NOTEOL | \
  PCRE2_NOTEMPTY | \
  PCRE2_NOTEMPTY_ATSTART | \
  PCRE2_PARTIAL_SOFT | \
  PCRE2_PARTIAL_HARD | \
  PCRE2_COPY_MATCHED_SUBJECT)
78
/* Results from and within match() that are not errors. Errors use the
externally defined PCRE2_ERROR_xxx codes, all of which are negative. */

#define MATCH_NOMATCH        0
#define MATCH_MATCH          1

/* Results that are used only inside match(). They are made sufficiently
negative to keep clear of the external error codes. */

#define MATCH_ACCEPT         (-999)
#define MATCH_KETRPOS        (-998)

/* The five values that follow must stay together and in this order, so that a
test for "any one of them" can be a range check between MATCH_BACKTRACK_MIN
and MATCH_BACKTRACK_MAX. */

#define MATCH_COMMIT         (-997)
#define MATCH_PRUNE          (-996)
#define MATCH_SKIP           (-995)
#define MATCH_SKIP_ARG       (-994)
#define MATCH_THEN           (-993)

#define MATCH_BACKTRACK_MIN  MATCH_COMMIT
#define MATCH_BACKTRACK_MAX  MATCH_THEN
99
/* Values for the type of a group frame; zero means that a frame is not a group
frame. The identity is held above the lower 16 bits, which are used for data
(e.g. the capture number). Most groups get a group frame, so that information
about the start of the group is easily available at its end without having to
scan back through the intermediate frames (backtrack points). */

#define GF_CAPTURE     (1u << 16)
#define GF_NOCAPTURE   (2u << 16)
#define GF_CONDASSERT  (3u << 16)
#define GF_RECURSE     (4u << 16)

/* Extract the identity part and the data part of a group frame type. */

#define GF_IDMASK(gft)    ((gft) & 0xffff0000u)
#define GF_DATAMASK(gft)  ((gft) & 0x0000ffffu)
115
/* Repetition types. In rep_typ below, the plain quantifiers are REPTYPE_MAX,
the ones with a '?' suffix are REPTYPE_MIN, and the OP_CRPOSxxx ones are
REPTYPE_POS. */

enum {
  REPTYPE_MIN,
  REPTYPE_MAX,
  REPTYPE_POS
};

/* Minimum and maximum counts for the common repeats. A maximum of UINT32_MAX
stands for infinity. */

static const uint32_t rep_min[] = {
  0, 0,         /* * and *? */
  1, 1,         /* + and +? */
  0, 0,         /* ? and ?? */
  0, 0,         /* Unused slots that stand in for OP_CR[MIN]RANGE */
  0,            /* OP_CRPOSSTAR */
  1,            /* OP_CRPOSPLUS */
  0             /* OP_CRPOSQUERY */
};

static const uint32_t rep_max[] = {
  UINT32_MAX, UINT32_MAX,     /* * and *? */
  UINT32_MAX, UINT32_MAX,     /* + and +? */
  1, 1,                       /* ? and ?? */
  0, 0,                       /* Unused slots that stand in for OP_CR[MIN]RANGE */
  UINT32_MAX,                 /* OP_CRPOSSTAR */
  UINT32_MAX,                 /* OP_CRPOSPLUS */
  1                           /* OP_CRPOSQUERY */
};

/* The type of each repeat. Unlike the two tables above, this one needs an
entry for OP_CRPOSRANGE as well. */

static const uint32_t rep_typ[] = {
  REPTYPE_MAX, REPTYPE_MIN,   /* * and *? */
  REPTYPE_MAX, REPTYPE_MIN,   /* + and +? */
  REPTYPE_MAX, REPTYPE_MIN,   /* ? and ?? */
  REPTYPE_MAX, REPTYPE_MIN,   /* OP_CRRANGE and OP_CRMINRANGE */
  REPTYPE_POS,                /* OP_CRPOSSTAR */
  REPTYPE_POS,                /* OP_CRPOSPLUS */
  REPTYPE_POS,                /* OP_CRPOSQUERY */
  REPTYPE_POS                 /* OP_CRPOSRANGE */
};
146
/* Identifying numbers for the RMATCH() calls at backtracking points. Whenever
one of these lists is changed, the code at RETURN_SWITCH below must be changed
to match. */

enum {
  RM1 = 1,
        RM2,  RM3,  RM4,  RM5,  RM6,
  RM7,  RM8,  RM9,  RM10, RM11, RM12,
  RM13, RM14, RM15, RM16, RM17, RM18,
  RM19, RM20, RM21, RM22, RM23, RM24,
  RM25, RM26, RM27, RM28, RM29, RM30,
  RM31, RM32, RM33, RM34, RM35, RM36
};

#ifdef SUPPORT_WIDE_CHARS
enum {
  RM100 = 100,
  RM101
};
#endif

#ifdef SUPPORT_UNICODE
enum {
  RM200 = 200,
         RM201, RM202, RM203, RM204,
  RM205, RM206, RM207, RM208, RM209,
  RM210, RM211, RM212, RM213, RM214,
  RM215, RM216, RM217, RM218, RM219,
  RM220, RM221, RM222, RM223, RM224,
  RM225
};
#endif
165
/* Short names for the general fields of the current backtrack frame, which the
variable F always points to. The occasional reference to a field of some other
frame is written out in full. The current frame also has some fields with names
that start with "temp", used for short-term, localised backtracking memory;
those get Lxxx names that are #defined where they are used and undefined again
afterwards. */

#define Fback_frame         F->back_frame
#define Fcapture_last       F->capture_last
#define Fcurrent_recurse    F->current_recurse
#define Fecode              F->ecode
#define Feptr               F->eptr
#define Fgroup_frame_type   F->group_frame_type
#define Flast_group_offset  F->last_group_offset
#define Flength             F->length
#define Fmark               F->mark
#define Foccu               F->occu
#define Foffset_top         F->offset_top
#define Fop                 F->op
#define Fovector            F->ovector
#define Frdepth             F->rdepth
#define Freturn_id          F->return_id
#define Fstart_match        F->start_match
190
191#ifdef DEBUG_FRAMES_DISPLAY
192/*************************************************
193* Display current frames and contents *
194*************************************************/
195
196/* This debugging function displays the current set of frames and their
197contents. It is not called automatically from anywhere, the intention being
198that calls can be inserted where necessary when debugging frame-related
199problems.
200
201Arguments:
202 f the file to write to
203 F the current top frame
204 P a previous frame of interest
205 frame_size the frame size
206 mb points to the match block
207 match_data points to the match data block
208 s identification text
209
210Returns: nothing
211*/
212
213static void
214display_frames(FILE *f, heapframe *F, heapframe *P, PCRE2_SIZE frame_size,
215 match_block *mb, pcre2_match_data *match_data, const char *s, ...)
216{
217uint32_t i;
218heapframe *Q;
219va_list ap;
220va_start(ap, s);
221
222fprintf(f, "FRAMES ");
223vfprintf(f, s, ap);
224va_end(ap);
225
226if (P != NULL) fprintf(f, " P=%lu",
227 ((char *)P - (char *)(match_data->heapframes))/frame_size);
228fprintf(f, "\n");
229
230for (i = 0, Q = match_data->heapframes;
231 Q <= F;
232 i++, Q = (heapframe *)((char *)Q + frame_size))
233 {
234 fprintf(f, "Frame %d type=%x subj=%lu code=%d back=%lu id=%d",
235 i, Q->group_frame_type, Q->eptr - mb->start_subject, *(Q->ecode),
236 Q->back_frame, Q->return_id);
237
238 if (Q->last_group_offset == PCRE2_UNSET)
239 fprintf(f, " lgoffset=unset\n");
240 else
241 fprintf(f, " lgoffset=%lu\n", Q->last_group_offset/frame_size);
242 }
243}
244
245#endif
246
247
248
249/*************************************************
250* Process a callout *
251*************************************************/
252
253/* This function is called for all callouts, whether "standalone" or at the
254start of a conditional group. Feptr will be pointing to either OP_CALLOUT or
255OP_CALLOUT_STR. A callout block is allocated in pcre2_match() and initialized
256with fixed values.
257
258Arguments:
259 F points to the current backtracking frame
260 mb points to the match block
261 lengthptr where to return the length of the callout item
262
263Returns: the return from the callout
264 or 0 if no callout function exists
265*/
266
267static int
268do_callout(heapframe *F, match_block *mb, PCRE2_SIZE *lengthptr)
269{
270int rc;
271PCRE2_SIZE save0, save1;
272PCRE2_SIZE *callout_ovector;
273pcre2_callout_block *cb;
274
275*lengthptr = (*Fecode == OP_CALLOUT)?
276 PRIV(OP_lengths)[OP_CALLOUT] : GET(Fecode, 1 + 2*LINK_SIZE);
277
278if (mb->callout == NULL) return 0; /* No callout function provided */
279
280/* The original matching code (pre 10.30) worked directly with the ovector
281passed by the user, and this was passed to callouts. Now that the working
282ovector is in the backtracking frame, it no longer needs to reserve space for
283the overall match offsets (which would waste space in the frame). For backward
284compatibility, however, we pass capture_top and offset_vector to the callout as
285if for the extended ovector, and we ensure that the first two slots are unset
286by preserving and restoring their current contents. Picky compilers complain if
287references such as Fovector[-2] are use directly, so we set up a separate
288pointer. */
289
290callout_ovector = (PCRE2_SIZE *)(Fovector) - 2;
291
292/* The cb->version, cb->subject, cb->subject_length, and cb->start_match fields
293are set externally. The first 3 never change; the last is updated for each
294bumpalong. */
295
296cb = mb->cb;
297cb->capture_top = (uint32_t)Foffset_top/2 + 1;
298cb->capture_last = Fcapture_last;
299cb->offset_vector = callout_ovector;
300cb->mark = mb->nomatch_mark;
301cb->current_position = (PCRE2_SIZE)(Feptr - mb->start_subject);
302cb->pattern_position = GET(Fecode, 1);
303cb->next_item_length = GET(Fecode, 1 + LINK_SIZE);
304
305if (*Fecode == OP_CALLOUT) /* Numerical callout */
306 {
307 cb->callout_number = Fecode[1 + 2*LINK_SIZE];
308 cb->callout_string_offset = 0;
309 cb->callout_string = NULL;
310 cb->callout_string_length = 0;
311 }
312else /* String callout */
313 {
314 cb->callout_number = 0;
315 cb->callout_string_offset = GET(Fecode, 1 + 3*LINK_SIZE);
316 cb->callout_string = Fecode + (1 + 4*LINK_SIZE) + 1;
317 cb->callout_string_length =
318 *lengthptr - (1 + 4*LINK_SIZE) - 2;
319 }
320
321save0 = callout_ovector[0];
322save1 = callout_ovector[1];
323callout_ovector[0] = callout_ovector[1] = PCRE2_UNSET;
324rc = mb->callout(cb, mb->callout_data);
325callout_ovector[0] = save0;
326callout_ovector[1] = save1;
327cb->callout_flags = 0;
328return rc;
329}
330
331
332
333/*************************************************
334* Match a back-reference *
335*************************************************/
336
337/* This function is called only when it is known that the offset lies within
338the offsets that have so far been used in the match. Note that in caseless
339UTF-8 mode, the number of subject bytes matched may be different to the number
340of reference bytes. (In theory this could also happen in UTF-16 mode, but it
341seems unlikely.)
342
343Arguments:
344 offset index into the offset vector
345 caseless TRUE if caseless
346 F the current backtracking frame pointer
347 mb points to match block
348 lengthptr pointer for returning the length matched
349
350Returns: = 0 sucessful match; number of code units matched is set
351 < 0 no match
352 > 0 partial match
353*/
354
355static int
356match_ref(PCRE2_SIZE offset, BOOL caseless, heapframe *F, match_block *mb,
357 PCRE2_SIZE *lengthptr)
358{
359PCRE2_SPTR p;
360PCRE2_SIZE length;
361PCRE2_SPTR eptr;
362PCRE2_SPTR eptr_start;
363
364/* Deal with an unset group. The default is no match, but there is an option to
365match an empty string. */
366
367if (offset >= Foffset_top || Fovector[offset] == PCRE2_UNSET)
368 {
369 if ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
370 {
371 *lengthptr = 0;
372 return 0; /* Match */
373 }
374 else return -1; /* No match */
375 }
376
377/* Separate the caseless and UTF cases for speed. */
378
379eptr = eptr_start = Feptr;
380p = mb->start_subject + Fovector[offset];
381length = Fovector[offset+1] - Fovector[offset];
382
383if (caseless)
384 {
385#if defined SUPPORT_UNICODE
386 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
387
388 if (utf || (mb->poptions & PCRE2_UCP) != 0)
389 {
390 PCRE2_SPTR endptr = p + length;
391
392 /* Match characters up to the end of the reference. NOTE: the number of
393 code units matched may differ, because in UTF-8 there are some characters
394 whose upper and lower case codes have different numbers of bytes. For
395 example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 (3
396 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
397 sequence of two of the latter. It is important, therefore, to check the
398 length along the reference, not along the subject (earlier code did this
399 wrong). UCP without uses Unicode properties but without UTF encoding. */
400
401 while (p < endptr)
402 {
403 uint32_t c, d;
404 const ucd_record *ur;
405 if (eptr >= mb->end_subject) return 1; /* Partial match */
406
407 if (utf)
408 {
409 GETCHARINC(c, eptr);
410 GETCHARINC(d, p);
411 }
412 else
413 {
414 c = *eptr++;
415 d = *p++;
416 }
417
418 ur = GET_UCD(d);
419 if (c != d && c != (uint32_t)((int)d + ur->other_case))
420 {
421 const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset;
422 for (;;)
423 {
424 if (c < *pp) return -1; /* No match */
425 if (c == *pp++) break;
426 }
427 }
428 }
429 }
430 else
431#endif
432
433 /* Not in UTF or UCP mode */
434 {
435 for (; length > 0; length--)
436 {
437 uint32_t cc, cp;
438 if (eptr >= mb->end_subject) return 1; /* Partial match */
439 cc = UCHAR21TEST(eptr);
440 cp = UCHAR21TEST(p);
441 if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc))
442 return -1; /* No match */
443 p++;
444 eptr++;
445 }
446 }
447 }
448
449/* In the caseful case, we can just compare the code units, whether or not we
450are in UTF and/or UCP mode. When partial matching, we have to do this unit by
451unit. */
452
453else
454 {
455 if (mb->partial != 0)
456 {
457 for (; length > 0; length--)
458 {
459 if (eptr >= mb->end_subject) return 1; /* Partial match */
460 if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /* No match */
461 }
462 }
463
464 /* Not partial matching */
465
466 else
467 {
468 if ((PCRE2_SIZE)(mb->end_subject - eptr) < length) return 1; /* Partial */
469 if (memcmp(p, eptr, CU2BYTES(length)) != 0) return -1; /* No match */
470 eptr += length;
471 }
472 }
473
474*lengthptr = eptr - eptr_start;
475return 0; /* Match */
476}
477
478
479
480/******************************************************************************
481*******************************************************************************
482 "Recursion" in the match() function
483
484The original match() function was highly recursive, but this proved to be the
485source of a number of problems over the years, mostly because of the relatively
486small system stacks that are commonly found. As new features were added to
487patterns, various kludges were invented to reduce the amount of stack used,
488making the code hard to understand in places.
489
490A version did exist that used individual frames on the heap instead of calling
491match() recursively, but this ran substantially slower. The current version is
492a refactoring that uses a vector of frames to remember backtracking points.
493This runs no slower, and possibly even a bit faster than the original recursive
494implementation.
495
496At first, an initial vector of size START_FRAMES_SIZE (enough for maybe 50
497frames) was allocated on the system stack. If this was not big enough, the heap
498was used for a larger vector. However, it turns out that there are environments
499where taking as little as 20KiB from the system stack is an embarrassment.
After another refactoring, the heap is used exclusively, but a pointer to the
frames vector and its size are cached in the match_data block, so that there is
no new memory allocation if the same match_data block is used for multiple
matches (unless the frames vector has to be extended).
504*******************************************************************************
505******************************************************************************/
506
507
508
509
510/*************************************************
511* Macros for the match() function *
512*************************************************/
513
/* These macros pack up tests that are used for partial matching several times
in the code. The second one is used when we already know we are past the end of
the subject. We set the "hit end" flag if the pointer is at the end of the
subject and either (a) the pointer is past the earliest inspected character
(i.e. something has been matched, even if not part of the actual matched
string), or (b) the pattern contains a lookbehind. These are the conditions for
which adding more characters may allow the current match to continue.

For hard partial matching, we immediately return a partial match. Otherwise,
carrying on means that a complete match on the current subject will be sought.
A partial match is returned only if no complete match can be found.

Both macros expand to a bare "if" statement, not to an expression, and both
refer to Feptr (that is, F->eptr) and mb, which must be in scope wherever they
are used. The expansion can execute "return PCRE2_ERROR_PARTIAL" in the
function that contains it. */

/* Apply SCHECK_PARTIAL() only if the current subject pointer is at or beyond
the end of the subject. */

#define CHECK_PARTIAL()\
 if (Feptr >= mb->end_subject) \
 { \
 SCHECK_PARTIAL(); \
 }

/* If partial matching is enabled (mb->partial is not zero), and either Feptr
is beyond mb->start_used_ptr or mb->allowemptypartial is set, set mb->hitend.
Then, if mb->partial is greater than 1, return PCRE2_ERROR_PARTIAL at once. */

#define SCHECK_PARTIAL()\
 if (mb->partial != 0 && \
 (Feptr > mb->start_used_ptr || mb->allowemptypartial)) \
 { \
 mb->hitend = TRUE; \
 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \
 }


/* These macros are used to implement backtracking. They simulate a recursive
call to the match() function by means of a local vector of frames which
remember the backtracking points. */

/* RMATCH(ra,rb) sets start_ecode to ra, records rb as the return id of the
current frame, and jumps to MATCH_RECURSE. It also defines the label L_<rb>
immediately after the jump. Because rb is pasted into the label name, it must be
given as a literal identifier (one of the RMnn names), and each identifier can
be used in only one RMATCH() within a function. NOTE(review): the code that
jumps back to the L_<rb> labels is not in this part of the file; presumably it
is at RETURN_SWITCH and selects the label from the saved return id. */

#define RMATCH(ra,rb)\
 {\
 start_ecode = ra;\
 Freturn_id = rb;\
 goto MATCH_RECURSE;\
 L_##rb:;\
 }

/* RRETURN(ra) stores ra in rrc and jumps to RETURN_SWITCH. */

#define RRETURN(ra)\
 {\
 rrc = ra;\
 goto RETURN_SWITCH;\
 }
558
559
560
561/*************************************************
562* Match from current position *
563*************************************************/
564
565/* This function is called to run one match attempt at a single starting point
566in the subject.
567
568Performance note: It might be tempting to extract commonly used fields from the
569mb structure (e.g. end_subject) into individual variables to improve
570performance. Tests using gcc on a SPARC disproved this; in the first case, it
571made performance worse.
572
573Arguments:
574 start_eptr starting character in subject
575 start_ecode starting position in compiled code
576 top_bracket number of capturing parentheses in the pattern
577 frame_size size of each backtracking frame
578 match_data pointer to the match_data block
579 mb pointer to "static" variables block
580
581Returns: MATCH_MATCH if matched ) these values are >= 0
582 MATCH_NOMATCH if failed to match )
583 negative MATCH_xxx value for PRUNE, SKIP, etc
584 negative PCRE2_ERROR_xxx value if aborted by an error condition
585 (e.g. stopped by repeated call or depth limit)
586*/
587
588static int
589match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket,
590 PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb)
591{
592/* Frame-handling variables */
593
594heapframe *F; /* Current frame pointer */
595heapframe *N = NULL; /* Temporary frame pointers */
596heapframe *P = NULL;
597
598heapframe *frames_top; /* End of frames vector */
599heapframe *assert_accept_frame = NULL; /* For passing back a frame with captures */
600PCRE2_SIZE heapframes_size; /* Usable size of frames vector */
601PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */
602
/* Local variables that do not need to be preserved over calls to RMATCH(). */
604
605PCRE2_SPTR bracode; /* Temp pointer to start of group */
606PCRE2_SIZE offset; /* Used for group offsets */
607PCRE2_SIZE length; /* Used for various length calculations */
608
609int rrc; /* Return from functions & backtracking "recursions" */
610#ifdef SUPPORT_UNICODE
611int proptype; /* Type of character property */
612#endif
613
614uint32_t i; /* Used for local loops */
615uint32_t fc; /* Character values */
616uint32_t number; /* Used for group and other numbers */
617uint32_t reptype = 0; /* Type of repetition (0 to avoid compiler warning) */
618uint32_t group_frame_type; /* Specifies type for new group frames */
619
620BOOL condition; /* Used in conditional groups */
621BOOL cur_is_word; /* Used in "word" tests */
622BOOL prev_is_word; /* Used in "word" tests */
623
624/* UTF and UCP flags */
625
626#ifdef SUPPORT_UNICODE
627BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
628BOOL ucp = (mb->poptions & PCRE2_UCP) != 0;
629#else
630BOOL utf = FALSE; /* Required for convenience even when no Unicode support */
631#endif
632
633/* This is the length of the last part of a backtracking frame that must be
634copied when a new frame is created. */
635
636frame_copy_size = frame_size - offsetof(heapframe, eptr);
637
/* Set up the first frame and the end of the frames vector. We set the local
heapframes_size to the usable amount of the vector, that is, a whole number of
frames. */
641
642F = match_data->heapframes;
643heapframes_size = (match_data->heapframes_size / frame_size) * frame_size;
644frames_top = (heapframe *)((char *)F + heapframes_size);
645
646Frdepth = 0; /* "Recursion" depth */
647Fcapture_last = 0; /* Number of most recent capture */
648Fcurrent_recurse = RECURSE_UNSET; /* Not pattern recursing. */
649Fstart_match = Feptr = start_eptr; /* Current data pointer and start match */
650Fmark = NULL; /* Most recent mark */
651Foffset_top = 0; /* End of captures within the frame */
652Flast_group_offset = PCRE2_UNSET; /* Saved frame of most recent group */
653group_frame_type = 0; /* Not a start of group frame */
654goto NEW_FRAME; /* Start processing with this frame */
655
656/* Come back here when we want to create a new frame for remembering a
657backtracking point. */
658
659MATCH_RECURSE:
660
661/* Set up a new backtracking frame. If the vector is full, get a new one,
662doubling the size, but constrained by the heap limit (which is in KiB). */
663
664N = (heapframe *)((char *)F + frame_size);
665if (N >= frames_top)
666 {
667 heapframe *new;
668 PCRE2_SIZE newsize = match_data->heapframes_size * 2;
669
670 if (newsize > mb->heap_limit)
671 {
672 PCRE2_SIZE maxsize = (mb->heap_limit/frame_size) * frame_size;
673 if (match_data->heapframes_size >= maxsize) return PCRE2_ERROR_HEAPLIMIT;
674 newsize = maxsize;
675 }
676
677 new = match_data->memctl.malloc(newsize, match_data->memctl.memory_data);
678 if (new == NULL) return PCRE2_ERROR_NOMEMORY;
679 memcpy(new, match_data->heapframes, heapframes_size);
680
681 F = (heapframe *)((char *)new + ((char *)F - (char *)match_data->heapframes));
682 N = (heapframe *)((char *)F + frame_size);
683
684 match_data->memctl.free(match_data->heapframes, match_data->memctl.memory_data);
685 match_data->heapframes = new;
686 match_data->heapframes_size = newsize;
687
688 heapframes_size = (newsize / frame_size) * frame_size;
689 frames_top = (heapframe *)((char *)new + heapframes_size);
690 }
691
692#ifdef DEBUG_SHOW_RMATCH
693fprintf(stderr, "++ RMATCH %2d frame=%d", Freturn_id, Frdepth + 1);
694if (group_frame_type != 0)
695 {
696 fprintf(stderr, " type=%x ", group_frame_type);
697 switch (GF_IDMASK(group_frame_type))
698 {
699 case GF_CAPTURE:
700 fprintf(stderr, "capture=%d", GF_DATAMASK(group_frame_type));
701 break;
702
703 case GF_NOCAPTURE:
704 fprintf(stderr, "nocapture op=%d", GF_DATAMASK(group_frame_type));
705 break;
706
707 case GF_CONDASSERT:
708 fprintf(stderr, "condassert op=%d", GF_DATAMASK(group_frame_type));
709 break;
710
711 case GF_RECURSE:
712 fprintf(stderr, "recurse=%d", GF_DATAMASK(group_frame_type));
713 break;
714
715 default:
716 fprintf(stderr, "*** unknown ***");
717 break;
718 }
719 }
720fprintf(stderr, "\n");
721#endif
722
723/* Copy those fields that must be copied into the new frame, increase the
724"recursion" depth (i.e. the new frame's index) and then make the new frame
725current. */
726
727memcpy((char *)N + offsetof(heapframe, eptr),
728 (char *)F + offsetof(heapframe, eptr),
729 frame_copy_size);
730
731N->rdepth = Frdepth + 1;
732F = N;
733
734/* Carry on processing with a new frame. */
735
736NEW_FRAME:
737Fgroup_frame_type = group_frame_type;
738Fecode = start_ecode; /* Starting code pointer */
739Fback_frame = frame_size; /* Default is go back one frame */
740
741/* If this is a special type of group frame, remember its offset for quick
742access at the end of the group. If this is a recursion, set a new current
743recursion value. */
744
745if (group_frame_type != 0)
746 {
747 Flast_group_offset = (char *)F - (char *)match_data->heapframes;
748 if (GF_IDMASK(group_frame_type) == GF_RECURSE)
749 Fcurrent_recurse = GF_DATAMASK(group_frame_type);
750 group_frame_type = 0;
751 }
752
753
754/* ========================================================================= */
755/* This is the main processing loop. First check that we haven't recorded too
756many backtracks (search tree is too large), or that we haven't exceeded the
757recursive depth limit (used too many backtracking frames). If not, process the
758opcodes. */
759
760if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
761if (Frdepth >= mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
762
763for (;;)
764 {
765#ifdef DEBUG_SHOW_OPS
766fprintf(stderr, "++ op=%d\n", *Fecode);
767#endif
768
769 Fop = (uint8_t)(*Fecode); /* Cast needed for 16-bit and 32-bit modes */
770 switch(Fop)
771 {
772 /* ===================================================================== */
773 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, to close
774 any currently open capturing brackets. Unlike reaching the end of a group,
775 where we know the starting frame is at the top of the chained frames, in
776 this case we have to search back for the relevant frame in case other types
777 of group that use chained frames have intervened. Multiple OP_CLOSEs always
778 come innermost first, which matches the chain order. We can ignore this in
779 a recursion, because captures are not passed out of recursions. */
780
781 case OP_CLOSE:
782 if (Fcurrent_recurse == RECURSE_UNSET)
783 {
784 number = GET2(Fecode, 1);
785 offset = Flast_group_offset;
786 for(;;)
787 {
788 if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
789 N = (heapframe *)((char *)match_data->heapframes + offset);
790 P = (heapframe *)((char *)N - frame_size);
791 if (N->group_frame_type == (GF_CAPTURE | number)) break;
792 offset = P->last_group_offset;
793 }
794 offset = (number << 1) - 2;
795 Fcapture_last = number;
796 Fovector[offset] = P->eptr - mb->start_subject;
797 Fovector[offset+1] = Feptr - mb->start_subject;
798 if (offset >= Foffset_top) Foffset_top = offset + 2;
799 }
800 Fecode += PRIV(OP_lengths)[*Fecode];
801 break;
802
803
804 /* ===================================================================== */
805 /* Real or forced end of the pattern, assertion, or recursion. In an
806 assertion ACCEPT, update the last used pointer and remember the current
807 frame so that the captures and mark can be fished out of it. */
808
809 case OP_ASSERT_ACCEPT:
810 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
811 assert_accept_frame = F;
812 RRETURN(MATCH_ACCEPT);
813
814 /* If recursing, we have to find the most recent recursion. */
815
816 case OP_ACCEPT:
817 case OP_END:
818
819 /* Handle end of a recursion. */
820
821 if (Fcurrent_recurse != RECURSE_UNSET)
822 {
823 offset = Flast_group_offset;
824 for(;;)
825 {
826 if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
827 N = (heapframe *)((char *)match_data->heapframes + offset);
828 P = (heapframe *)((char *)N - frame_size);
829 if (GF_IDMASK(N->group_frame_type) == GF_RECURSE) break;
830 offset = P->last_group_offset;
831 }
832
833 /* N is now the frame of the recursion; the previous frame is at the
834 OP_RECURSE position. Go back there, copying the current subject position
835 and mark, and the start_match position (\K might have changed it), and
836 then move on past the OP_RECURSE. */
837
838 P->eptr = Feptr;
839 P->mark = Fmark;
840 P->start_match = Fstart_match;
841 F = P;
842 Fecode += 1 + LINK_SIZE;
843 continue;
844 }
845
846 /* Not a recursion. Fail for an empty string match if either PCRE2_NOTEMPTY
847 is set, or if PCRE2_NOTEMPTY_ATSTART is set and we have matched at the
848 start of the subject. In both cases, backtracking will then try other
849 alternatives, if any. */
850
851 if (Feptr == Fstart_match &&
852 ((mb->moptions & PCRE2_NOTEMPTY) != 0 ||
853 ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) != 0 &&
854 Fstart_match == mb->start_subject + mb->start_offset)))
855 RRETURN(MATCH_NOMATCH);
856
857 /* Also fail if PCRE2_ENDANCHORED is set and the end of the match is not
858 the end of the subject. After (*ACCEPT) we fail the entire match (at this
859 position) but backtrack on reaching the end of the pattern. */
860
861 if (Feptr < mb->end_subject &&
862 ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0)
863 {
864 if (Fop == OP_END) RRETURN(MATCH_NOMATCH);
865 return MATCH_NOMATCH;
866 }
867
868 /* We have a successful match of the whole pattern. Record the result and
869 then do a direct return from the function. If there is space in the offset
870 vector, set any pairs that follow the highest-numbered captured string but
871 are less than the number of capturing groups in the pattern to PCRE2_UNSET.
872 It is documented that this happens. "Gaps" are set to PCRE2_UNSET
873 dynamically. It is only those at the end that need setting here. */
874
875 mb->end_match_ptr = Feptr; /* Record where we ended */
876 mb->end_offset_top = Foffset_top; /* and how many extracts were taken */
877 mb->mark = Fmark; /* and the last success mark */
878 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
879
880 match_data->ovector[0] = Fstart_match - mb->start_subject;
881 match_data->ovector[1] = Feptr - mb->start_subject;
882
883 /* Set i to the smaller of the sizes of the external and frame ovectors. */
884
885 i = 2 * ((top_bracket + 1 > match_data->oveccount)?
886 match_data->oveccount : top_bracket + 1);
887 memcpy(match_data->ovector + 2, Fovector, (i - 2) * sizeof(PCRE2_SIZE));
888 while (--i >= Foffset_top + 2) match_data->ovector[i] = PCRE2_UNSET;
889 return MATCH_MATCH; /* Note: NOT RRETURN */
890
891
892 /*===================================================================== */
893 /* Match any single character type except newline; have to take care with
894 CRLF newlines and partial matching. */
895
896 case OP_ANY:
897 if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
898 if (mb->partial != 0 &&
899 Feptr == mb->end_subject - 1 &&
900 NLBLOCK->nltype == NLTYPE_FIXED &&
901 NLBLOCK->nllen == 2 &&
902 UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
903 {
904 mb->hitend = TRUE;
905 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
906 }
907 /* Fall through */
908
909 /* Match any single character whatsoever. */
910
911 case OP_ALLANY:
912 if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */
913 { /* not be updated before SCHECK_PARTIAL. */
914 SCHECK_PARTIAL();
915 RRETURN(MATCH_NOMATCH);
916 }
917 Feptr++;
918#ifdef SUPPORT_UNICODE
919 if (utf) ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
920#endif
921 Fecode++;
922 break;
923
924
925 /* ===================================================================== */
926 /* Match a single code unit, even in UTF mode. This opcode really does
927 match any code unit, even newline. (It really should be called ANYCODEUNIT,
928 of course - the byte name is from pre-16 bit days.) */
929
930 case OP_ANYBYTE:
931 if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */
932 { /* not be updated before SCHECK_PARTIAL. */
933 SCHECK_PARTIAL();
934 RRETURN(MATCH_NOMATCH);
935 }
936 Feptr++;
937 Fecode++;
938 break;
939
940
941 /* ===================================================================== */
942 /* Match a single character, casefully */
943
944 case OP_CHAR:
945#ifdef SUPPORT_UNICODE
946 if (utf)
947 {
948 Flength = 1;
949 Fecode++;
950 GETCHARLEN(fc, Fecode, Flength);
951 if (Flength > (PCRE2_SIZE)(mb->end_subject - Feptr))
952 {
953 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
954 RRETURN(MATCH_NOMATCH);
955 }
956 for (; Flength > 0; Flength--)
957 {
958 if (*Fecode++ != UCHAR21INC(Feptr)) RRETURN(MATCH_NOMATCH);
959 }
960 }
961 else
962#endif
963
964 /* Not UTF mode */
965 {
966 if (mb->end_subject - Feptr < 1)
967 {
968 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
969 RRETURN(MATCH_NOMATCH);
970 }
971 if (Fecode[1] != *Feptr++) RRETURN(MATCH_NOMATCH);
972 Fecode += 2;
973 }
974 break;
975
976
977 /* ===================================================================== */
978 /* Match a single character, caselessly. If we are at the end of the
979 subject, give up immediately. We get here only when the pattern character
980 has at most one other case. Characters with more than two cases are coded
981 as OP_PROP with the pseudo-property PT_CLIST. */
982
983 case OP_CHARI:
984 if (Feptr >= mb->end_subject)
985 {
986 SCHECK_PARTIAL();
987 RRETURN(MATCH_NOMATCH);
988 }
989
990#ifdef SUPPORT_UNICODE
991 if (utf)
992 {
993 Flength = 1;
994 Fecode++;
995 GETCHARLEN(fc, Fecode, Flength);
996
997 /* If the pattern character's value is < 128, we know that its other case
998 (if any) is also < 128 (and therefore only one code unit long in all
999 code-unit widths), so we can use the fast lookup table. We checked above
1000 that there is at least one character left in the subject. */
1001
1002 if (fc < 128)
1003 {
1004 uint32_t cc = UCHAR21(Feptr);
1005 if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
1006 Fecode++;
1007 Feptr++;
1008 }
1009
1010 /* Otherwise we must pick up the subject character and use Unicode
1011 property support to test its other case. Note that we cannot use the
1012 value of "Flength" to check for sufficient bytes left, because the other
1013 case of the character may have more or fewer code units. */
1014
1015 else
1016 {
1017 uint32_t dc;
1018 GETCHARINC(dc, Feptr);
1019 Fecode += Flength;
1020 if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
1021 }
1022 }
1023
1024 /* If UCP is set without UTF we must do the same as above, but with one
1025 character per code unit. */
1026
1027 else if (ucp)
1028 {
1029 uint32_t cc = UCHAR21(Feptr);
1030 fc = Fecode[1];
1031 if (fc < 128)
1032 {
1033 if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
1034 }
1035 else
1036 {
1037 if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
1038 }
1039 Feptr++;
1040 Fecode += 2;
1041 }
1042
1043 else
1044#endif /* SUPPORT_UNICODE */
1045
1046 /* Not UTF or UCP mode; use the table for characters < 256. */
1047 {
1048 if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1])
1049 != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH);
1050 Feptr++;
1051 Fecode += 2;
1052 }
1053 break;
1054
1055
1056 /* ===================================================================== */
1057 /* Match not a single character. */
1058
1059 case OP_NOT:
1060 case OP_NOTI:
1061 if (Feptr >= mb->end_subject)
1062 {
1063 SCHECK_PARTIAL();
1064 RRETURN(MATCH_NOMATCH);
1065 }
1066
1067#ifdef SUPPORT_UNICODE
1068 if (utf)
1069 {
1070 uint32_t ch;
1071 Fecode++;
1072 GETCHARINC(ch, Fecode);
1073 GETCHARINC(fc, Feptr);
1074 if (ch == fc)
1075 {
1076 RRETURN(MATCH_NOMATCH); /* Caseful match */
1077 }
1078 else if (Fop == OP_NOTI) /* If caseless */
1079 {
1080 if (ch > 127)
1081 ch = UCD_OTHERCASE(ch);
1082 else
1083 ch = (mb->fcc)[ch];
1084 if (ch == fc) RRETURN(MATCH_NOMATCH);
1085 }
1086 }
1087
1088 /* UCP without UTF is as above, but with one character per code unit. */
1089
1090 else if (ucp)
1091 {
1092 uint32_t ch;
1093 fc = UCHAR21INC(Feptr);
1094 ch = Fecode[1];
1095 Fecode += 2;
1096
1097 if (ch == fc)
1098 {
1099 RRETURN(MATCH_NOMATCH); /* Caseful match */
1100 }
1101 else if (Fop == OP_NOTI) /* If caseless */
1102 {
1103 if (ch > 127)
1104 ch = UCD_OTHERCASE(ch);
1105 else
1106 ch = (mb->fcc)[ch];
1107 if (ch == fc) RRETURN(MATCH_NOMATCH);
1108 }
1109 }
1110
1111 else
1112#endif /* SUPPORT_UNICODE */
1113
1114 /* Neither UTF nor UCP is set */
1115
1116 {
1117 uint32_t ch = Fecode[1];
1118 fc = UCHAR21INC(Feptr);
1119 if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc))
1120 RRETURN(MATCH_NOMATCH);
1121 Fecode += 2;
1122 }
1123 break;
1124
1125
1126 /* ===================================================================== */
1127 /* Match a single character repeatedly. */
1128
/* Working variables for the repeated-single-character opcodes. Each name
aliases a temporary slot in the current frame F, and all of them are #undef'd
again at the end of this section.

  Loclength    code-unit length of the other-case form (0 if not used)
  Lstart_eptr  subject position at which maximizing started
  Lcharptr     pointer to the pattern character in the compiled code
  Lmin, Lmax   repeat limits; Lmin is also used as a running count when
               minimizing
  Lc, Loc      the single-code-unit character and its other case */

#define Loclength    F->temp_size
#define Lstart_eptr  F->temp_sptr[0]
#define Lcharptr     F->temp_sptr[1]
#define Lmin         F->temp_32[0]
#define Lmax         F->temp_32[1]
#define Lc           F->temp_32[2]
#define Loc          F->temp_32[3]

    /* Exact count: reptype is not set here because the common code leaves via
    the "Lmin == Lmax" test before reptype is ever inspected. */

    case OP_EXACT:
    case OP_EXACTI:
    Lmin = Lmax = GET2(Fecode, 1);
    Fecode += 1 + IMM2_SIZE;
    goto REPEATCHAR;

    /* Bounded repeats {0,n}: possessive, greedy and lazy variants. The limit
    is read from the code immediately after the opcode. */

    case OP_POSUPTO:
    case OP_POSUPTOI:
    reptype = REPTYPE_POS;
    Lmin = 0;
    Lmax = GET2(Fecode, 1);
    Fecode += 1 + IMM2_SIZE;
    goto REPEATCHAR;

    case OP_UPTO:
    case OP_UPTOI:
    reptype = REPTYPE_MAX;
    Lmin = 0;
    Lmax = GET2(Fecode, 1);
    Fecode += 1 + IMM2_SIZE;
    goto REPEATCHAR;

    case OP_MINUPTO:
    case OP_MINUPTOI:
    reptype = REPTYPE_MIN;
    Lmin = 0;
    Lmax = GET2(Fecode, 1);
    Fecode += 1 + IMM2_SIZE;
    goto REPEATCHAR;

    /* Possessive *, + and ? */

    case OP_POSSTAR:
    case OP_POSSTARI:
    reptype = REPTYPE_POS;
    Lmin = 0;
    Lmax = UINT32_MAX;
    Fecode++;
    goto REPEATCHAR;

    case OP_POSPLUS:
    case OP_POSPLUSI:
    reptype = REPTYPE_POS;
    Lmin = 1;
    Lmax = UINT32_MAX;
    Fecode++;
    goto REPEATCHAR;

    case OP_POSQUERY:
    case OP_POSQUERYI:
    reptype = REPTYPE_POS;
    Lmin = 0;
    Lmax = 1;
    Fecode++;
    goto REPEATCHAR;

    /* Greedy and lazy *, + and ?. The opcode's offset from OP_STAR (caseful)
    or OP_STARI (caseless) indexes the rep_min, rep_max and rep_typ tables. */

    case OP_STAR:
    case OP_STARI:
    case OP_MINSTAR:
    case OP_MINSTARI:
    case OP_PLUS:
    case OP_PLUSI:
    case OP_MINPLUS:
    case OP_MINPLUSI:
    case OP_QUERY:
    case OP_QUERYI:
    case OP_MINQUERY:
    case OP_MINQUERYI:
    fc = *Fecode++ - ((Fop < OP_STARI)? OP_STAR : OP_STARI);
    Lmin = rep_min[fc];
    Lmax = rep_max[fc];
    reptype = rep_typ[fc];

    /* Common code for all repeated single-character matches. We first check
    for the minimum number of characters. If the minimum equals the maximum, we
    are done. Otherwise, if minimizing, check the rest of the pattern for a
    match; if there isn't one, advance up to the maximum, one character at a
    time.

    If maximizing, advance up to the maximum number of matching characters,
    until Feptr is past the end of the maximum run. If possessive, we are
    then done (no backing up). Otherwise, match at this position; anything
    other than no match is immediately returned. For nomatch, back up one
    character (a whole character in the multi-code-unit UTF case) and try
    again, until the first optional character position has been reached.

    The various UTF/non-UTF and caseful/caseless cases are handled separately,
    for speed. */

    REPEATCHAR:
#ifdef SUPPORT_UNICODE
    if (utf)
      {
      /* Remember where the pattern character starts, decode it into fc with
      its length in Flength, and step the code pointer past it. */

      Flength = 1;
      Lcharptr = Fecode;
      GETCHARLEN(fc, Fecode, Flength);
      Fecode += Flength;

      /* Handle multi-code-unit character matching, caseful and caseless. */

      if (Flength > 1)
        {
        uint32_t othercase;

        /* For a caseless opcode whose character has a different other case,
        encode that other case into Foccu and keep its length; a length of
        zero means there is no alternative form to try. */

        if (Fop >= OP_STARI &&     /* Caseless */
            (othercase = UCD_OTHERCASE(fc)) != fc)
          Loclength = PRIV(ord2utf)(othercase, Foccu);
        else Loclength = 0;

        /* The mandatory matches: each must be either the pattern character
        or its encoded other case. */

        for (i = 1; i <= Lmin; i++)
          {
          if (Feptr <= mb->end_subject - Flength &&
            memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength;
          else if (Loclength > 0 &&
                   Feptr <= mb->end_subject - Loclength &&
                   memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
            Feptr += Loclength;
          else
            {
            CHECK_PARTIAL();
            RRETURN(MATCH_NOMATCH);
            }
          }

        if (Lmin == Lmax) continue;

        /* Minimizing: try the rest of the pattern first and consume one more
        character only when that fails. Lmin counts upwards towards Lmax. */

        if (reptype == REPTYPE_MIN)
          {
          for (;;)
            {
            RMATCH(Fecode, RM202);
            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
            if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
            if (Feptr <= mb->end_subject - Flength &&
              memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength;
            else if (Loclength > 0 &&
                     Feptr <= mb->end_subject - Loclength &&
                     memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
              Feptr += Loclength;
            else
              {
              CHECK_PARTIAL();
              RRETURN(MATCH_NOMATCH);
              }
            }
          /* Control never gets here */
          }

        else  /* Maximize */
          {
          /* Consume as many characters as match, up to Lmax. */

          Lstart_eptr = Feptr;
          for (i = Lmin; i < Lmax; i++)
            {
            if (Feptr <= mb->end_subject - Flength &&
                memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0)
              Feptr += Flength;
            else if (Loclength > 0 &&
                     Feptr <= mb->end_subject - Loclength &&
                     memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
              Feptr += Loclength;
            else
              {
              CHECK_PARTIAL();
              break;
              }
            }

          /* After \C in UTF mode, Lstart_eptr might be in the middle of a
          Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
          go too far. */

          if (reptype != REPTYPE_POS) for(;;)
            {
            if (Feptr <= Lstart_eptr) break;
            RMATCH(Fecode, RM203);
            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
            Feptr--;
            BACKCHAR(Feptr);
            }
          }
        break;   /* End of repeated wide character handling */
        }

      /* Length of UTF character is 1. Put it into the preserved variable and
      fall through to the non-UTF code. */

      Lc = fc;
      }
    else
#endif  /* SUPPORT_UNICODE */

    /* When not in UTF mode, load a single-code-unit character. Then proceed as
    above, using Unicode casing if either UTF or UCP is set. */

    Lc = *Fecode++;

    /* Caseless comparison */

    if (Fop >= OP_STARI)
      {
      /* Work out Loc, the other case of Lc. The 8-bit library indexes the
      case-flipping table directly; the wider libraries use TABLE_GET. With
      Unicode support, characters above 127 take their other case from the
      Unicode data instead. */

#if PCRE2_CODE_UNIT_WIDTH == 8
#ifdef SUPPORT_UNICODE
      if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
      else
#endif  /* SUPPORT_UNICODE */
      /* Lc will be < 128 in UTF-8 mode. */
      Loc = mb->fcc[Lc];
#else /* 16-bit & 32-bit */
#ifdef SUPPORT_UNICODE
      if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc);
      else
#endif  /* SUPPORT_UNICODE */
      Loc = TABLE_GET(Lc, mb->fcc, Lc);
#endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */

      /* The mandatory matches: each code unit must be Lc or Loc. */

      for (i = 1; i <= Lmin; i++)
        {
        uint32_t cc;                 /* Faster than PCRE2_UCHAR */
        if (Feptr >= mb->end_subject)
          {
          SCHECK_PARTIAL();
          RRETURN(MATCH_NOMATCH);
          }
        cc = UCHAR21TEST(Feptr);
        if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH);
        Feptr++;
        }
      if (Lmin == Lmax) continue;

      /* Minimizing: try the continuation before consuming each further
      code unit. */

      if (reptype == REPTYPE_MIN)
        {
        for (;;)
          {
          uint32_t cc;               /* Faster than PCRE2_UCHAR */
          RMATCH(Fecode, RM25);
          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
          if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
          if (Feptr >= mb->end_subject)
            {
            SCHECK_PARTIAL();
            RRETURN(MATCH_NOMATCH);
            }
          cc = UCHAR21TEST(Feptr);
          if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH);
          Feptr++;
          }
        /* Control never gets here */
        }

      else  /* Maximize */
        {
        /* Consume the longest run, then (unless possessive) try the
        continuation at each position, backing up one code unit at a time
        until the starting position is reached. */

        Lstart_eptr = Feptr;
        for (i = Lmin; i < Lmax; i++)
          {
          uint32_t cc;               /* Faster than PCRE2_UCHAR */
          if (Feptr >= mb->end_subject)
            {
            SCHECK_PARTIAL();
            break;
            }
          cc = UCHAR21TEST(Feptr);
          if (Lc != cc && Loc != cc) break;
          Feptr++;
          }
        if (reptype != REPTYPE_POS) for (;;)
          {
          if (Feptr == Lstart_eptr) break;
          RMATCH(Fecode, RM26);
          Feptr--;
          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
          }
        }
      }

    /* Caseful comparisons (includes all multi-byte characters) */

    else
      {
      /* The mandatory matches: each code unit must equal Lc. */

      for (i = 1; i <= Lmin; i++)
        {
        if (Feptr >= mb->end_subject)
          {
          SCHECK_PARTIAL();
          RRETURN(MATCH_NOMATCH);
          }
        if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH);
        }

      if (Lmin == Lmax) continue;

      /* Minimizing: try the continuation before consuming each further
      code unit. */

      if (reptype == REPTYPE_MIN)
        {
        for (;;)
          {
          RMATCH(Fecode, RM27);
          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
          if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
          if (Feptr >= mb->end_subject)
            {
            SCHECK_PARTIAL();
            RRETURN(MATCH_NOMATCH);
            }
          if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH);
          }
        /* Control never gets here */
        }
      else  /* Maximize */
        {
        /* Consume the longest run, then (unless possessive) back up one code
        unit at a time, trying the continuation at each position. */

        Lstart_eptr = Feptr;
        for (i = Lmin; i < Lmax; i++)
          {
          if (Feptr >= mb->end_subject)
            {
            SCHECK_PARTIAL();
            break;
            }

          if (Lc != UCHAR21TEST(Feptr)) break;
          Feptr++;
          }

        if (reptype != REPTYPE_POS) for (;;)
          {
          if (Feptr <= Lstart_eptr) break;
          RMATCH(Fecode, RM28);
          Feptr--;
          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
          }
        }
      }
    break;

#undef Loclength
#undef Lstart_eptr
#undef Lcharptr
#undef Lmin
#undef Lmax
#undef Lc
#undef Loc
1475
1476
1477 /* ===================================================================== */
1478 /* Match a negated single one-byte character repeatedly. This is almost a
1479 repeat of the code for a repeated single character, but I haven't found a
1480 nice way of commoning these up that doesn't require a test of the
1481 positive/negative option for each character match. Maybe that wouldn't add
1482 very much to the time taken, but character matching *is* what this is all
1483 about... */
1484
/* Working variables for the repeated negated-character opcodes. Each name
aliases a temporary slot in the current frame F, and all of them are #undef'd
again at the end of this section.

  Lstart_eptr  subject position at which maximizing started
  Lmin, Lmax   repeat limits; Lmin is also used as a running count when
               minimizing
  Lc, Loc      the excluded character and its other case */

#define Lstart_eptr  F->temp_sptr[0]
#define Lmin         F->temp_32[0]
#define Lmax         F->temp_32[1]
#define Lc           F->temp_32[2]
#define Loc          F->temp_32[3]

    /* Exact count: reptype is not set here because the common code leaves via
    the "Lmin == Lmax" test before reptype is ever inspected. */

    case OP_NOTEXACT:
    case OP_NOTEXACTI:
    Lmin = Lmax = GET2(Fecode, 1);
    Fecode += 1 + IMM2_SIZE;
    goto REPEATNOTCHAR;

    /* Bounded repeats {0,n}: greedy and lazy variants. The limit is read from
    the code immediately after the opcode. */

    case OP_NOTUPTO:
    case OP_NOTUPTOI:
    Lmin = 0;
    Lmax = GET2(Fecode, 1);
    reptype = REPTYPE_MAX;
    Fecode += 1 + IMM2_SIZE;
    goto REPEATNOTCHAR;

    case OP_NOTMINUPTO:
    case OP_NOTMINUPTOI:
    Lmin = 0;
    Lmax = GET2(Fecode, 1);
    reptype = REPTYPE_MIN;
    Fecode += 1 + IMM2_SIZE;
    goto REPEATNOTCHAR;

    /* Possessive *, +, ? and {0,n} */

    case OP_NOTPOSSTAR:
    case OP_NOTPOSSTARI:
    reptype = REPTYPE_POS;
    Lmin = 0;
    Lmax = UINT32_MAX;
    Fecode++;
    goto REPEATNOTCHAR;

    case OP_NOTPOSPLUS:
    case OP_NOTPOSPLUSI:
    reptype = REPTYPE_POS;
    Lmin = 1;
    Lmax = UINT32_MAX;
    Fecode++;
    goto REPEATNOTCHAR;

    case OP_NOTPOSQUERY:
    case OP_NOTPOSQUERYI:
    reptype = REPTYPE_POS;
    Lmin = 0;
    Lmax = 1;
    Fecode++;
    goto REPEATNOTCHAR;

    case OP_NOTPOSUPTO:
    case OP_NOTPOSUPTOI:
    reptype = REPTYPE_POS;
    Lmin = 0;
    Lmax = GET2(Fecode, 1);
    Fecode += 1 + IMM2_SIZE;
    goto REPEATNOTCHAR;

    /* Greedy and lazy *, + and ?. The opcode's offset from OP_NOTSTAR
    (caseful) or OP_NOTSTARI (caseless) indexes the rep_min, rep_max and
    rep_typ tables. */

    case OP_NOTSTAR:
    case OP_NOTSTARI:
    case OP_NOTMINSTAR:
    case OP_NOTMINSTARI:
    case OP_NOTPLUS:
    case OP_NOTPLUSI:
    case OP_NOTMINPLUS:
    case OP_NOTMINPLUSI:
    case OP_NOTQUERY:
    case OP_NOTQUERYI:
    case OP_NOTMINQUERY:
    case OP_NOTMINQUERYI:
    fc = *Fecode++ - ((Fop >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
    Lmin = rep_min[fc];
    Lmax = rep_max[fc];
    reptype = rep_typ[fc];

    /* Common code for all repeated single-character non-matches. The excluded
    character is read from the code into Lc, advancing Fecode past it. */

    REPEATNOTCHAR:
    GETCHARINCTEST(Lc, Fecode);

    /* The code is duplicated for the caseless and caseful cases, for speed,
    since matching characters is likely to be quite common. First, ensure the
    minimum number of matches are present. If Lmin = Lmax, we are done.
    Otherwise, if minimizing, keep trying the rest of the expression and
    advancing one matching character if failing, up to the maximum.
    Alternatively, if maximizing, find the maximum number of characters and
    work backwards. */

    if (Fop >= OP_NOTSTARI)     /* Caseless */
      {
      /* Loc is the other case of the excluded character: from the Unicode
      data for characters above 127 when UTF or UCP is set, otherwise from the
      case-flipping table. */

#ifdef SUPPORT_UNICODE
      if ((utf || ucp) && Lc > 127)
        Loc = UCD_OTHERCASE(Lc);
      else
#endif /* SUPPORT_UNICODE */

      Loc = TABLE_GET(Lc, mb->fcc, Lc);  /* Other case from table */

      /* The mandatory matches: each subject character must be neither Lc nor
      Loc. */

#ifdef SUPPORT_UNICODE
      if (utf)
        {
        uint32_t d;
        for (i = 1; i <= Lmin; i++)
          {
          if (Feptr >= mb->end_subject)
            {
            SCHECK_PARTIAL();
            RRETURN(MATCH_NOMATCH);
            }
          GETCHARINC(d, Feptr);
          if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH);
          }
        }
      else
#endif  /* SUPPORT_UNICODE */

      /* Not UTF mode */
        {
        for (i = 1; i <= Lmin; i++)
          {
          if (Feptr >= mb->end_subject)
            {
            SCHECK_PARTIAL();
            RRETURN(MATCH_NOMATCH);
            }
          if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH);
          Feptr++;
          }
        }

      if (Lmin == Lmax) continue;  /* Finished for exact count */

      /* Minimizing: try the continuation before consuming each further
      character; Lmin counts upwards towards Lmax. */

      if (reptype == REPTYPE_MIN)
        {
#ifdef SUPPORT_UNICODE
        if (utf)
          {
          uint32_t d;
          for (;;)
            {
            RMATCH(Fecode, RM204);
            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
            if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
            if (Feptr >= mb->end_subject)
              {
              SCHECK_PARTIAL();
              RRETURN(MATCH_NOMATCH);
              }
            GETCHARINC(d, Feptr);
            if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH);
            }
          }
        else
#endif  /*SUPPORT_UNICODE */

        /* Not UTF mode */
          {
          for (;;)
            {
            RMATCH(Fecode, RM29);
            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
            if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
            if (Feptr >= mb->end_subject)
              {
              SCHECK_PARTIAL();
              RRETURN(MATCH_NOMATCH);
              }
            if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH);
            Feptr++;
            }
          }
        /* Control never gets here */
        }

      /* Maximize case */

      else
        {
        Lstart_eptr = Feptr;

#ifdef SUPPORT_UNICODE
        if (utf)
          {
          uint32_t d;

          /* Consume the longest run of acceptable characters. The character
          is decoded without advancing so that Feptr moves only on success. */

          for (i = Lmin; i < Lmax; i++)
            {
            int len = 1;
            if (Feptr >= mb->end_subject)
              {
              SCHECK_PARTIAL();
              break;
              }
            GETCHARLEN(d, Feptr, len);
            if (Lc == d || Loc == d) break;
            Feptr += len;
            }

          /* After \C in UTF mode, Lstart_eptr might be in the middle of a
          Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
          go too far. */

          if (reptype != REPTYPE_POS) for(;;)
            {
            if (Feptr <= Lstart_eptr) break;
            RMATCH(Fecode, RM205);
            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
            Feptr--;
            BACKCHAR(Feptr);
            }
          }
        else
#endif  /* SUPPORT_UNICODE */

        /* Not UTF mode */
          {
          for (i = Lmin; i < Lmax; i++)
            {
            if (Feptr >= mb->end_subject)
              {
              SCHECK_PARTIAL();
              break;
              }
            if (Lc == *Feptr || Loc == *Feptr) break;
            Feptr++;
            }

          /* Unless possessive, back up one code unit at a time, trying the
          continuation at each position down to the starting point. */

          if (reptype != REPTYPE_POS) for (;;)
            {
            if (Feptr == Lstart_eptr) break;
            RMATCH(Fecode, RM30);
            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
            Feptr--;
            }
          }
        }
      }

    /* Caseful comparisons */

    else
      {
      /* The mandatory matches: each subject character must differ from Lc. */

#ifdef SUPPORT_UNICODE
      if (utf)
        {
        uint32_t d;
        for (i = 1; i <= Lmin; i++)
          {
          if (Feptr >= mb->end_subject)
            {
            SCHECK_PARTIAL();
            RRETURN(MATCH_NOMATCH);
            }
          GETCHARINC(d, Feptr);
          if (Lc == d) RRETURN(MATCH_NOMATCH);
          }
        }
      else
#endif
      /* Not UTF mode */
        {
        for (i = 1; i <= Lmin; i++)
          {
          if (Feptr >= mb->end_subject)
            {
            SCHECK_PARTIAL();
            RRETURN(MATCH_NOMATCH);
            }
          if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH);
          }
        }

      if (Lmin == Lmax) continue;

      /* Minimizing: try the continuation before consuming each further
      character. */

      if (reptype == REPTYPE_MIN)
        {
#ifdef SUPPORT_UNICODE
        if (utf)
          {
          uint32_t d;
          for (;;)
            {
            RMATCH(Fecode, RM206);
            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
            if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
            if (Feptr >= mb->end_subject)
              {
              SCHECK_PARTIAL();
              RRETURN(MATCH_NOMATCH);
              }
            GETCHARINC(d, Feptr);
            if (Lc == d) RRETURN(MATCH_NOMATCH);
            }
          }
        else
#endif
        /* Not UTF mode */
          {
          for (;;)
            {
            RMATCH(Fecode, RM31);
            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
            if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
            if (Feptr >= mb->end_subject)
              {
              SCHECK_PARTIAL();
              RRETURN(MATCH_NOMATCH);
              }
            if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH);
            }
          }
        /* Control never gets here */
        }

      /* Maximize case */

      else
        {
        Lstart_eptr = Feptr;

#ifdef SUPPORT_UNICODE
        if (utf)
          {
          uint32_t d;

          /* Consume the longest run of acceptable characters. The character
          is decoded without advancing so that Feptr moves only on success. */

          for (i = Lmin; i < Lmax; i++)
            {
            int len = 1;
            if (Feptr >= mb->end_subject)
              {
              SCHECK_PARTIAL();
              break;
              }
            GETCHARLEN(d, Feptr, len);
            if (Lc == d) break;
            Feptr += len;
            }

          /* After \C in UTF mode, Lstart_eptr might be in the middle of a
          Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
          go too far. */

          if (reptype != REPTYPE_POS) for(;;)
            {
            if (Feptr <= Lstart_eptr) break;
            RMATCH(Fecode, RM207);
            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
            Feptr--;
            BACKCHAR(Feptr);
            }
          }
        else
#endif
        /* Not UTF mode */
          {
          for (i = Lmin; i < Lmax; i++)
            {
            if (Feptr >= mb->end_subject)
              {
              SCHECK_PARTIAL();
              break;
              }
            if (Lc == *Feptr) break;
            Feptr++;
            }

          /* Unless possessive, back up one code unit at a time, trying the
          continuation at each position down to the starting point. */

          if (reptype != REPTYPE_POS) for (;;)
            {
            if (Feptr == Lstart_eptr) break;
            RMATCH(Fecode, RM32);
            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
            Feptr--;
            }
          }
        }
      }
    break;

#undef Lstart_eptr
#undef Lmin
#undef Lmax
#undef Lc
#undef Loc
1866
1867
1868 /* ===================================================================== */
1869 /* Match a bit-mapped character class, possibly repeatedly. These opcodes
1870 are used when all the characters in the class have values in the range
1871 0-255, and either the matching is caseful, or the characters are in the
1872 range 0-127 when UTF processing is enabled. The only difference between
1873 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1874 encountered. */
1875
1876#define Lmin F->temp_32[0]
1877#define Lmax F->temp_32[1]
1878#define Lstart_eptr F->temp_sptr[0]
1879#define Lbyte_map_address F->temp_sptr[1]
1880#define Lbyte_map ((unsigned char *)Lbyte_map_address)
1881
1882 case OP_NCLASS:
1883 case OP_CLASS:
1884 {
1885 Lbyte_map_address = Fecode + 1; /* Save for matching */
1886 Fecode += 1 + (32 / sizeof(PCRE2_UCHAR)); /* Advance past the item */
1887
1888 /* Look past the end of the item to see if there is repeat information
1889 following. Then obey similar code to character type repeats. */
1890
1891 switch (*Fecode)
1892 {
1893 case OP_CRSTAR:
1894 case OP_CRMINSTAR:
1895 case OP_CRPLUS:
1896 case OP_CRMINPLUS:
1897 case OP_CRQUERY:
1898 case OP_CRMINQUERY:
1899 case OP_CRPOSSTAR:
1900 case OP_CRPOSPLUS:
1901 case OP_CRPOSQUERY:
1902 fc = *Fecode++ - OP_CRSTAR;
1903 Lmin = rep_min[fc];
1904 Lmax = rep_max[fc];
1905 reptype = rep_typ[fc];
1906 break;
1907
1908 case OP_CRRANGE:
1909 case OP_CRMINRANGE:
1910 case OP_CRPOSRANGE:
1911 Lmin = GET2(Fecode, 1);
1912 Lmax = GET2(Fecode, 1 + IMM2_SIZE);
1913 if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
1914 reptype = rep_typ[*Fecode - OP_CRSTAR];
1915 Fecode += 1 + 2 * IMM2_SIZE;
1916 break;
1917
1918 default: /* No repeat follows */
1919 Lmin = Lmax = 1;
1920 break;
1921 }
1922
1923 /* First, ensure the minimum number of matches are present. */
1924
1925#ifdef SUPPORT_UNICODE
1926 if (utf)
1927 {
1928 for (i = 1; i <= Lmin; i++)
1929 {
1930 if (Feptr >= mb->end_subject)
1931 {
1932 SCHECK_PARTIAL();
1933 RRETURN(MATCH_NOMATCH);
1934 }
1935 GETCHARINC(fc, Feptr);
1936 if (fc > 255)
1937 {
1938 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
1939 }
1940 else
1941 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
1942 }
1943 }
1944 else
1945#endif
1946 /* Not UTF mode */
1947 {
1948 for (i = 1; i <= Lmin; i++)
1949 {
1950 if (Feptr >= mb->end_subject)
1951 {
1952 SCHECK_PARTIAL();
1953 RRETURN(MATCH_NOMATCH);
1954 }
1955 fc = *Feptr++;
1956#if PCRE2_CODE_UNIT_WIDTH != 8
1957 if (fc > 255)
1958 {
1959 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
1960 }
1961 else
1962#endif
1963 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
1964 }
1965 }
1966
1967 /* If Lmax == Lmin we are done. Continue with main loop. */
1968
1969 if (Lmin == Lmax) continue;
1970
1971 /* If minimizing, keep testing the rest of the expression and advancing
1972 the pointer while it matches the class. */
1973
1974 if (reptype == REPTYPE_MIN)
1975 {
1976#ifdef SUPPORT_UNICODE
1977 if (utf)
1978 {
1979 for (;;)
1980 {
1981 RMATCH(Fecode, RM200);
1982 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1983 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1984 if (Feptr >= mb->end_subject)
1985 {
1986 SCHECK_PARTIAL();
1987 RRETURN(MATCH_NOMATCH);
1988 }
1989 GETCHARINC(fc, Feptr);
1990 if (fc > 255)
1991 {
1992 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
1993 }
1994 else
1995 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
1996 }
1997 }
1998 else
1999#endif
2000 /* Not UTF mode */
2001 {
2002 for (;;)
2003 {
2004 RMATCH(Fecode, RM23);
2005 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2006 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
2007 if (Feptr >= mb->end_subject)
2008 {
2009 SCHECK_PARTIAL();
2010 RRETURN(MATCH_NOMATCH);
2011 }
2012 fc = *Feptr++;
2013#if PCRE2_CODE_UNIT_WIDTH != 8
2014 if (fc > 255)
2015 {
2016 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
2017 }
2018 else
2019#endif
2020 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
2021 }
2022 }
2023 /* Control never gets here */
2024 }
2025
2026 /* If maximizing, find the longest possible run, then work backwards. */
2027
2028 else
2029 {
2030 Lstart_eptr = Feptr;
2031
2032#ifdef SUPPORT_UNICODE
2033 if (utf)
2034 {
2035 for (i = Lmin; i < Lmax; i++)
2036 {
2037 int len = 1;
2038 if (Feptr >= mb->end_subject)
2039 {
2040 SCHECK_PARTIAL();
2041 break;
2042 }
2043 GETCHARLEN(fc, Feptr, len);
2044 if (fc > 255)
2045 {
2046 if (Fop == OP_CLASS) break;
2047 }
2048 else
2049 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break;
2050 Feptr += len;
2051 }
2052
2053 if (reptype == REPTYPE_POS) continue; /* No backtracking */
2054
2055 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
2056 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
2057 go too far. */
2058
2059 for (;;)
2060 {
2061 RMATCH(Fecode, RM201);
2062 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2063 if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */
2064 BACKCHAR(Feptr);
2065 }
2066 }
2067 else
2068#endif
2069 /* Not UTF mode */
2070 {
2071 for (i = Lmin; i < Lmax; i++)
2072 {
2073 if (Feptr >= mb->end_subject)
2074 {
2075 SCHECK_PARTIAL();
2076 break;
2077 }
2078 fc = *Feptr;
2079#if PCRE2_CODE_UNIT_WIDTH != 8
2080 if (fc > 255)
2081 {
2082 if (Fop == OP_CLASS) break;
2083 }
2084 else
2085#endif
2086 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break;
2087 Feptr++;
2088 }
2089
2090 if (reptype == REPTYPE_POS) continue; /* No backtracking */
2091
2092 while (Feptr >= Lstart_eptr)
2093 {
2094 RMATCH(Fecode, RM24);
2095 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2096 Feptr--;
2097 }
2098 }
2099
2100 RRETURN(MATCH_NOMATCH);
2101 }
2102 }
2103 /* Control never gets here */
2104
2105#undef Lbyte_map_address
2106#undef Lbyte_map
2107#undef Lstart_eptr
2108#undef Lmin
2109#undef Lmax
2110
2111
2112 /* ===================================================================== */
    /* Match an extended character class. In the 8-bit library, this opcode is
    encountered only when UTF-8 mode is supported. In the 16-bit and
    32-bit libraries, codepoints greater than 255 may be encountered even when
    UTF is not supported. */
2117
/* Working variables for the extended class opcode. Each name aliases a
temporary slot in the current frame F, and all of them are #undef'd again at
the end of this section.

  Lstart_eptr   subject position at which maximizing started
  Lxclass_data  address of the class data in the compiled code
  Lmin, Lmax    repeat limits; Lmin is also used as a running count when
                minimizing */

#define Lstart_eptr  F->temp_sptr[0]
#define Lxclass_data F->temp_sptr[1]
#define Lmin         F->temp_32[0]
#define Lmax         F->temp_32[1]

#ifdef SUPPORT_WIDE_CHARS
    case OP_XCLASS:
      {
      Lxclass_data = Fecode + 1 + LINK_SIZE;  /* Save for matching */
      Fecode += GET(Fecode, 1);               /* Advance past the item */

      /* Pick up any repeat information that follows the class item. */

      switch (*Fecode)
        {
        case OP_CRSTAR:
        case OP_CRMINSTAR:
        case OP_CRPLUS:
        case OP_CRMINPLUS:
        case OP_CRQUERY:
        case OP_CRMINQUERY:
        case OP_CRPOSSTAR:
        case OP_CRPOSPLUS:
        case OP_CRPOSQUERY:
        fc = *Fecode++ - OP_CRSTAR;
        Lmin = rep_min[fc];
        Lmax = rep_max[fc];
        reptype = rep_typ[fc];
        break;

        case OP_CRRANGE:
        case OP_CRMINRANGE:
        case OP_CRPOSRANGE:
        Lmin = GET2(Fecode, 1);
        Lmax = GET2(Fecode, 1 + IMM2_SIZE);
        if (Lmax == 0) Lmax = UINT32_MAX;  /* Max 0 => infinity */
        reptype = rep_typ[*Fecode - OP_CRSTAR];
        Fecode += 1 + 2 * IMM2_SIZE;
        break;

        default:               /* No repeat follows */
        Lmin = Lmax = 1;
        break;
        }

      /* First, ensure the minimum number of matches are present. */

      for (i = 1; i <= Lmin; i++)
        {
        if (Feptr >= mb->end_subject)
          {
          SCHECK_PARTIAL();
          RRETURN(MATCH_NOMATCH);
          }
        GETCHARINCTEST(fc, Feptr);
        if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH);
        }

      /* If Lmax == Lmin we can just continue with the main loop. */

      if (Lmin == Lmax) continue;

      /* If minimizing, keep testing the rest of the expression and advancing
      the pointer while it matches the class. */

      if (reptype == REPTYPE_MIN)
        {
        for (;;)
          {
          RMATCH(Fecode, RM100);
          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
          if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
          if (Feptr >= mb->end_subject)
            {
            SCHECK_PARTIAL();
            RRETURN(MATCH_NOMATCH);
            }
          GETCHARINCTEST(fc, Feptr);
          if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH);
          }
        /* Control never gets here */
        }

      /* If maximizing, find the longest possible run, then work backwards. */

      else
        {
        Lstart_eptr = Feptr;
        for (i = Lmin; i < Lmax; i++)
          {
          int len = 1;
          if (Feptr >= mb->end_subject)
            {
            SCHECK_PARTIAL();
            break;
            }
#ifdef SUPPORT_UNICODE
          GETCHARLENTEST(fc, Feptr, len);
#else
          fc = *Feptr;
#endif
          if (!PRIV(xclass)(fc, Lxclass_data, utf)) break;
          Feptr += len;
          }

        if (reptype == REPTYPE_POS) continue;    /* No backtracking */

        /* After \C in UTF mode, Lstart_eptr might be in the middle of a
        Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
        go too far. The test is made before the decrement so that Feptr is
        never moved to before the starting position, which may be the very
        start of the subject buffer (forming such a pointer is undefined
        behaviour). */

        for(;;)
          {
          RMATCH(Fecode, RM101);
          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
          if (Feptr <= Lstart_eptr) break;  /* Tried at original position */
          Feptr--;
#ifdef SUPPORT_UNICODE
          if (utf) BACKCHAR(Feptr);
#endif
          }
        RRETURN(MATCH_NOMATCH);
        }

      /* Control never gets here */
      }
#endif  /* SUPPORT_WIDE_CHARS: end of XCLASS */

#undef Lstart_eptr
#undef Lxclass_data
#undef Lmin
#undef Lmax
2247
2248
2249 /* ===================================================================== */
2250 /* Match various character types when PCRE2_UCP is not set. These opcodes
2251 are not generated when PCRE2_UCP is set - instead appropriate property
2252 tests are compiled. */
2253
2254 case OP_NOT_DIGIT:
2255 if (Feptr >= mb->end_subject)
2256 {
2257 SCHECK_PARTIAL();
2258 RRETURN(MATCH_NOMATCH);
2259 }
2260 GETCHARINCTEST(fc, Feptr);
2261 if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0)
2262 RRETURN(MATCH_NOMATCH);
2263 Fecode++;
2264 break;
2265
2266 case OP_DIGIT:
2267 if (Feptr >= mb->end_subject)
2268 {
2269 SCHECK_PARTIAL();
2270 RRETURN(MATCH_NOMATCH);
2271 }
2272 GETCHARINCTEST(fc, Feptr);
2273 if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0)
2274 RRETURN(MATCH_NOMATCH);
2275 Fecode++;
2276 break;
2277
2278 case OP_NOT_WHITESPACE:
2279 if (Feptr >= mb->end_subject)
2280 {
2281 SCHECK_PARTIAL();
2282 RRETURN(MATCH_NOMATCH);
2283 }
2284 GETCHARINCTEST(fc, Feptr);
2285 if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0)
2286 RRETURN(MATCH_NOMATCH);
2287 Fecode++;
2288 break;
2289
2290 case OP_WHITESPACE:
2291 if (Feptr >= mb->end_subject)
2292 {
2293 SCHECK_PARTIAL();
2294 RRETURN(MATCH_NOMATCH);
2295 }
2296 GETCHARINCTEST(fc, Feptr);
2297 if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0)
2298 RRETURN(MATCH_NOMATCH);
2299 Fecode++;
2300 break;
2301
2302 case OP_NOT_WORDCHAR:
2303 if (Feptr >= mb->end_subject)
2304 {
2305 SCHECK_PARTIAL();
2306 RRETURN(MATCH_NOMATCH);
2307 }
2308 GETCHARINCTEST(fc, Feptr);
2309 if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0)
2310 RRETURN(MATCH_NOMATCH);
2311 Fecode++;
2312 break;
2313
2314 case OP_WORDCHAR:
2315 if (Feptr >= mb->end_subject)
2316 {
2317 SCHECK_PARTIAL();
2318 RRETURN(MATCH_NOMATCH);
2319 }
2320 GETCHARINCTEST(fc, Feptr);
2321 if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0)
2322 RRETURN(MATCH_NOMATCH);
2323 Fecode++;
2324 break;
2325
2326 case OP_ANYNL:
2327 if (Feptr >= mb->end_subject)
2328 {
2329 SCHECK_PARTIAL();
2330 RRETURN(MATCH_NOMATCH);
2331 }
2332 GETCHARINCTEST(fc, Feptr);
2333 switch(fc)
2334 {
2335 default: RRETURN(MATCH_NOMATCH);
2336
2337 case CHAR_CR:
2338 if (Feptr >= mb->end_subject)
2339 {
2340 SCHECK_PARTIAL();
2341 }
2342 else if (UCHAR21TEST(Feptr) == CHAR_LF) Feptr++;
2343 break;
2344
2345 case CHAR_LF:
2346 break;
2347
2348 case CHAR_VT:
2349 case CHAR_FF:
2350 case CHAR_NEL:
2351#ifndef EBCDIC
2352 case 0x2028:
2353 case 0x2029:
2354#endif /* Not EBCDIC */
2355 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
2356 break;
2357 }
2358 Fecode++;
2359 break;
2360
2361 case OP_NOT_HSPACE:
2362 if (Feptr >= mb->end_subject)
2363 {
2364 SCHECK_PARTIAL();
2365 RRETURN(MATCH_NOMATCH);
2366 }
2367 GETCHARINCTEST(fc, Feptr);
2368 switch(fc)
2369 {
2370 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2371 default: break;
2372 }
2373 Fecode++;
2374 break;
2375
2376 case OP_HSPACE:
2377 if (Feptr >= mb->end_subject)
2378 {
2379 SCHECK_PARTIAL();
2380 RRETURN(MATCH_NOMATCH);
2381 }
2382 GETCHARINCTEST(fc, Feptr);
2383 switch(fc)
2384 {
2385 HSPACE_CASES: break; /* Byte and multibyte cases */
2386 default: RRETURN(MATCH_NOMATCH);
2387 }
2388 Fecode++;
2389 break;
2390
2391 case OP_NOT_VSPACE:
2392 if (Feptr >= mb->end_subject)
2393 {
2394 SCHECK_PARTIAL();
2395 RRETURN(MATCH_NOMATCH);
2396 }
2397 GETCHARINCTEST(fc, Feptr);
2398 switch(fc)
2399 {
2400 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2401 default: break;
2402 }
2403 Fecode++;
2404 break;
2405
2406 case OP_VSPACE:
2407 if (Feptr >= mb->end_subject)
2408 {
2409 SCHECK_PARTIAL();
2410 RRETURN(MATCH_NOMATCH);
2411 }
2412 GETCHARINCTEST(fc, Feptr);
2413 switch(fc)
2414 {
2415 VSPACE_CASES: break;
2416 default: RRETURN(MATCH_NOMATCH);
2417 }
2418 Fecode++;
2419 break;
2420
2421
2422#ifdef SUPPORT_UNICODE
2423
2424 /* ===================================================================== */
2425 /* Check the next character by Unicode property. We will get here only
2426 if the support is in the binary; otherwise a compile-time error occurs. */
2427
2428 case OP_PROP:
2429 case OP_NOTPROP:
2430 if (Feptr >= mb->end_subject)
2431 {
2432 SCHECK_PARTIAL();
2433 RRETURN(MATCH_NOMATCH);
2434 }
2435 GETCHARINCTEST(fc, Feptr);
2436 {
2437 const uint32_t *cp;
2438 const ucd_record *prop = GET_UCD(fc);
2439 BOOL notmatch = Fop == OP_NOTPROP;
2440
2441 switch(Fecode[1])
2442 {
2443 case PT_ANY:
2444 if (notmatch) RRETURN(MATCH_NOMATCH);
2445 break;
2446
2447 case PT_LAMP:
2448 if ((prop->chartype == ucp_Lu ||
2449 prop->chartype == ucp_Ll ||
2450 prop->chartype == ucp_Lt) == notmatch)
2451 RRETURN(MATCH_NOMATCH);
2452 break;
2453
2454 case PT_GC:
2455 if ((Fecode[2] == PRIV(ucp_gentype)[prop->chartype]) == notmatch)
2456 RRETURN(MATCH_NOMATCH);
2457 break;
2458
2459 case PT_PC:
2460 if ((Fecode[2] == prop->chartype) == notmatch)
2461 RRETURN(MATCH_NOMATCH);
2462 break;
2463
2464 case PT_SC:
2465 if ((Fecode[2] == prop->script) == notmatch)
2466 RRETURN(MATCH_NOMATCH);
2467 break;
2468
2469 case PT_SCX:
2470 {
2471 BOOL ok = (Fecode[2] == prop->script ||
2472 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Fecode[2]) != 0);
2473 if (ok == notmatch) RRETURN(MATCH_NOMATCH);
2474 }
2475 break;
2476
2477 /* These are specials */
2478
2479 case PT_ALNUM:
2480 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2481 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == notmatch)
2482 RRETURN(MATCH_NOMATCH);
2483 break;
2484
2485 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2486 which means that Perl space and POSIX space are now identical. PCRE
2487 was changed at release 8.34. */
2488
2489 case PT_SPACE: /* Perl space */
2490 case PT_PXSPACE: /* POSIX space */
2491 switch(fc)
2492 {
2493 HSPACE_CASES:
2494 VSPACE_CASES:
2495 if (notmatch) RRETURN(MATCH_NOMATCH);
2496 break;
2497
2498 default:
2499 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == notmatch)
2500 RRETURN(MATCH_NOMATCH);
2501 break;
2502 }
2503 break;
2504
2505 case PT_WORD:
2506 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2507 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2508 fc == CHAR_UNDERSCORE) == notmatch)
2509 RRETURN(MATCH_NOMATCH);
2510 break;
2511
2512 case PT_CLIST:
2513 cp = PRIV(ucd_caseless_sets) + Fecode[2];
2514 for (;;)
2515 {
2516 if (fc < *cp)
2517 { if (notmatch) break; else { RRETURN(MATCH_NOMATCH); } }
2518 if (fc == *cp++)
2519 { if (notmatch) { RRETURN(MATCH_NOMATCH); } else break; }
2520 }
2521 break;
2522
2523 case PT_UCNC:
2524 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
2525 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
2526 fc >= 0xe000) == notmatch)
2527 RRETURN(MATCH_NOMATCH);
2528 break;
2529
2530 case PT_BIDICL:
2531 if ((UCD_BIDICLASS_PROP(prop) == Fecode[2]) == notmatch)
2532 RRETURN(MATCH_NOMATCH);
2533 break;
2534
2535 case PT_BOOL:
2536 {
2537 BOOL ok = MAPBIT(PRIV(ucd_boolprop_sets) +
2538 UCD_BPROPS_PROP(prop), Fecode[2]) != 0;
2539 if (ok == notmatch) RRETURN(MATCH_NOMATCH);
2540 }
2541 break;
2542
2543 /* This should never occur */
2544
2545 default:
2546 return PCRE2_ERROR_INTERNAL;
2547 }
2548
2549 Fecode += 3;
2550 }
2551 break;
2552
2553
2554 /* ===================================================================== */
2555 /* Match an extended Unicode sequence. We will get here only if the support
2556 is in the binary; otherwise a compile-time error occurs. */
2557
2558 case OP_EXTUNI:
2559 if (Feptr >= mb->end_subject)
2560 {
2561 SCHECK_PARTIAL();
2562 RRETURN(MATCH_NOMATCH);
2563 }
2564 else
2565 {
2566 GETCHARINCTEST(fc, Feptr);
2567 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, utf,
2568 NULL);
2569 }
2570 CHECK_PARTIAL();
2571 Fecode++;
2572 break;
2573
2574#endif /* SUPPORT_UNICODE */
2575
2576
2577 /* ===================================================================== */
2578 /* Match a single character type repeatedly. Note that the property type
2579 does not need to be in a stack frame as it is not used within an RMATCH()
2580 loop. */
2581
2582#define Lstart_eptr F->temp_sptr[0]
2583#define Lmin F->temp_32[0]
2584#define Lmax F->temp_32[1]
2585#define Lctype F->temp_32[2]
2586#define Lpropvalue F->temp_32[3]
2587
2588 case OP_TYPEEXACT:
2589 Lmin = Lmax = GET2(Fecode, 1);
2590 Fecode += 1 + IMM2_SIZE;
2591 goto REPEATTYPE;
2592
2593 case OP_TYPEUPTO:
2594 case OP_TYPEMINUPTO:
2595 Lmin = 0;
2596 Lmax = GET2(Fecode, 1);
2597 reptype = (*Fecode == OP_TYPEMINUPTO)? REPTYPE_MIN : REPTYPE_MAX;
2598 Fecode += 1 + IMM2_SIZE;
2599 goto REPEATTYPE;
2600
2601 case OP_TYPEPOSSTAR:
2602 reptype = REPTYPE_POS;
2603 Lmin = 0;
2604 Lmax = UINT32_MAX;
2605 Fecode++;
2606 goto REPEATTYPE;
2607
2608 case OP_TYPEPOSPLUS:
2609 reptype = REPTYPE_POS;
2610 Lmin = 1;
2611 Lmax = UINT32_MAX;
2612 Fecode++;
2613 goto REPEATTYPE;
2614
2615 case OP_TYPEPOSQUERY:
2616 reptype = REPTYPE_POS;
2617 Lmin = 0;
2618 Lmax = 1;
2619 Fecode++;
2620 goto REPEATTYPE;
2621
2622 case OP_TYPEPOSUPTO:
2623 reptype = REPTYPE_POS;
2624 Lmin = 0;
2625 Lmax = GET2(Fecode, 1);
2626 Fecode += 1 + IMM2_SIZE;
2627 goto REPEATTYPE;
2628
2629 case OP_TYPESTAR:
2630 case OP_TYPEMINSTAR:
2631 case OP_TYPEPLUS:
2632 case OP_TYPEMINPLUS:
2633 case OP_TYPEQUERY:
2634 case OP_TYPEMINQUERY:
2635 fc = *Fecode++ - OP_TYPESTAR;
2636 Lmin = rep_min[fc];
2637 Lmax = rep_max[fc];
2638 reptype = rep_typ[fc];
2639
2640 /* Common code for all repeated character type matches. */
2641
2642 REPEATTYPE:
2643 Lctype = *Fecode++; /* Code for the character type */
2644
2645#ifdef SUPPORT_UNICODE
2646 if (Lctype == OP_PROP || Lctype == OP_NOTPROP)
2647 {
2648 proptype = *Fecode++;
2649 Lpropvalue = *Fecode++;
2650 }
2651 else proptype = -1;
2652#endif
2653
2654 /* First, ensure the minimum number of matches are present. Use inline
2655 code for maximizing the speed, and do the type test once at the start
2656 (i.e. keep it out of the loops). As there are no calls to RMATCH in the
2657 loops, we can use an ordinary variable for "notmatch". The code for UTF
2658 mode is separated out for tidiness, except for Unicode property tests. */
2659
2660 if (Lmin > 0)
2661 {
2662#ifdef SUPPORT_UNICODE
2663 if (proptype >= 0) /* Property tests in all modes */
2664 {
2665 BOOL notmatch = Lctype == OP_NOTPROP;
2666 switch(proptype)
2667 {
2668 case PT_ANY:
2669 if (notmatch) RRETURN(MATCH_NOMATCH);
2670 for (i = 1; i <= Lmin; i++)
2671 {
2672 if (Feptr >= mb->end_subject)
2673 {
2674 SCHECK_PARTIAL();
2675 RRETURN(MATCH_NOMATCH);
2676 }
2677 GETCHARINCTEST(fc, Feptr);
2678 }
2679 break;
2680
2681 case PT_LAMP:
2682 for (i = 1; i <= Lmin; i++)
2683 {
2684 int chartype;
2685 if (Feptr >= mb->end_subject)
2686 {
2687 SCHECK_PARTIAL();
2688 RRETURN(MATCH_NOMATCH);
2689 }
2690 GETCHARINCTEST(fc, Feptr);
2691 chartype = UCD_CHARTYPE(fc);
2692 if ((chartype == ucp_Lu ||
2693 chartype == ucp_Ll ||
2694 chartype == ucp_Lt) == notmatch)
2695 RRETURN(MATCH_NOMATCH);
2696 }
2697 break;
2698
2699 case PT_GC:
2700 for (i = 1; i <= Lmin; i++)
2701 {
2702 if (Feptr >= mb->end_subject)
2703 {
2704 SCHECK_PARTIAL();
2705 RRETURN(MATCH_NOMATCH);
2706 }
2707 GETCHARINCTEST(fc, Feptr);
2708 if ((UCD_CATEGORY(fc) == Lpropvalue) == notmatch)
2709 RRETURN(MATCH_NOMATCH);
2710 }
2711 break;
2712
2713 case PT_PC:
2714 for (i = 1; i <= Lmin; i++)
2715 {
2716 if (Feptr >= mb->end_subject)
2717 {
2718 SCHECK_PARTIAL();
2719 RRETURN(MATCH_NOMATCH);
2720 }
2721 GETCHARINCTEST(fc, Feptr);
2722 if ((UCD_CHARTYPE(fc) == Lpropvalue) == notmatch)
2723 RRETURN(MATCH_NOMATCH);
2724 }
2725 break;
2726
2727 case PT_SC:
2728 for (i = 1; i <= Lmin; i++)
2729 {
2730 if (Feptr >= mb->end_subject)
2731 {
2732 SCHECK_PARTIAL();
2733 RRETURN(MATCH_NOMATCH);
2734 }
2735 GETCHARINCTEST(fc, Feptr);
2736 if ((UCD_SCRIPT(fc) == Lpropvalue) == notmatch)
2737 RRETURN(MATCH_NOMATCH);
2738 }
2739 break;
2740
2741 case PT_SCX:
2742 for (i = 1; i <= Lmin; i++)
2743 {
2744 BOOL ok;
2745 const ucd_record *prop;
2746 if (Feptr >= mb->end_subject)
2747 {
2748 SCHECK_PARTIAL();
2749 RRETURN(MATCH_NOMATCH);
2750 }
2751 GETCHARINCTEST(fc, Feptr);
2752 prop = GET_UCD(fc);
2753 ok = (prop->script == Lpropvalue ||
2754 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0);
2755 if (ok == notmatch)
2756 RRETURN(MATCH_NOMATCH);
2757 }
2758 break;
2759
2760 case PT_ALNUM:
2761 for (i = 1; i <= Lmin; i++)
2762 {
2763 int category;
2764 if (Feptr >= mb->end_subject)
2765 {
2766 SCHECK_PARTIAL();
2767 RRETURN(MATCH_NOMATCH);
2768 }
2769 GETCHARINCTEST(fc, Feptr);
2770 category = UCD_CATEGORY(fc);
2771 if ((category == ucp_L || category == ucp_N) == notmatch)
2772 RRETURN(MATCH_NOMATCH);
2773 }
2774 break;
2775
2776 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2777 which means that Perl space and POSIX space are now identical. PCRE
2778 was changed at release 8.34. */
2779
2780 case PT_SPACE: /* Perl space */
2781 case PT_PXSPACE: /* POSIX space */
2782 for (i = 1; i <= Lmin; i++)
2783 {
2784 if (Feptr >= mb->end_subject)
2785 {
2786 SCHECK_PARTIAL();
2787 RRETURN(MATCH_NOMATCH);
2788 }
2789 GETCHARINCTEST(fc, Feptr);
2790 switch(fc)
2791 {
2792 HSPACE_CASES:
2793 VSPACE_CASES:
2794 if (notmatch) RRETURN(MATCH_NOMATCH);
2795 break;
2796
2797 default:
2798 if ((UCD_CATEGORY(fc) == ucp_Z) == notmatch)
2799 RRETURN(MATCH_NOMATCH);
2800 break;
2801 }
2802 }
2803 break;
2804
2805 case PT_WORD:
2806 for (i = 1; i <= Lmin; i++)
2807 {
2808 int category;
2809 if (Feptr >= mb->end_subject)
2810 {
2811 SCHECK_PARTIAL();
2812 RRETURN(MATCH_NOMATCH);
2813 }
2814 GETCHARINCTEST(fc, Feptr);
2815 category = UCD_CATEGORY(fc);
2816 if ((category == ucp_L || category == ucp_N ||
2817 fc == CHAR_UNDERSCORE) == notmatch)
2818 RRETURN(MATCH_NOMATCH);
2819 }
2820 break;
2821
2822 case PT_CLIST:
2823 for (i = 1; i <= Lmin; i++)
2824 {
2825 const uint32_t *cp;
2826 if (Feptr >= mb->end_subject)
2827 {
2828 SCHECK_PARTIAL();
2829 RRETURN(MATCH_NOMATCH);
2830 }
2831 GETCHARINCTEST(fc, Feptr);
2832 cp = PRIV(ucd_caseless_sets) + Lpropvalue;
2833 for (;;)
2834 {
2835 if (fc < *cp)
2836 {
2837 if (notmatch) break;
2838 RRETURN(MATCH_NOMATCH);
2839 }
2840 if (fc == *cp++)
2841 {
2842 if (notmatch) RRETURN(MATCH_NOMATCH);
2843 break;
2844 }
2845 }
2846 }
2847 break;
2848
2849 case PT_UCNC:
2850 for (i = 1; i <= Lmin; i++)
2851 {
2852 if (Feptr >= mb->end_subject)
2853 {
2854 SCHECK_PARTIAL();
2855 RRETURN(MATCH_NOMATCH);
2856 }
2857 GETCHARINCTEST(fc, Feptr);
2858 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
2859 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
2860 fc >= 0xe000) == notmatch)
2861 RRETURN(MATCH_NOMATCH);
2862 }
2863 break;
2864
2865 case PT_BIDICL:
2866 for (i = 1; i <= Lmin; i++)
2867 {
2868 if (Feptr >= mb->end_subject)
2869 {
2870 SCHECK_PARTIAL();
2871 RRETURN(MATCH_NOMATCH);
2872 }
2873 GETCHARINCTEST(fc, Feptr);
2874 if ((UCD_BIDICLASS(fc) == Lpropvalue) == notmatch)
2875 RRETURN(MATCH_NOMATCH);
2876 }
2877 break;
2878
2879 case PT_BOOL:
2880 for (i = 1; i <= Lmin; i++)
2881 {
2882 BOOL ok;
2883 const ucd_record *prop;
2884 if (Feptr >= mb->end_subject)
2885 {
2886 SCHECK_PARTIAL();
2887 RRETURN(MATCH_NOMATCH);
2888 }
2889 GETCHARINCTEST(fc, Feptr);
2890 prop = GET_UCD(fc);
2891 ok = MAPBIT(PRIV(ucd_boolprop_sets) +
2892 UCD_BPROPS_PROP(prop), Lpropvalue) != 0;
2893 if (ok == notmatch)
2894 RRETURN(MATCH_NOMATCH);
2895 }
2896 break;
2897
2898 /* This should not occur */
2899
2900 default:
2901 return PCRE2_ERROR_INTERNAL;
2902 }
2903 }
2904
2905 /* Match extended Unicode sequences. We will get here only if the
2906 support is in the binary; otherwise a compile-time error occurs. */
2907
2908 else if (Lctype == OP_EXTUNI)
2909 {
2910 for (i = 1; i <= Lmin; i++)
2911 {
2912 if (Feptr >= mb->end_subject)
2913 {
2914 SCHECK_PARTIAL();
2915 RRETURN(MATCH_NOMATCH);
2916 }
2917 else
2918 {
2919 GETCHARINCTEST(fc, Feptr);
2920 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject,
2921 mb->end_subject, utf, NULL);
2922 }
2923 CHECK_PARTIAL();
2924 }
2925 }
2926 else
2927#endif /* SUPPORT_UNICODE */
2928
2929/* Handle all other cases in UTF mode */
2930
2931#ifdef SUPPORT_UNICODE
2932 if (utf) switch(Lctype)
2933 {
2934 case OP_ANY:
2935 for (i = 1; i <= Lmin; i++)
2936 {
2937 if (Feptr >= mb->end_subject)
2938 {
2939 SCHECK_PARTIAL();
2940 RRETURN(MATCH_NOMATCH);
2941 }
2942 if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
2943 if (mb->partial != 0 &&
2944 Feptr + 1 >= mb->end_subject &&
2945 NLBLOCK->nltype == NLTYPE_FIXED &&
2946 NLBLOCK->nllen == 2 &&
2947 UCHAR21(Feptr) == NLBLOCK->nl[0])
2948 {
2949 mb->hitend = TRUE;
2950 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
2951 }
2952 Feptr++;
2953 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
2954 }
2955 break;
2956
2957 case OP_ALLANY:
2958 for (i = 1; i <= Lmin; i++)
2959 {
2960 if (Feptr >= mb->end_subject)
2961 {
2962 SCHECK_PARTIAL();
2963 RRETURN(MATCH_NOMATCH);
2964 }
2965 Feptr++;
2966 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
2967 }
2968 break;
2969
2970 case OP_ANYBYTE:
2971 if (Feptr > mb->end_subject - Lmin) RRETURN(MATCH_NOMATCH);
2972 Feptr += Lmin;
2973 break;
2974
2975 case OP_ANYNL:
2976 for (i = 1; i <= Lmin; i++)
2977 {
2978 if (Feptr >= mb->end_subject)
2979 {
2980 SCHECK_PARTIAL();
2981 RRETURN(MATCH_NOMATCH);
2982 }
2983 GETCHARINC(fc, Feptr);
2984 switch(fc)
2985 {
2986 default: RRETURN(MATCH_NOMATCH);
2987
2988 case CHAR_CR:
2989 if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++;
2990 break;
2991
2992 case CHAR_LF:
2993 break;
2994
2995 case CHAR_VT:
2996 case CHAR_FF:
2997 case CHAR_NEL:
2998#ifndef EBCDIC
2999 case 0x2028:
3000 case 0x2029:
3001#endif /* Not EBCDIC */
3002 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
3003 break;
3004 }
3005 }
3006 break;
3007
3008 case OP_NOT_HSPACE:
3009 for (i = 1; i <= Lmin; i++)
3010 {
3011 if (Feptr >= mb->end_subject)
3012 {
3013 SCHECK_PARTIAL();
3014 RRETURN(MATCH_NOMATCH);
3015 }
3016 GETCHARINC(fc, Feptr);
3017 switch(fc)
3018 {
3019 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
3020 default: break;
3021 }
3022 }
3023 break;
3024
3025 case OP_HSPACE:
3026 for (i = 1; i <= Lmin; i++)
3027 {
3028 if (Feptr >= mb->end_subject)
3029 {
3030 SCHECK_PARTIAL();
3031 RRETURN(MATCH_NOMATCH);
3032 }
3033 GETCHARINC(fc, Feptr);
3034 switch(fc)
3035 {
3036 HSPACE_CASES: break;
3037 default: RRETURN(MATCH_NOMATCH);
3038 }
3039 }
3040 break;
3041
3042 case OP_NOT_VSPACE:
3043 for (i = 1; i <= Lmin; i++)
3044 {
3045 if (Feptr >= mb->end_subject)
3046 {
3047 SCHECK_PARTIAL();
3048 RRETURN(MATCH_NOMATCH);
3049 }
3050 GETCHARINC(fc, Feptr);
3051 switch(fc)
3052 {
3053 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
3054 default: break;
3055 }
3056 }
3057 break;
3058
3059 case OP_VSPACE:
3060 for (i = 1; i <= Lmin; i++)
3061 {
3062 if (Feptr >= mb->end_subject)
3063 {
3064 SCHECK_PARTIAL();
3065 RRETURN(MATCH_NOMATCH);
3066 }
3067 GETCHARINC(fc, Feptr);
3068 switch(fc)
3069 {
3070 VSPACE_CASES: break;
3071 default: RRETURN(MATCH_NOMATCH);
3072 }
3073 }
3074 break;
3075
3076 case OP_NOT_DIGIT:
3077 for (i = 1; i <= Lmin; i++)
3078 {
3079 if (Feptr >= mb->end_subject)
3080 {
3081 SCHECK_PARTIAL();
3082 RRETURN(MATCH_NOMATCH);
3083 }
3084 GETCHARINC(fc, Feptr);
3085 if (fc < 128 && (mb->ctypes[fc] & ctype_digit) != 0)
3086 RRETURN(MATCH_NOMATCH);
3087 }
3088 break;
3089
3090 case OP_DIGIT:
3091 for (i = 1; i <= Lmin; i++)
3092 {
3093 uint32_t cc;
3094 if (Feptr >= mb->end_subject)
3095 {
3096 SCHECK_PARTIAL();
3097 RRETURN(MATCH_NOMATCH);
3098 }
3099 cc = UCHAR21(Feptr);
3100 if (cc >= 128 || (mb->ctypes[cc] & ctype_digit) == 0)
3101 RRETURN(MATCH_NOMATCH);
3102 Feptr++;
3103 /* No need to skip more code units - we know it has only one. */
3104 }
3105 break;
3106
3107 case OP_NOT_WHITESPACE:
3108 for (i = 1; i <= Lmin; i++)
3109 {
3110 uint32_t cc;
3111 if (Feptr >= mb->end_subject)
3112 {
3113 SCHECK_PARTIAL();
3114 RRETURN(MATCH_NOMATCH);
3115 }
3116 cc = UCHAR21(Feptr);
3117 if (cc < 128 && (mb->ctypes[cc] & ctype_space) != 0)
3118 RRETURN(MATCH_NOMATCH);
3119 Feptr++;
3120 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
3121 }
3122 break;
3123
3124 case OP_WHITESPACE:
3125 for (i = 1; i <= Lmin; i++)
3126 {
3127 uint32_t cc;
3128 if (Feptr >= mb->end_subject)
3129 {
3130 SCHECK_PARTIAL();
3131 RRETURN(MATCH_NOMATCH);
3132 }
3133 cc = UCHAR21(Feptr);
3134 if (cc >= 128 || (mb->ctypes[cc] & ctype_space) == 0)
3135 RRETURN(MATCH_NOMATCH);
3136 Feptr++;
3137 /* No need to skip more code units - we know it has only one. */
3138 }
3139 break;
3140
3141 case OP_NOT_WORDCHAR:
3142 for (i = 1; i <= Lmin; i++)
3143 {
3144 uint32_t cc;
3145 if (Feptr >= mb->end_subject)
3146 {
3147 SCHECK_PARTIAL();
3148 RRETURN(MATCH_NOMATCH);
3149 }
3150 cc = UCHAR21(Feptr);
3151 if (cc < 128 && (mb->ctypes[cc] & ctype_word) != 0)
3152 RRETURN(MATCH_NOMATCH);
3153 Feptr++;
3154 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
3155 }
3156 break;
3157
3158 case OP_WORDCHAR:
3159 for (i = 1; i <= Lmin; i++)
3160 {
3161 uint32_t cc;
3162 if (Feptr >= mb->end_subject)
3163 {
3164 SCHECK_PARTIAL();
3165 RRETURN(MATCH_NOMATCH);
3166 }
3167 cc = UCHAR21(Feptr);
3168 if (cc >= 128 || (mb->ctypes[cc] & ctype_word) == 0)
3169 RRETURN(MATCH_NOMATCH);
3170 Feptr++;
3171 /* No need to skip more code units - we know it has only one. */
3172 }
3173 break;
3174
3175 default:
3176 return PCRE2_ERROR_INTERNAL;
3177 } /* End switch(Lctype) */
3178
3179 else
3180#endif /* SUPPORT_UNICODE */
3181
3182 /* Code for the non-UTF case for minimum matching of operators other
3183 than OP_PROP and OP_NOTPROP. */
3184
3185 switch(Lctype)
3186 {
3187 case OP_ANY:
3188 for (i = 1; i <= Lmin; i++)
3189 {
3190 if (Feptr >= mb->end_subject)
3191 {
3192 SCHECK_PARTIAL();
3193 RRETURN(MATCH_NOMATCH);
3194 }
3195 if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
3196 if (mb->partial != 0 &&
3197 Feptr + 1 >= mb->end_subject &&
3198 NLBLOCK->nltype == NLTYPE_FIXED &&
3199 NLBLOCK->nllen == 2 &&
3200 *Feptr == NLBLOCK->nl[0])
3201 {
3202 mb->hitend = TRUE;
3203 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3204 }
3205 Feptr++;
3206 }
3207 break;
3208
3209 case OP_ALLANY:
3210 if (Feptr > mb->end_subject - Lmin)
3211 {
3212 SCHECK_PARTIAL();
3213 RRETURN(MATCH_NOMATCH);
3214 }
3215 Feptr += Lmin;
3216 break;
3217
/* This OP_ANYBYTE case will never be reached because \C gets turned
into OP_ALLANY in non-UTF mode. Cut out the code so that coverage
reports don't complain about its never being used. */
3221
3222/* case OP_ANYBYTE:
3223* if (Feptr > mb->end_subject - Lmin)
3224* {
3225* SCHECK_PARTIAL();
3226* RRETURN(MATCH_NOMATCH);
3227* }
3228* Feptr += Lmin;
3229* break;
3230*/
3231 case OP_ANYNL:
3232 for (i = 1; i <= Lmin; i++)
3233 {
3234 if (Feptr >= mb->end_subject)
3235 {
3236 SCHECK_PARTIAL();
3237 RRETURN(MATCH_NOMATCH);
3238 }
3239 switch(*Feptr++)
3240 {
3241 default: RRETURN(MATCH_NOMATCH);
3242
3243 case CHAR_CR:
3244 if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++;
3245 break;
3246
3247 case CHAR_LF:
3248 break;
3249
3250 case CHAR_VT:
3251 case CHAR_FF:
3252 case CHAR_NEL:
3253#if PCRE2_CODE_UNIT_WIDTH != 8
3254 case 0x2028:
3255 case 0x2029:
3256#endif
3257 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
3258 break;
3259 }
3260 }
3261 break;
3262
3263 case OP_NOT_HSPACE:
3264 for (i = 1; i <= Lmin; i++)
3265 {
3266 if (Feptr >= mb->end_subject)
3267 {
3268 SCHECK_PARTIAL();
3269 RRETURN(MATCH_NOMATCH);
3270 }
3271 switch(*Feptr++)
3272 {
3273 default: break;
3274 HSPACE_BYTE_CASES:
3275#if PCRE2_CODE_UNIT_WIDTH != 8
3276 HSPACE_MULTIBYTE_CASES:
3277#endif
3278 RRETURN(MATCH_NOMATCH);
3279 }
3280 }
3281 break;
3282
3283 case OP_HSPACE:
3284 for (i = 1; i <= Lmin; i++)
3285 {
3286 if (Feptr >= mb->end_subject)
3287 {
3288 SCHECK_PARTIAL();
3289 RRETURN(MATCH_NOMATCH);
3290 }
3291 switch(*Feptr++)
3292 {
3293 default: RRETURN(MATCH_NOMATCH);
3294 HSPACE_BYTE_CASES:
3295#if PCRE2_CODE_UNIT_WIDTH != 8
3296 HSPACE_MULTIBYTE_CASES:
3297#endif
3298 break;
3299 }
3300 }
3301 break;
3302
3303 case OP_NOT_VSPACE:
3304 for (i = 1; i <= Lmin; i++)
3305 {
3306 if (Feptr >= mb->end_subject)
3307 {
3308 SCHECK_PARTIAL();
3309 RRETURN(MATCH_NOMATCH);
3310 }
3311 switch(*Feptr++)
3312 {
3313 VSPACE_BYTE_CASES:
3314#if PCRE2_CODE_UNIT_WIDTH != 8
3315 VSPACE_MULTIBYTE_CASES:
3316#endif
3317 RRETURN(MATCH_NOMATCH);
3318 default: break;
3319 }
3320 }
3321 break;
3322
3323 case OP_VSPACE:
3324 for (i = 1; i <= Lmin; i++)
3325 {
3326 if (Feptr >= mb->end_subject)
3327 {
3328 SCHECK_PARTIAL();
3329 RRETURN(MATCH_NOMATCH);
3330 }
3331 switch(*Feptr++)
3332 {
3333 default: RRETURN(MATCH_NOMATCH);
3334 VSPACE_BYTE_CASES:
3335#if PCRE2_CODE_UNIT_WIDTH != 8
3336 VSPACE_MULTIBYTE_CASES:
3337#endif
3338 break;
3339 }
3340 }
3341 break;
3342
3343 case OP_NOT_DIGIT:
3344 for (i = 1; i <= Lmin; i++)
3345 {
3346 if (Feptr >= mb->end_subject)
3347 {
3348 SCHECK_PARTIAL();
3349 RRETURN(MATCH_NOMATCH);
3350 }
3351 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0)
3352 RRETURN(MATCH_NOMATCH);
3353 Feptr++;
3354 }
3355 break;
3356
3357 case OP_DIGIT:
3358 for (i = 1; i <= Lmin; i++)
3359 {
3360 if (Feptr >= mb->end_subject)
3361 {
3362 SCHECK_PARTIAL();
3363 RRETURN(MATCH_NOMATCH);
3364 }
3365 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0)
3366 RRETURN(MATCH_NOMATCH);
3367 Feptr++;
3368 }
3369 break;
3370
3371 case OP_NOT_WHITESPACE:
3372 for (i = 1; i <= Lmin; i++)
3373 {
3374 if (Feptr >= mb->end_subject)
3375 {
3376 SCHECK_PARTIAL();
3377 RRETURN(MATCH_NOMATCH);
3378 }
3379 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0)
3380 RRETURN(MATCH_NOMATCH);
3381 Feptr++;
3382 }
3383 break;
3384
3385 case OP_WHITESPACE:
3386 for (i = 1; i <= Lmin; i++)
3387 {
3388 if (Feptr >= mb->end_subject)
3389 {
3390 SCHECK_PARTIAL();
3391 RRETURN(MATCH_NOMATCH);
3392 }
3393 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0)
3394 RRETURN(MATCH_NOMATCH);
3395 Feptr++;
3396 }
3397 break;
3398
3399 case OP_NOT_WORDCHAR:
3400 for (i = 1; i <= Lmin; i++)
3401 {
3402 if (Feptr >= mb->end_subject)
3403 {
3404 SCHECK_PARTIAL();
3405 RRETURN(MATCH_NOMATCH);
3406 }
3407 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0)
3408 RRETURN(MATCH_NOMATCH);
3409 Feptr++;
3410 }
3411 break;
3412
3413 case OP_WORDCHAR:
3414 for (i = 1; i <= Lmin; i++)
3415 {
3416 if (Feptr >= mb->end_subject)
3417 {
3418 SCHECK_PARTIAL();
3419 RRETURN(MATCH_NOMATCH);
3420 }
3421 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0)
3422 RRETURN(MATCH_NOMATCH);
3423 Feptr++;
3424 }
3425 break;
3426
3427 default:
3428 return PCRE2_ERROR_INTERNAL;
3429 }
3430 }
3431
3432 /* If Lmin = Lmax we are done. Continue with the main loop. */
3433
3434 if (Lmin == Lmax) continue;
3435
3436 /* If minimizing, we have to test the rest of the pattern before each
3437 subsequent match. This means we cannot use a local "notmatch" variable as
3438 in the other cases. As all 4 temporary 32-bit values in the frame are
3439 already in use, just test the type each time. */
3440
3441 if (reptype == REPTYPE_MIN)
3442 {
3443#ifdef SUPPORT_UNICODE
3444 if (proptype >= 0)
3445 {
3446 switch(proptype)
3447 {
3448 case PT_ANY:
3449 for (;;)
3450 {
3451 RMATCH(Fecode, RM208);
3452 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3453 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3454 if (Feptr >= mb->end_subject)
3455 {
3456 SCHECK_PARTIAL();
3457 RRETURN(MATCH_NOMATCH);
3458 }
3459 GETCHARINCTEST(fc, Feptr);
3460 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3461 }
3462 /* Control never gets here */
3463
3464 case PT_LAMP:
3465 for (;;)
3466 {
3467 int chartype;
3468 RMATCH(Fecode, RM209);
3469 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3470 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3471 if (Feptr >= mb->end_subject)
3472 {
3473 SCHECK_PARTIAL();
3474 RRETURN(MATCH_NOMATCH);
3475 }
3476 GETCHARINCTEST(fc, Feptr);
3477 chartype = UCD_CHARTYPE(fc);
3478 if ((chartype == ucp_Lu ||
3479 chartype == ucp_Ll ||
3480 chartype == ucp_Lt) == (Lctype == OP_NOTPROP))
3481 RRETURN(MATCH_NOMATCH);
3482 }
3483 /* Control never gets here */
3484
3485 case PT_GC:
3486 for (;;)
3487 {
3488 RMATCH(Fecode, RM210);
3489 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3490 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3491 if (Feptr >= mb->end_subject)
3492 {
3493 SCHECK_PARTIAL();
3494 RRETURN(MATCH_NOMATCH);
3495 }
3496 GETCHARINCTEST(fc, Feptr);
3497 if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3498 RRETURN(MATCH_NOMATCH);
3499 }
3500 /* Control never gets here */
3501
3502 case PT_PC:
3503 for (;;)
3504 {
3505 RMATCH(Fecode, RM211);
3506 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3507 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3508 if (Feptr >= mb->end_subject)
3509 {
3510 SCHECK_PARTIAL();
3511 RRETURN(MATCH_NOMATCH);
3512 }
3513 GETCHARINCTEST(fc, Feptr);
3514 if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3515 RRETURN(MATCH_NOMATCH);
3516 }
3517 /* Control never gets here */
3518
3519 case PT_SC:
3520 for (;;)
3521 {
3522 RMATCH(Fecode, RM212);
3523 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3524 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3525 if (Feptr >= mb->end_subject)
3526 {
3527 SCHECK_PARTIAL();
3528 RRETURN(MATCH_NOMATCH);
3529 }
3530 GETCHARINCTEST(fc, Feptr);
3531 if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3532 RRETURN(MATCH_NOMATCH);
3533 }
3534 /* Control never gets here */
3535
3536 case PT_SCX:
3537 for (;;)
3538 {
3539 BOOL ok;
3540 const ucd_record *prop;
3541 RMATCH(Fecode, RM225);
3542 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3543 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3544 if (Feptr >= mb->end_subject)
3545 {
3546 SCHECK_PARTIAL();
3547 RRETURN(MATCH_NOMATCH);
3548 }
3549 GETCHARINCTEST(fc, Feptr);
3550 prop = GET_UCD(fc);
3551 ok = (prop->script == Lpropvalue
3552 || MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0);
3553 if (ok == (Lctype == OP_NOTPROP))
3554 RRETURN(MATCH_NOMATCH);
3555 }
3556 /* Control never gets here */
3557
3558 case PT_ALNUM:
3559 for (;;)
3560 {
3561 int category;
3562 RMATCH(Fecode, RM213);
3563 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3564 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3565 if (Feptr >= mb->end_subject)
3566 {
3567 SCHECK_PARTIAL();
3568 RRETURN(MATCH_NOMATCH);
3569 }
3570 GETCHARINCTEST(fc, Feptr);
3571 category = UCD_CATEGORY(fc);
3572 if ((category == ucp_L || category == ucp_N) == (Lctype == OP_NOTPROP))
3573 RRETURN(MATCH_NOMATCH);
3574 }
3575 /* Control never gets here */
3576
3577 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
3578 which means that Perl space and POSIX space are now identical. PCRE
3579 was changed at release 8.34. */
3580
3581 case PT_SPACE: /* Perl space */
3582 case PT_PXSPACE: /* POSIX space */
3583 for (;;)
3584 {
3585 RMATCH(Fecode, RM214);
3586 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3587 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3588 if (Feptr >= mb->end_subject)
3589 {
3590 SCHECK_PARTIAL();
3591 RRETURN(MATCH_NOMATCH);
3592 }
3593 GETCHARINCTEST(fc, Feptr);
3594 switch(fc)
3595 {
3596 HSPACE_CASES:
3597 VSPACE_CASES:
3598 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3599 break;
3600
3601 default:
3602 if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP))
3603 RRETURN(MATCH_NOMATCH);
3604 break;
3605 }
3606 }
3607 /* Control never gets here */
3608
3609 case PT_WORD:
3610 for (;;)
3611 {
3612 int category;
3613 RMATCH(Fecode, RM215);
3614 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3615 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3616 if (Feptr >= mb->end_subject)
3617 {
3618 SCHECK_PARTIAL();
3619 RRETURN(MATCH_NOMATCH);
3620 }
3621 GETCHARINCTEST(fc, Feptr);
3622 category = UCD_CATEGORY(fc);
3623 if ((category == ucp_L ||
3624 category == ucp_N ||
3625 fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP))
3626 RRETURN(MATCH_NOMATCH);
3627 }
3628 /* Control never gets here */
3629
3630 case PT_CLIST:
3631 for (;;)
3632 {
3633 const uint32_t *cp;
3634 RMATCH(Fecode, RM216);
3635 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3636 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3637 if (Feptr >= mb->end_subject)
3638 {
3639 SCHECK_PARTIAL();
3640 RRETURN(MATCH_NOMATCH);
3641 }
3642 GETCHARINCTEST(fc, Feptr);
3643 cp = PRIV(ucd_caseless_sets) + Lpropvalue;
3644 for (;;)
3645 {
3646 if (fc < *cp)
3647 {
3648 if (Lctype == OP_NOTPROP) break;
3649 RRETURN(MATCH_NOMATCH);
3650 }
3651 if (fc == *cp++)
3652 {
3653 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3654 break;
3655 }
3656 }
3657 }
3658 /* Control never gets here */
3659
3660 case PT_UCNC:
3661 for (;;)
3662 {
3663 RMATCH(Fecode, RM217);
3664 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3665 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3666 if (Feptr >= mb->end_subject)
3667 {
3668 SCHECK_PARTIAL();
3669 RRETURN(MATCH_NOMATCH);
3670 }
3671 GETCHARINCTEST(fc, Feptr);
3672 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
3673 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
3674 fc >= 0xe000) == (Lctype == OP_NOTPROP))
3675 RRETURN(MATCH_NOMATCH);
3676 }
3677 /* Control never gets here */
3678
3679 case PT_BIDICL:
3680 for (;;)
3681 {
3682 RMATCH(Fecode, RM224);
3683 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3684 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3685 if (Feptr >= mb->end_subject)
3686 {
3687 SCHECK_PARTIAL();
3688 RRETURN(MATCH_NOMATCH);
3689 }
3690 GETCHARINCTEST(fc, Feptr);
3691 if ((UCD_BIDICLASS(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3692 RRETURN(MATCH_NOMATCH);
3693 }
3694 /* Control never gets here */
3695
3696 case PT_BOOL:
3697 for (;;)
3698 {
3699 BOOL ok;
3700 const ucd_record *prop;
3701 RMATCH(Fecode, RM223);
3702 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3703 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3704 if (Feptr >= mb->end_subject)
3705 {
3706 SCHECK_PARTIAL();
3707 RRETURN(MATCH_NOMATCH);
3708 }
3709 GETCHARINCTEST(fc, Feptr);
3710 prop = GET_UCD(fc);
3711 ok = MAPBIT(PRIV(ucd_boolprop_sets) +
3712 UCD_BPROPS_PROP(prop), Lpropvalue) != 0;
3713 if (ok == (Lctype == OP_NOTPROP))
3714 RRETURN(MATCH_NOMATCH);
3715 }
3716 /* Control never gets here */
3717
3718 /* This should never occur */
3719 default:
3720 return PCRE2_ERROR_INTERNAL;
3721 }
3722 }
3723
3724 /* Match extended Unicode grapheme clusters. We will get here only if the
3725 support is in the binary; otherwise a compile-time error occurs. */
3726
3727 else if (Lctype == OP_EXTUNI)
3728 {
3729 for (;;)
3730 {
3731 RMATCH(Fecode, RM218);
3732 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3733 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3734 if (Feptr >= mb->end_subject)
3735 {
3736 SCHECK_PARTIAL();
3737 RRETURN(MATCH_NOMATCH);
3738 }
3739 else
3740 {
3741 GETCHARINCTEST(fc, Feptr);
3742 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,
3743 utf, NULL);
3744 }
3745 CHECK_PARTIAL();
3746 }
3747 }
3748 else
3749#endif /* SUPPORT_UNICODE */
3750
3751 /* UTF mode for non-property testing character types. */
3752
3753#ifdef SUPPORT_UNICODE
3754 if (utf)
3755 {
3756 for (;;)
3757 {
3758 RMATCH(Fecode, RM219);
3759 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3760 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3761 if (Feptr >= mb->end_subject)
3762 {
3763 SCHECK_PARTIAL();
3764 RRETURN(MATCH_NOMATCH);
3765 }
3766 if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
3767 GETCHARINC(fc, Feptr);
3768 switch(Lctype)
3769 {
3770 case OP_ANY: /* This is the non-NL case */
3771 if (mb->partial != 0 && /* Take care with CRLF partial */
3772 Feptr >= mb->end_subject &&
3773 NLBLOCK->nltype == NLTYPE_FIXED &&
3774 NLBLOCK->nllen == 2 &&
3775 fc == NLBLOCK->nl[0])
3776 {
3777 mb->hitend = TRUE;
3778 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3779 }
3780 break;
3781
3782 case OP_ALLANY:
3783 case OP_ANYBYTE:
3784 break;
3785
3786 case OP_ANYNL:
3787 switch(fc)
3788 {
3789 default: RRETURN(MATCH_NOMATCH);
3790
3791 case CHAR_CR:
3792 if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++;
3793 break;
3794
3795 case CHAR_LF:
3796 break;
3797
3798 case CHAR_VT:
3799 case CHAR_FF:
3800 case CHAR_NEL:
3801#ifndef EBCDIC
3802 case 0x2028:
3803 case 0x2029:
3804#endif /* Not EBCDIC */
3805 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF)
3806 RRETURN(MATCH_NOMATCH);
3807 break;
3808 }
3809 break;
3810
3811 case OP_NOT_HSPACE:
3812 switch(fc)
3813 {
3814 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
3815 default: break;
3816 }
3817 break;
3818
3819 case OP_HSPACE:
3820 switch(fc)
3821 {
3822 HSPACE_CASES: break;
3823 default: RRETURN(MATCH_NOMATCH);
3824 }
3825 break;
3826
3827 case OP_NOT_VSPACE:
3828 switch(fc)
3829 {
3830 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
3831 default: break;
3832 }
3833 break;
3834
3835 case OP_VSPACE:
3836 switch(fc)
3837 {
3838 VSPACE_CASES: break;
3839 default: RRETURN(MATCH_NOMATCH);
3840 }
3841 break;
3842
3843 case OP_NOT_DIGIT:
3844 if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0)
3845 RRETURN(MATCH_NOMATCH);
3846 break;
3847
3848 case OP_DIGIT:
3849 if (fc >= 256 || (mb->ctypes[fc] & ctype_digit) == 0)
3850 RRETURN(MATCH_NOMATCH);
3851 break;
3852
3853 case OP_NOT_WHITESPACE:
3854 if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0)
3855 RRETURN(MATCH_NOMATCH);
3856 break;
3857
3858 case OP_WHITESPACE:
3859 if (fc >= 256 || (mb->ctypes[fc] & ctype_space) == 0)
3860 RRETURN(MATCH_NOMATCH);
3861 break;
3862
3863 case OP_NOT_WORDCHAR:
3864 if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0)
3865 RRETURN(MATCH_NOMATCH);
3866 break;
3867
3868 case OP_WORDCHAR:
3869 if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0)
3870 RRETURN(MATCH_NOMATCH);
3871 break;
3872
3873 default:
3874 return PCRE2_ERROR_INTERNAL;
3875 }
3876 }
3877 }
3878 else
3879#endif /* SUPPORT_UNICODE */
3880
3881 /* Not UTF mode */
3882 {
3883 for (;;)
3884 {
3885 RMATCH(Fecode, RM33);
3886 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3887 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3888 if (Feptr >= mb->end_subject)
3889 {
3890 SCHECK_PARTIAL();
3891 RRETURN(MATCH_NOMATCH);
3892 }
3893 if (Lctype == OP_ANY && IS_NEWLINE(Feptr))
3894 RRETURN(MATCH_NOMATCH);
3895 fc = *Feptr++;
3896 switch(Lctype)
3897 {
3898 case OP_ANY: /* This is the non-NL case */
3899 if (mb->partial != 0 && /* Take care with CRLF partial */
3900 Feptr >= mb->end_subject &&
3901 NLBLOCK->nltype == NLTYPE_FIXED &&
3902 NLBLOCK->nllen == 2 &&
3903 fc == NLBLOCK->nl[0])
3904 {
3905 mb->hitend = TRUE;
3906 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3907 }
3908 break;
3909
3910 case OP_ALLANY:
3911 case OP_ANYBYTE:
3912 break;
3913
3914 case OP_ANYNL:
3915 switch(fc)
3916 {
3917 default: RRETURN(MATCH_NOMATCH);
3918
3919 case CHAR_CR:
3920 if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++;
3921 break;
3922
3923 case CHAR_LF:
3924 break;
3925
3926 case CHAR_VT:
3927 case CHAR_FF:
3928 case CHAR_NEL:
3929#if PCRE2_CODE_UNIT_WIDTH != 8
3930 case 0x2028:
3931 case 0x2029:
3932#endif
3933 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF)
3934 RRETURN(MATCH_NOMATCH);
3935 break;
3936 }
3937 break;
3938
3939 case OP_NOT_HSPACE:
3940 switch(fc)
3941 {
3942 default: break;
3943 HSPACE_BYTE_CASES:
3944#if PCRE2_CODE_UNIT_WIDTH != 8
3945 HSPACE_MULTIBYTE_CASES:
3946#endif
3947 RRETURN(MATCH_NOMATCH);
3948 }
3949 break;
3950
3951 case OP_HSPACE:
3952 switch(fc)
3953 {
3954 default: RRETURN(MATCH_NOMATCH);
3955 HSPACE_BYTE_CASES:
3956#if PCRE2_CODE_UNIT_WIDTH != 8
3957 HSPACE_MULTIBYTE_CASES:
3958#endif
3959 break;
3960 }
3961 break;
3962
3963 case OP_NOT_VSPACE:
3964 switch(fc)
3965 {
3966 default: break;
3967 VSPACE_BYTE_CASES:
3968#if PCRE2_CODE_UNIT_WIDTH != 8
3969 VSPACE_MULTIBYTE_CASES:
3970#endif
3971 RRETURN(MATCH_NOMATCH);
3972 }
3973 break;
3974
3975 case OP_VSPACE:
3976 switch(fc)
3977 {
3978 default: RRETURN(MATCH_NOMATCH);
3979 VSPACE_BYTE_CASES:
3980#if PCRE2_CODE_UNIT_WIDTH != 8
3981 VSPACE_MULTIBYTE_CASES:
3982#endif
3983 break;
3984 }
3985 break;
3986
3987 case OP_NOT_DIGIT:
3988 if (MAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0)
3989 RRETURN(MATCH_NOMATCH);
3990 break;
3991
3992 case OP_DIGIT:
3993 if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0)
3994 RRETURN(MATCH_NOMATCH);
3995 break;
3996
3997 case OP_NOT_WHITESPACE:
3998 if (MAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0)
3999 RRETURN(MATCH_NOMATCH);
4000 break;
4001
4002 case OP_WHITESPACE:
4003 if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0)
4004 RRETURN(MATCH_NOMATCH);
4005 break;
4006
4007 case OP_NOT_WORDCHAR:
4008 if (MAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0)
4009 RRETURN(MATCH_NOMATCH);
4010 break;
4011
4012 case OP_WORDCHAR:
4013 if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0)
4014 RRETURN(MATCH_NOMATCH);
4015 break;
4016
4017 default:
4018 return PCRE2_ERROR_INTERNAL;
4019 }
4020 }
4021 }
4022 /* Control never gets here */
4023 }
4024
4025 /* If maximizing, it is worth using inline code for speed, doing the type
4026 test once at the start (i.e. keep it out of the loops). Once again,
4027 "notmatch" can be an ordinary local variable because the loops do not call
4028 RMATCH. */
4029
4030 else
4031 {
4032 Lstart_eptr = Feptr; /* Remember where we started */
4033
4034#ifdef SUPPORT_UNICODE
4035 if (proptype >= 0)
4036 {
4037 BOOL notmatch = Lctype == OP_NOTPROP;
4038 switch(proptype)
4039 {
4040 case PT_ANY:
4041 for (i = Lmin; i < Lmax; i++)
4042 {
4043 int len = 1;
4044 if (Feptr >= mb->end_subject)
4045 {
4046 SCHECK_PARTIAL();
4047 break;
4048 }
4049 GETCHARLENTEST(fc, Feptr, len);
4050 if (notmatch) break;
4051 Feptr+= len;
4052 }
4053 break;
4054
4055 case PT_LAMP:
4056 for (i = Lmin; i < Lmax; i++)
4057 {
4058 int chartype;
4059 int len = 1;
4060 if (Feptr >= mb->end_subject)
4061 {
4062 SCHECK_PARTIAL();
4063 break;
4064 }
4065 GETCHARLENTEST(fc, Feptr, len);
4066 chartype = UCD_CHARTYPE(fc);
4067 if ((chartype == ucp_Lu ||
4068 chartype == ucp_Ll ||
4069 chartype == ucp_Lt) == notmatch)
4070 break;
4071 Feptr+= len;
4072 }
4073 break;
4074
4075 case PT_GC:
4076 for (i = Lmin; i < Lmax; i++)
4077 {
4078 int len = 1;
4079 if (Feptr >= mb->end_subject)
4080 {
4081 SCHECK_PARTIAL();
4082 break;
4083 }
4084 GETCHARLENTEST(fc, Feptr, len);
4085 if ((UCD_CATEGORY(fc) == Lpropvalue) == notmatch) break;
4086 Feptr+= len;
4087 }
4088 break;
4089
4090 case PT_PC:
4091 for (i = Lmin; i < Lmax; i++)
4092 {
4093 int len = 1;
4094 if (Feptr >= mb->end_subject)
4095 {
4096 SCHECK_PARTIAL();
4097 break;
4098 }
4099 GETCHARLENTEST(fc, Feptr, len);
4100 if ((UCD_CHARTYPE(fc) == Lpropvalue) == notmatch) break;
4101 Feptr+= len;
4102 }
4103 break;
4104
4105 case PT_SC:
4106 for (i = Lmin; i < Lmax; i++)
4107 {
4108 int len = 1;
4109 if (Feptr >= mb->end_subject)
4110 {
4111 SCHECK_PARTIAL();
4112 break;
4113 }
4114 GETCHARLENTEST(fc, Feptr, len);
4115 if ((UCD_SCRIPT(fc) == Lpropvalue) == notmatch) break;
4116 Feptr+= len;
4117 }
4118 break;
4119
4120 case PT_SCX:
4121 for (i = Lmin; i < Lmax; i++)
4122 {
4123 BOOL ok;
4124 const ucd_record *prop;
4125 int len = 1;
4126 if (Feptr >= mb->end_subject)
4127 {
4128 SCHECK_PARTIAL();
4129 break;
4130 }
4131 GETCHARLENTEST(fc, Feptr, len);
4132 prop = GET_UCD(fc);
4133 ok = (prop->script == Lpropvalue ||
4134 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0);
4135 if (ok == notmatch) break;
4136 Feptr+= len;
4137 }
4138 break;
4139
4140 case PT_ALNUM:
4141 for (i = Lmin; i < Lmax; i++)
4142 {
4143 int category;
4144 int len = 1;
4145 if (Feptr >= mb->end_subject)
4146 {
4147 SCHECK_PARTIAL();
4148 break;
4149 }
4150 GETCHARLENTEST(fc, Feptr, len);
4151 category = UCD_CATEGORY(fc);
4152 if ((category == ucp_L || category == ucp_N) == notmatch)
4153 break;
4154 Feptr+= len;
4155 }
4156 break;
4157
4158 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4159 which means that Perl space and POSIX space are now identical. PCRE
4160 was changed at release 8.34. */
4161
4162 case PT_SPACE: /* Perl space */
4163 case PT_PXSPACE: /* POSIX space */
4164 for (i = Lmin; i < Lmax; i++)
4165 {
4166 int len = 1;
4167 if (Feptr >= mb->end_subject)
4168 {
4169 SCHECK_PARTIAL();
4170 break;
4171 }
4172 GETCHARLENTEST(fc, Feptr, len);
4173 switch(fc)
4174 {
4175 HSPACE_CASES:
4176 VSPACE_CASES:
4177 if (notmatch) goto ENDLOOP99; /* Break the loop */
4178 break;
4179
4180 default:
4181 if ((UCD_CATEGORY(fc) == ucp_Z) == notmatch)
4182 goto ENDLOOP99; /* Break the loop */
4183 break;
4184 }
4185 Feptr+= len;
4186 }
4187 ENDLOOP99:
4188 break;
4189
4190 case PT_WORD:
4191 for (i = Lmin; i < Lmax; i++)
4192 {
4193 int category;
4194 int len = 1;
4195 if (Feptr >= mb->end_subject)
4196 {
4197 SCHECK_PARTIAL();
4198 break;
4199 }
4200 GETCHARLENTEST(fc, Feptr, len);
4201 category = UCD_CATEGORY(fc);
4202 if ((category == ucp_L || category == ucp_N ||
4203 fc == CHAR_UNDERSCORE) == notmatch)
4204 break;
4205 Feptr+= len;
4206 }
4207 break;
4208
4209 case PT_CLIST:
4210 for (i = Lmin; i < Lmax; i++)
4211 {
4212 const uint32_t *cp;
4213 int len = 1;
4214 if (Feptr >= mb->end_subject)
4215 {
4216 SCHECK_PARTIAL();
4217 break;
4218 }
4219 GETCHARLENTEST(fc, Feptr, len);
4220 cp = PRIV(ucd_caseless_sets) + Lpropvalue;
4221 for (;;)
4222 {
4223 if (fc < *cp)
4224 { if (notmatch) break; else goto GOT_MAX; }
4225 if (fc == *cp++)
4226 { if (notmatch) goto GOT_MAX; else break; }
4227 }
4228 Feptr += len;
4229 }
4230 GOT_MAX:
4231 break;
4232
4233 case PT_UCNC:
4234 for (i = Lmin; i < Lmax; i++)
4235 {
4236 int len = 1;
4237 if (Feptr >= mb->end_subject)
4238 {
4239 SCHECK_PARTIAL();
4240 break;
4241 }
4242 GETCHARLENTEST(fc, Feptr, len);
4243 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
4244 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
4245 fc >= 0xe000) == notmatch)
4246 break;
4247 Feptr += len;
4248 }
4249 break;
4250
4251 case PT_BIDICL:
4252 for (i = Lmin; i < Lmax; i++)
4253 {
4254 int len = 1;
4255 if (Feptr >= mb->end_subject)
4256 {
4257 SCHECK_PARTIAL();
4258 break;
4259 }
4260 GETCHARLENTEST(fc, Feptr, len);
4261 if ((UCD_BIDICLASS(fc) == Lpropvalue) == notmatch) break;
4262 Feptr+= len;
4263 }
4264 break;
4265
4266 case PT_BOOL:
4267 for (i = Lmin; i < Lmax; i++)
4268 {
4269 BOOL ok;
4270 const ucd_record *prop;
4271 int len = 1;
4272 if (Feptr >= mb->end_subject)
4273 {
4274 SCHECK_PARTIAL();
4275 break;
4276 }
4277 GETCHARLENTEST(fc, Feptr, len);
4278 prop = GET_UCD(fc);
4279 ok = MAPBIT(PRIV(ucd_boolprop_sets) +
4280 UCD_BPROPS_PROP(prop), Lpropvalue) != 0;
4281 if (ok == notmatch) break;
4282 Feptr+= len;
4283 }
4284 break;
4285
4286 default:
4287 return PCRE2_ERROR_INTERNAL;
4288 }
4289
4290 /* Feptr is now past the end of the maximum run */
4291
4292 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4293
4294 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
4295 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
4296 go too far. */
4297
4298 for(;;)
4299 {
4300 if (Feptr <= Lstart_eptr) break;
4301 RMATCH(Fecode, RM222);
4302 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4303 Feptr--;
4304 if (utf) BACKCHAR(Feptr);
4305 }
4306 }
4307
4308 /* Match extended Unicode grapheme clusters. We will get here only if the
4309 support is in the binary; otherwise a compile-time error occurs. */
4310
4311 else if (Lctype == OP_EXTUNI)
4312 {
4313 for (i = Lmin; i < Lmax; i++)
4314 {
4315 if (Feptr >= mb->end_subject)
4316 {
4317 SCHECK_PARTIAL();
4318 break;
4319 }
4320 else
4321 {
4322 GETCHARINCTEST(fc, Feptr);
4323 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,
4324 utf, NULL);
4325 }
4326 CHECK_PARTIAL();
4327 }
4328
4329 /* Feptr is now past the end of the maximum run */
4330
4331 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4332
4333 /* We use <= Lstart_eptr rather than == Lstart_eptr to detect the start
4334 of the run while backtracking because the use of \C in UTF mode can
4335 cause BACKCHAR to move back past Lstart_eptr. This is just palliative;
4336 the use of \C in UTF mode is fraught with danger. */
4337
4338 for(;;)
4339 {
4340 int lgb, rgb;
4341 PCRE2_SPTR fptr;
4342
4343 if (Feptr <= Lstart_eptr) break; /* At start of char run */
4344 RMATCH(Fecode, RM220);
4345 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4346
4347 /* Backtracking over an extended grapheme cluster involves inspecting
4348 the previous two characters (if present) to see if a break is
4349 permitted between them. */
4350
4351 Feptr--;
4352 if (!utf) fc = *Feptr; else
4353 {
4354 BACKCHAR(Feptr);
4355 GETCHAR(fc, Feptr);
4356 }
4357 rgb = UCD_GRAPHBREAK(fc);
4358
4359 for (;;)
4360 {
4361 if (Feptr <= Lstart_eptr) break; /* At start of char run */
4362 fptr = Feptr - 1;
4363 if (!utf) fc = *fptr; else
4364 {
4365 BACKCHAR(fptr);
4366 GETCHAR(fc, fptr);
4367 }
4368 lgb = UCD_GRAPHBREAK(fc);
4369 if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
4370 Feptr = fptr;
4371 rgb = lgb;
4372 }
4373 }
4374 }
4375
4376 else
4377#endif /* SUPPORT_UNICODE */
4378
4379#ifdef SUPPORT_UNICODE
4380 if (utf)
4381 {
4382 switch(Lctype)
4383 {
4384 case OP_ANY:
4385 for (i = Lmin; i < Lmax; i++)
4386 {
4387 if (Feptr >= mb->end_subject)
4388 {
4389 SCHECK_PARTIAL();
4390 break;
4391 }
4392 if (IS_NEWLINE(Feptr)) break;
4393 if (mb->partial != 0 && /* Take care with CRLF partial */
4394 Feptr + 1 >= mb->end_subject &&
4395 NLBLOCK->nltype == NLTYPE_FIXED &&
4396 NLBLOCK->nllen == 2 &&
4397 UCHAR21(Feptr) == NLBLOCK->nl[0])
4398 {
4399 mb->hitend = TRUE;
4400 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4401 }
4402 Feptr++;
4403 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
4404 }
4405 break;
4406
4407 case OP_ALLANY:
4408 if (Lmax < UINT32_MAX)
4409 {
4410 for (i = Lmin; i < Lmax; i++)
4411 {
4412 if (Feptr >= mb->end_subject)
4413 {
4414 SCHECK_PARTIAL();
4415 break;
4416 }
4417 Feptr++;
4418 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
4419 }
4420 }
4421 else
4422 {
4423 Feptr = mb->end_subject; /* Unlimited UTF-8 repeat */
4424 SCHECK_PARTIAL();
4425 }
4426 break;
4427
4428 /* The "byte" (i.e. "code unit") case is the same as non-UTF */
4429
4430 case OP_ANYBYTE:
4431 fc = Lmax - Lmin;
4432 if (fc > (uint32_t)(mb->end_subject - Feptr))
4433 {
4434 Feptr = mb->end_subject;
4435 SCHECK_PARTIAL();
4436 }
4437 else Feptr += fc;
4438 break;
4439
4440 case OP_ANYNL:
4441 for (i = Lmin; i < Lmax; i++)
4442 {
4443 int len = 1;
4444 if (Feptr >= mb->end_subject)
4445 {
4446 SCHECK_PARTIAL();
4447 break;
4448 }
4449 GETCHARLEN(fc, Feptr, len);
4450 if (fc == CHAR_CR)
4451 {
4452 if (++Feptr >= mb->end_subject) break;
4453 if (UCHAR21(Feptr) == CHAR_LF) Feptr++;
4454 }
4455 else
4456 {
4457 if (fc != CHAR_LF &&
4458 (mb->bsr_convention == PCRE2_BSR_ANYCRLF ||
4459 (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL
4460#ifndef EBCDIC
4461 && fc != 0x2028 && fc != 0x2029
4462#endif /* Not EBCDIC */
4463 )))
4464 break;
4465 Feptr += len;
4466 }
4467 }
4468 break;
4469
4470 case OP_NOT_HSPACE:
4471 case OP_HSPACE:
4472 for (i = Lmin; i < Lmax; i++)
4473 {
4474 BOOL gotspace;
4475 int len = 1;
4476 if (Feptr >= mb->end_subject)
4477 {
4478 SCHECK_PARTIAL();
4479 break;
4480 }
4481 GETCHARLEN(fc, Feptr, len);
4482 switch(fc)
4483 {
4484 HSPACE_CASES: gotspace = TRUE; break;
4485 default: gotspace = FALSE; break;
4486 }
4487 if (gotspace == (Lctype == OP_NOT_HSPACE)) break;
4488 Feptr += len;
4489 }
4490 break;
4491
4492 case OP_NOT_VSPACE:
4493 case OP_VSPACE:
4494 for (i = Lmin; i < Lmax; i++)
4495 {
4496 BOOL gotspace;
4497 int len = 1;
4498 if (Feptr >= mb->end_subject)
4499 {
4500 SCHECK_PARTIAL();
4501 break;
4502 }
4503 GETCHARLEN(fc, Feptr, len);
4504 switch(fc)
4505 {
4506 VSPACE_CASES: gotspace = TRUE; break;
4507 default: gotspace = FALSE; break;
4508 }
4509 if (gotspace == (Lctype == OP_NOT_VSPACE)) break;
4510 Feptr += len;
4511 }
4512 break;
4513
4514 case OP_NOT_DIGIT:
4515 for (i = Lmin; i < Lmax; i++)
4516 {
4517 int len = 1;
4518 if (Feptr >= mb->end_subject)
4519 {
4520 SCHECK_PARTIAL();
4521 break;
4522 }
4523 GETCHARLEN(fc, Feptr, len);
4524 if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) break;
4525 Feptr+= len;
4526 }
4527 break;
4528
4529 case OP_DIGIT:
4530 for (i = Lmin; i < Lmax; i++)
4531 {
4532 int len = 1;
4533 if (Feptr >= mb->end_subject)
4534 {
4535 SCHECK_PARTIAL();
4536 break;
4537 }
4538 GETCHARLEN(fc, Feptr, len);
4539 if (fc >= 256 ||(mb->ctypes[fc] & ctype_digit) == 0) break;
4540 Feptr+= len;
4541 }
4542 break;
4543
4544 case OP_NOT_WHITESPACE:
4545 for (i = Lmin; i < Lmax; i++)
4546 {
4547 int len = 1;
4548 if (Feptr >= mb->end_subject)
4549 {
4550 SCHECK_PARTIAL();
4551 break;
4552 }
4553 GETCHARLEN(fc, Feptr, len);
4554 if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) break;
4555 Feptr+= len;
4556 }
4557 break;
4558
4559 case OP_WHITESPACE:
4560 for (i = Lmin; i < Lmax; i++)
4561 {
4562 int len = 1;
4563 if (Feptr >= mb->end_subject)
4564 {
4565 SCHECK_PARTIAL();
4566 break;
4567 }
4568 GETCHARLEN(fc, Feptr, len);
4569 if (fc >= 256 ||(mb->ctypes[fc] & ctype_space) == 0) break;
4570 Feptr+= len;
4571 }
4572 break;
4573
4574 case OP_NOT_WORDCHAR:
4575 for (i = Lmin; i < Lmax; i++)
4576 {
4577 int len = 1;
4578 if (Feptr >= mb->end_subject)
4579 {
4580 SCHECK_PARTIAL();
4581 break;
4582 }
4583 GETCHARLEN(fc, Feptr, len);
4584 if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) break;
4585 Feptr+= len;
4586 }
4587 break;
4588
4589 case OP_WORDCHAR:
4590 for (i = Lmin; i < Lmax; i++)
4591 {
4592 int len = 1;
4593 if (Feptr >= mb->end_subject)
4594 {
4595 SCHECK_PARTIAL();
4596 break;
4597 }
4598 GETCHARLEN(fc, Feptr, len);
4599 if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) break;
4600 Feptr+= len;
4601 }
4602 break;
4603
4604 default:
4605 return PCRE2_ERROR_INTERNAL;
4606 }
4607
4608 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4609
4610 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
4611 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't go
4612 too far. */
4613
4614 for(;;)
4615 {
4616 if (Feptr <= Lstart_eptr) break;
4617 RMATCH(Fecode, RM221);
4618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4619 Feptr--;
4620 BACKCHAR(Feptr);
4621 if (Lctype == OP_ANYNL && Feptr > Lstart_eptr &&
4622 UCHAR21(Feptr) == CHAR_NL && UCHAR21(Feptr - 1) == CHAR_CR)
4623 Feptr--;
4624 }
4625 }
4626 else
4627#endif /* SUPPORT_UNICODE */
4628
4629 /* Not UTF mode */
4630 {
4631 switch(Lctype)
4632 {
4633 case OP_ANY:
4634 for (i = Lmin; i < Lmax; i++)
4635 {
4636 if (Feptr >= mb->end_subject)
4637 {
4638 SCHECK_PARTIAL();
4639 break;
4640 }
4641 if (IS_NEWLINE(Feptr)) break;
4642 if (mb->partial != 0 && /* Take care with CRLF partial */
4643 Feptr + 1 >= mb->end_subject &&
4644 NLBLOCK->nltype == NLTYPE_FIXED &&
4645 NLBLOCK->nllen == 2 &&
4646 *Feptr == NLBLOCK->nl[0])
4647 {
4648 mb->hitend = TRUE;
4649 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4650 }
4651 Feptr++;
4652 }
4653 break;
4654
4655 case OP_ALLANY:
4656 case OP_ANYBYTE:
4657 fc = Lmax - Lmin;
4658 if (fc > (uint32_t)(mb->end_subject - Feptr))
4659 {
4660 Feptr = mb->end_subject;
4661 SCHECK_PARTIAL();
4662 }
4663 else Feptr += fc;
4664 break;
4665
4666 case OP_ANYNL:
4667 for (i = Lmin; i < Lmax; i++)
4668 {
4669 if (Feptr >= mb->end_subject)
4670 {
4671 SCHECK_PARTIAL();
4672 break;
4673 }
4674 fc = *Feptr;
4675 if (fc == CHAR_CR)
4676 {
4677 if (++Feptr >= mb->end_subject) break;
4678 if (*Feptr == CHAR_LF) Feptr++;
4679 }
4680 else
4681 {
4682 if (fc != CHAR_LF && (mb->bsr_convention == PCRE2_BSR_ANYCRLF ||
4683 (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL
4684#if PCRE2_CODE_UNIT_WIDTH != 8
4685 && fc != 0x2028 && fc != 0x2029
4686#endif
4687 ))) break;
4688 Feptr++;
4689 }
4690 }
4691 break;
4692
4693 case OP_NOT_HSPACE:
4694 for (i = Lmin; i < Lmax; i++)
4695 {
4696 if (Feptr >= mb->end_subject)
4697 {
4698 SCHECK_PARTIAL();
4699 break;
4700 }
4701 switch(*Feptr)
4702 {
4703 default: Feptr++; break;
4704 HSPACE_BYTE_CASES:
4705#if PCRE2_CODE_UNIT_WIDTH != 8
4706 HSPACE_MULTIBYTE_CASES:
4707#endif
4708 goto ENDLOOP00;
4709 }
4710 }
4711 ENDLOOP00:
4712 break;
4713
4714 case OP_HSPACE:
4715 for (i = Lmin; i < Lmax; i++)
4716 {
4717 if (Feptr >= mb->end_subject)
4718 {
4719 SCHECK_PARTIAL();
4720 break;
4721 }
4722 switch(*Feptr)
4723 {
4724 default: goto ENDLOOP01;
4725 HSPACE_BYTE_CASES:
4726#if PCRE2_CODE_UNIT_WIDTH != 8
4727 HSPACE_MULTIBYTE_CASES:
4728#endif
4729 Feptr++; break;
4730 }
4731 }
4732 ENDLOOP01:
4733 break;
4734
4735 case OP_NOT_VSPACE:
4736 for (i = Lmin; i < Lmax; i++)
4737 {
4738 if (Feptr >= mb->end_subject)
4739 {
4740 SCHECK_PARTIAL();
4741 break;
4742 }
4743 switch(*Feptr)
4744 {
4745 default: Feptr++; break;
4746 VSPACE_BYTE_CASES:
4747#if PCRE2_CODE_UNIT_WIDTH != 8
4748 VSPACE_MULTIBYTE_CASES:
4749#endif
4750 goto ENDLOOP02;
4751 }
4752 }
4753 ENDLOOP02:
4754 break;
4755
4756 case OP_VSPACE:
4757 for (i = Lmin; i < Lmax; i++)
4758 {
4759 if (Feptr >= mb->end_subject)
4760 {
4761 SCHECK_PARTIAL();
4762 break;
4763 }
4764 switch(*Feptr)
4765 {
4766 default: goto ENDLOOP03;
4767 VSPACE_BYTE_CASES:
4768#if PCRE2_CODE_UNIT_WIDTH != 8
4769 VSPACE_MULTIBYTE_CASES:
4770#endif
4771 Feptr++; break;
4772 }
4773 }
4774 ENDLOOP03:
4775 break;
4776
4777 case OP_NOT_DIGIT:
4778 for (i = Lmin; i < Lmax; i++)
4779 {
4780 if (Feptr >= mb->end_subject)
4781 {
4782 SCHECK_PARTIAL();
4783 break;
4784 }
4785 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0)
4786 break;
4787 Feptr++;
4788 }
4789 break;
4790
4791 case OP_DIGIT:
4792 for (i = Lmin; i < Lmax; i++)
4793 {
4794 if (Feptr >= mb->end_subject)
4795 {
4796 SCHECK_PARTIAL();
4797 break;
4798 }
4799 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0)
4800 break;
4801 Feptr++;
4802 }
4803 break;
4804
4805 case OP_NOT_WHITESPACE:
4806 for (i = Lmin; i < Lmax; i++)
4807 {
4808 if (Feptr >= mb->end_subject)
4809 {
4810 SCHECK_PARTIAL();
4811 break;
4812 }
4813 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0)
4814 break;
4815 Feptr++;
4816 }
4817 break;
4818
4819 case OP_WHITESPACE:
4820 for (i = Lmin; i < Lmax; i++)
4821 {
4822 if (Feptr >= mb->end_subject)
4823 {
4824 SCHECK_PARTIAL();
4825 break;
4826 }
4827 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0)
4828 break;
4829 Feptr++;
4830 }
4831 break;
4832
4833 case OP_NOT_WORDCHAR:
4834 for (i = Lmin; i < Lmax; i++)
4835 {
4836 if (Feptr >= mb->end_subject)
4837 {
4838 SCHECK_PARTIAL();
4839 break;
4840 }
4841 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0)
4842 break;
4843 Feptr++;
4844 }
4845 break;
4846
4847 case OP_WORDCHAR:
4848 for (i = Lmin; i < Lmax; i++)
4849 {
4850 if (Feptr >= mb->end_subject)
4851 {
4852 SCHECK_PARTIAL();
4853 break;
4854 }
4855 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0)
4856 break;
4857 Feptr++;
4858 }
4859 break;
4860
4861 default:
4862 return PCRE2_ERROR_INTERNAL;
4863 }
4864
4865 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4866
4867 for (;;)
4868 {
4869 if (Feptr == Lstart_eptr) break;
4870 RMATCH(Fecode, RM34);
4871 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4872 Feptr--;
4873 if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && *Feptr == CHAR_LF &&
4874 Feptr[-1] == CHAR_CR) Feptr--;
4875 }
4876 }
4877 }
4878 break; /* End of repeat character type processing */
4879
4880#undef Lstart_eptr
4881#undef Lmin
4882#undef Lmax
4883#undef Lctype
4884#undef Lpropvalue
4885
4886
4887 /* ===================================================================== */
4888 /* Match a back reference, possibly repeatedly. Look past the end of the
4889 item to see if there is repeat information following. The OP_REF and
4890 OP_REFI opcodes are used for a reference to a numbered group or to a
4891 non-duplicated named group. For a duplicated named group, OP_DNREF and
4892 OP_DNREFI are used. In this case we must scan the list of groups to which
4893 the name refers, and use the first one that is set. */
4894
4895#define Lmin F->temp_32[0]
4896#define Lmax F->temp_32[1]
4897#define Lcaseless F->temp_32[2]
4898#define Lstart F->temp_sptr[0]
4899#define Loffset F->temp_size
4900
4901 case OP_DNREF:
4902 case OP_DNREFI:
4903 Lcaseless = (Fop == OP_DNREFI);
4904 {
4905 int count = GET2(Fecode, 1+IMM2_SIZE);
4906 PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
4907 Fecode += 1 + 2*IMM2_SIZE;
4908
4909 while (count-- > 0)
4910 {
4911 Loffset = (GET2(slot, 0) << 1) - 2;
4912 if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) break;
4913 slot += mb->name_entry_size;
4914 }
4915 }
4916 goto REF_REPEAT;
4917
4918 case OP_REF:
4919 case OP_REFI:
4920 Lcaseless = (Fop == OP_REFI);
4921 Loffset = (GET2(Fecode, 1) << 1) - 2;
4922 Fecode += 1 + IMM2_SIZE;
4923
4924 /* Set up for repetition, or handle the non-repeated case. The maximum and
4925 minimum must be in the heap frame, but as they are short-term values, we
4926 use temporary fields. */
4927
4928 REF_REPEAT:
4929 switch (*Fecode)
4930 {
4931 case OP_CRSTAR:
4932 case OP_CRMINSTAR:
4933 case OP_CRPLUS:
4934 case OP_CRMINPLUS:
4935 case OP_CRQUERY:
4936 case OP_CRMINQUERY:
4937 fc = *Fecode++ - OP_CRSTAR;
4938 Lmin = rep_min[fc];
4939 Lmax = rep_max[fc];
4940 reptype = rep_typ[fc];
4941 break;
4942
4943 case OP_CRRANGE:
4944 case OP_CRMINRANGE:
4945 Lmin = GET2(Fecode, 1);
4946 Lmax = GET2(Fecode, 1 + IMM2_SIZE);
4947 reptype = rep_typ[*Fecode - OP_CRSTAR];
4948 if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
4949 Fecode += 1 + 2 * IMM2_SIZE;
4950 break;
4951
4952 default: /* No repeat follows */
4953 {
4954 rrc = match_ref(Loffset, Lcaseless, F, mb, &length);
4955 if (rrc != 0)
4956 {
4957 if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
4958 CHECK_PARTIAL();
4959 RRETURN(MATCH_NOMATCH);
4960 }
4961 }
4962 Feptr += length;
4963 continue; /* With the main loop */
4964 }
4965
4966 /* Handle repeated back references. If a set group has length zero, just
4967 continue with the main loop, because it matches however many times. For an
4968 unset reference, if the minimum is zero, we can also just continue. We can
4969 also continue if PCRE2_MATCH_UNSET_BACKREF is set, because this makes an
4970 unset group behave as a zero-length group. For any other unset case,
4971 carrying on will result in NOMATCH. */
4972
4973 if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET)
4974 {
4975 if (Fovector[Loffset] == Fovector[Loffset + 1]) continue;
4976 }
4977 else /* Group is not set */
4978 {
4979 if (Lmin == 0 || (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
4980 continue;
4981 }
4982
4983 /* First, ensure the minimum number of matches are present. */
4984
4985 for (i = 1; i <= Lmin; i++)
4986 {
4987 PCRE2_SIZE slength;
4988 rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
4989 if (rrc != 0)
4990 {
4991 if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
4992 CHECK_PARTIAL();
4993 RRETURN(MATCH_NOMATCH);
4994 }
4995 Feptr += slength;
4996 }
4997
4998 /* If min = max, we are done. They are not both allowed to be zero. */
4999
5000 if (Lmin == Lmax) continue;
5001
5002 /* If minimizing, keep trying and advancing the pointer. */
5003
5004 if (reptype == REPTYPE_MIN)
5005 {
5006 for (;;)
5007 {
5008 PCRE2_SIZE slength;
5009 RMATCH(Fecode, RM20);
5010 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5011 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
5012 rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
5013 if (rrc != 0)
5014 {
5015 if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
5016 CHECK_PARTIAL();
5017 RRETURN(MATCH_NOMATCH);
5018 }
5019 Feptr += slength;
5020 }
5021 /* Control never gets here */
5022 }
5023
5024 /* If maximizing, find the longest string and work backwards, as long as
5025 the matched lengths for each iteration are the same. */
5026
5027 else
5028 {
5029 BOOL samelengths = TRUE;
5030 Lstart = Feptr; /* Starting position */
5031 Flength = Fovector[Loffset+1] - Fovector[Loffset];
5032
5033 for (i = Lmin; i < Lmax; i++)
5034 {
5035 PCRE2_SIZE slength;
5036 rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
5037 if (rrc != 0)
5038 {
5039 /* Can't use CHECK_PARTIAL because we don't want to update Feptr in
5040 the soft partial matching case. */
5041
5042 if (rrc > 0 && mb->partial != 0 &&
5043 mb->end_subject > mb->start_used_ptr)
5044 {
5045 mb->hitend = TRUE;
5046 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
5047 }
5048 break;
5049 }
5050
5051 if (slength != Flength) samelengths = FALSE;
5052 Feptr += slength;
5053 }
5054
5055 /* If the length matched for each repetition is the same as the length of
5056 the captured group, we can easily work backwards. This is the normal
5057 case. However, in caseless UTF-8 mode there are pairs of case-equivalent
5058 characters whose lengths (in terms of code units) differ. Because this
5059 is very rare, we handle it by re-matching fewer and fewer times. */
5060
5061 if (samelengths)
5062 {
5063 while (Feptr >= Lstart)
5064 {
5065 RMATCH(Fecode, RM21);
5066 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5067 Feptr -= Flength;
5068 }
5069 }
5070
5071 /* The rare case of non-matching lengths. Re-scan the repetition for each
5072 iteration. We know that match_ref() will succeed every time. */
5073
5074 else
5075 {
5076 Lmax = i;
5077 for (;;)
5078 {
5079 RMATCH(Fecode, RM22);
5080 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5081 if (Feptr == Lstart) break; /* Failed after minimal repetition */
5082 Feptr = Lstart;
5083 Lmax--;
5084 for (i = Lmin; i < Lmax; i++)
5085 {
5086 PCRE2_SIZE slength;
5087 (void)match_ref(Loffset, Lcaseless, F, mb, &slength);
5088 Feptr += slength;
5089 }
5090 }
5091 }
5092
5093 RRETURN(MATCH_NOMATCH);
5094 }
5095 /* Control never gets here */
5096
5097#undef Lcaseless
5098#undef Lmin
5099#undef Lmax
5100#undef Lstart
5101#undef Loffset
5102
5103
5104
5105/* ========================================================================= */
5106/* Opcodes for the start of various parenthesized items */
5107/* ========================================================================= */
5108
5109 /* In all cases, if the result of RMATCH() is MATCH_THEN, check whether the
5110 (*THEN) is within the current branch by comparing the address of OP_THEN
5111 that is passed back with the end of the branch. If (*THEN) is within the
5112 current branch, and the branch is one of two or more alternatives (it
5113 either starts or ends with OP_ALT), we have reached the limit of THEN's
5114 action, so convert the return code to NOMATCH, which will cause normal
5115 backtracking to happen from now on. Otherwise, THEN is passed back to an
5116 outer alternative. This implements Perl's treatment of parenthesized
5117 groups, where a group not containing | does not affect the current
5118 alternative, that is, (X) is NOT the same as (X|(*F)). */
5119
5120
5121 /* ===================================================================== */
5122 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a non-possessive
5123 bracket group, indicating that it may occur zero times. It may repeat
5124 infinitely, or not at all - i.e. it could be ()* or ()? or even (){0} in
5125 the pattern. Brackets with fixed upper repeat limits are compiled as a
5126 number of copies, with the optional ones preceded by BRAZERO or BRAMINZERO.
5127 Possessive groups with possible zero repeats are preceded by BRAPOSZERO. */
5128
5129#define Lnext_ecode F->temp_sptr[0]
5130
5131 case OP_BRAZERO: /* Try entering the group first; skip it only if that fails */
5132 Lnext_ecode = Fecode + 1; /* The opcode that follows OP_BRAZERO */
5133 RMATCH(Lnext_ecode, RM9); /* Attempt a match that enters the group */
5134 if (rrc != MATCH_NOMATCH) RRETURN(rrc); /* Anything but a plain failure is passed back */
5135 do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT); /* Follow branch links to the end of the group */
5136 Fecode = Lnext_ecode + 1 + LINK_SIZE; /* Zero repeats: carry on after the group */
5137 break;
5138
5139 case OP_BRAMINZERO: /* Try skipping the group first; enter it only if that fails */
5140 Lnext_ecode = Fecode + 1; /* The opcode that follows OP_BRAMINZERO */
5141 do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT); /* Follow branch links to the end of the group */
5142 RMATCH(Lnext_ecode + 1 + LINK_SIZE, RM10); /* Attempt a match that continues after the group */
5143 if (rrc != MATCH_NOMATCH) RRETURN(rrc); /* Anything but a plain failure is passed back */
5144 Fecode++; /* Skipping failed: carry on into the group at this level */
5145 break;
5146
5147#undef Lnext_ecode
5148
5149 case OP_SKIPZERO: /* Step over the following group; no match attempt is made here */
5150 Fecode++; /* Move to the opcode that follows OP_SKIPZERO */
5151 do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT); /* Follow branch links past every alternative */
5152 Fecode += 1 + LINK_SIZE; /* Step past the item that ends the group */
5153 break;
5154
5155
5156 /* ===================================================================== */
5157 /* Handle possessive brackets with an unlimited repeat. The end of these
5158 brackets will always be OP_KETRPOS, which returns MATCH_KETRPOS without
5159 going further in the pattern. */
5160
5161#define Lframe_type F->temp_32[0]
5162#define Lmatched_once F->temp_32[1]
5163#define Lzero_allowed F->temp_32[2]
5164#define Lstart_eptr F->temp_sptr[0]
5165#define Lstart_group F->temp_sptr[1]
5166
5167 case OP_BRAPOSZERO:
5168 Lzero_allowed = TRUE; /* Zero repeat is allowed */
5169 Fecode += 1;
5170 if (*Fecode == OP_CBRAPOS || *Fecode == OP_SCBRAPOS)
5171 goto POSSESSIVE_CAPTURE;
5172 goto POSSESSIVE_NON_CAPTURE;
5173
5174 case OP_BRAPOS:
5175 case OP_SBRAPOS:
5176 Lzero_allowed = FALSE; /* Zero repeat not allowed */
5177
5178 POSSESSIVE_NON_CAPTURE:
5179 Lframe_type = GF_NOCAPTURE; /* Remembered frame type */
5180 goto POSSESSIVE_GROUP;
5181
5182 case OP_CBRAPOS:
5183 case OP_SCBRAPOS:
5184 Lzero_allowed = FALSE; /* Zero repeat not allowed */
5185
5186 POSSESSIVE_CAPTURE:
5187 number = GET2(Fecode, 1+LINK_SIZE);
5188 Lframe_type = GF_CAPTURE | number; /* Remembered frame type */
5189
5190 POSSESSIVE_GROUP:
5191 Lmatched_once = FALSE; /* Never matched */
5192 Lstart_group = Fecode; /* Start of this group */
5193
5194 for (;;)
5195 {
5196 Lstart_eptr = Feptr; /* Position at group start */
5197 group_frame_type = Lframe_type;
5198 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM8);
5199 if (rrc == MATCH_KETRPOS)
5200 {
5201 Lmatched_once = TRUE; /* Matched at least once */
5202 if (Feptr == Lstart_eptr) /* Empty match; skip to end */
5203 {
5204 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5205 break;
5206 }
5207
5208 Fecode = Lstart_group;
5209 continue;
5210 }
5211
5212 /* See comment above about handling THEN. */
5213
5214 if (rrc == MATCH_THEN)
5215 {
5216 PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1);
5217 if (mb->verb_ecode_ptr < next_ecode &&
5218 (*Fecode == OP_ALT || *next_ecode == OP_ALT))
5219 rrc = MATCH_NOMATCH;
5220 }
5221
5222 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5223 Fecode += GET(Fecode, 1);
5224 if (*Fecode != OP_ALT) break;
5225 }
5226
5227 /* Success if matched something or zero repeat allowed */
5228
5229 if (Lmatched_once || Lzero_allowed)
5230 {
5231 Fecode += 1 + LINK_SIZE;
5232 break;
5233 }
5234
5235 RRETURN(MATCH_NOMATCH);
5236
5237#undef Lmatched_once
5238#undef Lzero_allowed
5239#undef Lframe_type
5240#undef Lstart_eptr
5241#undef Lstart_group
5242
5243
5244 /* ===================================================================== */
5245 /* Handle non-capturing brackets that cannot match an empty string. When we
5246 get to the final alternative within the brackets, as long as there are no
5247 THEN's in the pattern, we can optimize by not recording a new backtracking
5248 point. (Ideally we should test for a THEN within this group, but we don't
5249 have that information.) Don't do this if we are at the very top level,
5250 however, because that would make handling assertions and once-only brackets
5251 messier when there is nothing to go back to. */
5252
5253#define Lframe_type F->temp_32[0] /* Set for all that use GROUPLOOP */
5254#define Lnext_branch F->temp_sptr[0] /* Used only in OP_BRA handling */
5255
5256 case OP_BRA:
5257 if (mb->hasthen || Frdepth == 0)
5258 {
5259 Lframe_type = 0;
5260 goto GROUPLOOP;
5261 }
5262
5263 for (;;)
5264 {
5265 Lnext_branch = Fecode + GET(Fecode, 1);
5266 if (*Lnext_branch != OP_ALT) break;
5267
5268 /* This is never the final branch. We do not need to test for MATCH_THEN
5269 here because this code is not used when there is a THEN in the pattern. */
5270
5271 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM1);
5272 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5273 Fecode = Lnext_branch;
5274 }
5275
5276 /* Hit the start of the final branch. Continue at this level. */
5277
5278 Fecode += PRIV(OP_lengths)[*Fecode];
5279 break;
5280
5281#undef Lnext_branch
5282
5283
5284 /* ===================================================================== */
5285 /* Handle a capturing bracket, other than those that are possessive with an
5286 unlimited repeat. */
5287
5288 case OP_CBRA:
5289 case OP_SCBRA:
5290 Lframe_type = GF_CAPTURE | GET2(Fecode, 1+LINK_SIZE);
5291 goto GROUPLOOP;
5292
5293
5294 /* ===================================================================== */
5295 /* Atomic groups and non-capturing brackets that can match an empty string
5296 must record a backtracking point and also set up a chained frame. */
5297
5298 case OP_ONCE:
5299 case OP_SCRIPT_RUN:
5300 case OP_SBRA:
5301 Lframe_type = GF_NOCAPTURE | Fop;
5302
5303 GROUPLOOP:
5304 for (;;)
5305 {
5306 group_frame_type = Lframe_type;
5307 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM2);
5308 if (rrc == MATCH_THEN)
5309 {
5310 PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1);
5311 if (mb->verb_ecode_ptr < next_ecode &&
5312 (*Fecode == OP_ALT || *next_ecode == OP_ALT))
5313 rrc = MATCH_NOMATCH;
5314 }
5315 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5316 Fecode += GET(Fecode, 1);
5317 if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH);
5318 }
5319 /* Control never reaches here. */
5320
5321#undef Lframe_type
5322
5323
5324 /* ===================================================================== */
5325 /* Recursion either matches the current regex, or some subexpression. The
5326 offset data is the offset to the starting bracket from the start of the
5327 whole pattern. (This is so that it works from duplicated subpatterns.) */
5328
5329#define Lframe_type F->temp_32[0]
5330#define Lstart_branch F->temp_sptr[0]
5331
5332 case OP_RECURSE:
5333 bracode = mb->start_code + GET(Fecode, 1);
5334 number = (bracode == mb->start_code)? 0 : GET2(bracode, 1 + LINK_SIZE);
5335
5336 /* If we are already in a recursion, check for repeating the same one
5337 without advancing the subject pointer. This should catch convoluted mutual
5338 recursions. (Some simple cases are caught at compile time.) */
5339
5340 if (Fcurrent_recurse != RECURSE_UNSET)
5341 {
5342 offset = Flast_group_offset;
5343 while (offset != PCRE2_UNSET)
5344 {
5345 N = (heapframe *)((char *)match_data->heapframes + offset);
5346 P = (heapframe *)((char *)N - frame_size);
5347 if (N->group_frame_type == (GF_RECURSE | number))
5348 {
5349 if (Feptr == P->eptr) return PCRE2_ERROR_RECURSELOOP;
5350 break;
5351 }
5352 offset = P->last_group_offset;
5353 }
5354 }
5355
5356 /* Now run the recursion, branch by branch. */
5357
5358 Lstart_branch = bracode;
5359 Lframe_type = GF_RECURSE | number;
5360
5361 for (;;)
5362 {
5363 PCRE2_SPTR next_ecode;
5364
5365 group_frame_type = Lframe_type;
5366 RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM11);
5367 next_ecode = Lstart_branch + GET(Lstart_branch,1);
5368
5369 /* Handle backtracking verbs, which are defined in a range that can
5370 easily be tested for. PCRE does not allow THEN, SKIP, PRUNE or COMMIT to
5371 escape beyond a recursion; they cause a NOMATCH for the entire recursion.
5372
5373 When one of these verbs triggers, the current recursion group number is
5374 recorded. If it matches the recursion we are processing, the verb
5375 happened within the recursion and we must deal with it. Otherwise it must
5376 have happened after the recursion completed, and so has to be passed
5377 back. See comment above about handling THEN. */
5378
5379 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX &&
5380 mb->verb_current_recurse == (Lframe_type ^ GF_RECURSE))
5381 {
5382 if (rrc == MATCH_THEN && mb->verb_ecode_ptr < next_ecode &&
5383 (*Lstart_branch == OP_ALT || *next_ecode == OP_ALT))
5384 rrc = MATCH_NOMATCH;
5385 else RRETURN(MATCH_NOMATCH);
5386 }
5387
5388 /* Note that carrying on after (*ACCEPT) in a recursion is handled in the
5389 OP_ACCEPT code. Nothing needs to be done here. */
5390
5391 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5392 Lstart_branch = next_ecode;
5393 if (*Lstart_branch != OP_ALT) RRETURN(MATCH_NOMATCH);
5394 }
5395 /* Control never reaches here. */
5396
5397#undef Lframe_type
5398#undef Lstart_branch
5399
5400
5401 /* ===================================================================== */
5402 /* Positive assertions are like other groups except that PCRE doesn't allow
5403 the effect of (*THEN) to escape beyond an assertion; it is therefore
5404 treated as NOMATCH. (*ACCEPT) is treated as successful assertion, with its
5405 captures and mark retained. Any other return is an error. */
5406
5407#define Lframe_type F->temp_32[0]
5408
5409 case OP_ASSERT:
5410 case OP_ASSERTBACK:
5411 case OP_ASSERT_NA:
5412 case OP_ASSERTBACK_NA:
5413 Lframe_type = GF_NOCAPTURE | Fop; /* Same frame type is used for every branch */
5414 for (;;) /* One iteration per branch of the assertion */
5415 {
5416 group_frame_type = Lframe_type;
5417 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM3); /* Run this branch, starting after its leading opcode */
5418 if (rrc == MATCH_ACCEPT) /* (*ACCEPT): take captures and mark from assert_accept_frame */
5419 {
5420 memcpy(Fovector,
5421 (char *)assert_accept_frame + offsetof(heapframe, ovector),
5422 assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
5423 Foffset_top = assert_accept_frame->offset_top;
5424 Fmark = assert_accept_frame->mark;
5425 break; /* The only exit from this loop that does not return */
5426 }
5427 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); /* Only NOMATCH and THEN move on to the next branch */
5428 Fecode += GET(Fecode, 1); /* Move to the next branch, if any */
5429 if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH); /* No more branches: the assertion fails */
5430 }
5431
5432 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); /* Reached only after MATCH_ACCEPT: follow links to the end of the assertion */
5433 Fecode += 1 + LINK_SIZE; /* Carry on after the assertion */
5434 break;
5435
5436#undef Lframe_type
5437
5438
5439 /* ===================================================================== */
5440 /* Handle negative assertions. Loop for each non-matching branch as for
5441 positive assertions. */
5442
5443#define Lframe_type F->temp_32[0]
5444
5445 case OP_ASSERT_NOT:
5446 case OP_ASSERTBACK_NOT:
5447 Lframe_type = GF_NOCAPTURE | Fop;
5448
5449 for (;;)
5450 {
5451 group_frame_type = Lframe_type;
5452 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM4);
5453 switch(rrc)
5454 {
5455 case MATCH_ACCEPT: /* Assertion matched, therefore it fails. */
5456 case MATCH_MATCH:
5457 RRETURN (MATCH_NOMATCH);
5458
5459 case MATCH_NOMATCH: /* Branch failed, try next if present. */
5460 case MATCH_THEN:
5461 Fecode += GET(Fecode, 1);
5462 if (*Fecode != OP_ALT) goto ASSERT_NOT_FAILED;
5463 break;
5464
5465 case MATCH_COMMIT: /* Assertion forced to fail, therefore continue. */
5466 case MATCH_SKIP:
5467 case MATCH_PRUNE:
5468 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5469 goto ASSERT_NOT_FAILED;
5470
5471 default: /* Pass back any other return */
5472 RRETURN(rrc);
5473 }
5474 }
5475
5476 /* None of the branches have matched or there was a backtrack to (*COMMIT),
5477 (*SKIP), (*PRUNE), or (*THEN) in the last branch. This is success for a
5478 negative assertion, so carry on. */
5479
5480 ASSERT_NOT_FAILED:
5481 Fecode += 1 + LINK_SIZE;
5482 break;
5483
5484#undef Lframe_type
5485
5486
5487 /* ===================================================================== */
5488 /* The callout item calls an external function, if one is provided, passing
5489 details of the match so far. This is mainly for debugging, though the
5490 function is able to force a failure. */
5491
5492 case OP_CALLOUT:
5493 case OP_CALLOUT_STR:
5494 rrc = do_callout(F, mb, &length); /* length is filled in via &length and used below to step past the callout item */
5495 if (rrc > 0) RRETURN(MATCH_NOMATCH); /* Positive result: fail at this point */
5496 if (rrc < 0) RRETURN(rrc); /* Negative result: pass it back unchanged */
5497 Fecode += length; /* Zero result: carry on after the callout item */
5498 break;
5499
5500
5501 /* ===================================================================== */
5502 /* Conditional group: compilation checked that there are no more than two
5503 branches. If the condition is false, skipping the first branch takes us
5504 past the end of the item if there is only one branch, but that's exactly
5505 what we want. */
5506
5507 case OP_COND:
5508 case OP_SCOND:
5509
5510 /* The variable Flength will be added to Fecode when the condition is
5511 false, to get to the second branch. Setting it to the offset to the ALT or
5512 KET, then incrementing Fecode achieves this effect. However, if the second
5513 branch is non-existent, we must point to the KET so that the end of the
5514 group is correctly processed. We now have Fecode pointing to the condition
5515 or callout. */
5516
5517 Flength = GET(Fecode, 1); /* Offset to the second branch */
5518 if (Fecode[Flength] != OP_ALT) Flength -= 1 + LINK_SIZE;
5519 Fecode += 1 + LINK_SIZE; /* From this opcode */
5520
5521 /* Because of the way auto-callout works during compile, a callout item is
5522 inserted between OP_COND and an assertion condition. Such a callout can
5523 also be inserted manually. */
5524
5525 if (*Fecode == OP_CALLOUT || *Fecode == OP_CALLOUT_STR)
5526 {
5527 rrc = do_callout(F, mb, &length);
5528 if (rrc > 0) RRETURN(MATCH_NOMATCH);
5529 if (rrc < 0) RRETURN(rrc);
5530
5531 /* Advance Fecode past the callout, so it now points to the condition. We
5532 must adjust Flength so that the value of Fecode+Flength is unchanged. */
5533
5534 Fecode += length;
5535 Flength -= length;
5536 }
5537
5538 /* Test the various possible conditions */
5539
5540 condition = FALSE;
5541 switch(*Fecode)
5542 {
5543 case OP_RREF: /* Group recursion test */
5544 if (Fcurrent_recurse != RECURSE_UNSET)
5545 {
5546 number = GET2(Fecode, 1);
5547 condition = (number == RREF_ANY || number == Fcurrent_recurse);
5548 }
5549 break;
5550
5551 case OP_DNRREF: /* Duplicate named group recursion test */
5552 if (Fcurrent_recurse != RECURSE_UNSET)
5553 {
5554 int count = GET2(Fecode, 1 + IMM2_SIZE);
5555 PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
5556 while (count-- > 0)
5557 {
5558 number = GET2(slot, 0);
5559 condition = number == Fcurrent_recurse;
5560 if (condition) break;
5561 slot += mb->name_entry_size;
5562 }
5563 }
5564 break;
5565
5566 case OP_CREF: /* Numbered group used test */
5567 offset = (GET2(Fecode, 1) << 1) - 2; /* Doubled ref number */
5568 condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET;
5569 break;
5570
5571 case OP_DNCREF: /* Duplicate named group used test */
5572 {
5573 int count = GET2(Fecode, 1 + IMM2_SIZE);
5574 PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
5575 while (count-- > 0)
5576 {
5577 offset = (GET2(slot, 0) << 1) - 2;
5578 condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET;
5579 if (condition) break;
5580 slot += mb->name_entry_size;
5581 }
5582 }
5583 break;
5584
5585 case OP_FALSE:
5586 case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */
5587 break;
5588
5589 case OP_TRUE:
5590 condition = TRUE;
5591 break;
5592
5593 /* The condition is an assertion. Run code similar to the assertion code
5594 above. */
5595
5596#define Lpositive F->temp_32[0]
5597#define Lstart_branch F->temp_sptr[0]
5598
5599 default:
5600 Lpositive = (*Fecode == OP_ASSERT || *Fecode == OP_ASSERTBACK);
5601 Lstart_branch = Fecode;
5602
5603 for (;;)
5604 {
5605 group_frame_type = GF_CONDASSERT | *Fecode;
5606 RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM5);
5607
5608 switch(rrc)
5609 {
5610 case MATCH_ACCEPT: /* Save captures */
5611 memcpy(Fovector,
5612 (char *)assert_accept_frame + offsetof(heapframe, ovector),
5613 assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
5614 Foffset_top = assert_accept_frame->offset_top;
5615
5616 /* Fall through */
5617 /* In the case of a match, the captures have already been put into
5618 the current frame. */
5619
5620 case MATCH_MATCH:
5621 condition = Lpositive; /* TRUE for positive assertion */
5622 break;
5623
5624 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
5625 assertion; it is therefore always treated as NOMATCH. */
5626
5627 case MATCH_NOMATCH:
5628 case MATCH_THEN:
5629 Lstart_branch += GET(Lstart_branch, 1);
5630 if (*Lstart_branch == OP_ALT) continue; /* Try next branch */
5631 condition = !Lpositive; /* TRUE for negative assertion */
5632 break;
5633
5634 /* These force no match without checking other branches. */
5635
5636 case MATCH_COMMIT:
5637 case MATCH_SKIP:
5638 case MATCH_PRUNE:
5639 condition = !Lpositive;
5640 break;
5641
5642 default:
5643 RRETURN(rrc);
5644 }
5645 break; /* Out of the branch loop */
5646 }
5647
5648 /* If the condition is true, find the end of the assertion so that
5649 advancing past it gets us to the start of the first branch. */
5650
5651 if (condition)
5652 {
5653 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5654 }
5655 break; /* End of assertion condition */
5656 }
5657
5658#undef Lpositive
5659#undef Lstart_branch
5660
5661 /* Choose branch according to the condition. */
5662
5663 Fecode += condition? PRIV(OP_lengths)[*Fecode] : Flength;
5664
5665 /* If the opcode is OP_SCOND it means we are at a repeated conditional
5666 group that might match an empty string. We must therefore descend a level
5667 so that the start is remembered for checking. For OP_COND we can just
5668 continue at this level. */
5669
5670 if (Fop == OP_SCOND)
5671 {
5672 group_frame_type = GF_NOCAPTURE | Fop;
5673 RMATCH(Fecode, RM35);
5674 RRETURN(rrc);
5675 }
5676 break;
5677
5678
5679
5680/* ========================================================================= */
5681/* End of start of parenthesis opcodes */
5682/* ========================================================================= */
5683
5684
5685 /* ===================================================================== */
5686 /* Move the subject pointer back. This occurs only at the start of each
5687 branch of a lookbehind assertion. If we are too close to the start to move
5688 back, fail. When working with UTF-8 we move back a number of characters,
5689 not bytes. */
5690
5691 case OP_REVERSE:
5692 number = GET(Fecode, 1); /* How far to move back, read from the opcode's argument */
5693#ifdef SUPPORT_UNICODE
5694 if (utf)
5695 {
5696 while (number-- > 0) /* One character per iteration */
5697 {
5698 if (Feptr <= mb->check_subject) RRETURN(MATCH_NOMATCH); /* Cannot go back beyond mb->check_subject: fail */
5699 Feptr--; /* Back one code unit ... */
5700 BACKCHAR(Feptr); /* ... then, presumably, to the first code unit of that character (macro defined elsewhere) */
5701 }
5702 }
5703 else
5704#endif
5705
5706 /* No UTF-8 support, or not in UTF-8 mode: count is code unit count */
5707
5708 {
5709 if ((ptrdiff_t)number > Feptr - mb->start_subject) RRETURN(MATCH_NOMATCH); /* Fewer than number code units precede Feptr: fail */
5710 Feptr -= number; /* Move back in a single step */
5711 }
5712
5713 /* Save the earliest consulted character, then skip to next opcode */
5714
5715 if (Feptr < mb->start_used_ptr) mb->start_used_ptr = Feptr;
5716 Fecode += 1 + LINK_SIZE;
5717 break;
5718
5719
5720 /* ===================================================================== */
5721 /* An alternation is the end of a branch; scan along to find the end of the
5722 bracketed group. */
5723
5724 case OP_ALT:
5725 do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);
5726 break;
5727
5728
5729 /* ===================================================================== */
5730 /* The end of a parenthesized group. For all but OP_BRA and OP_COND, the
5731 starting frame was added to the chained frames in order to remember the
5732 starting subject position for the group. */
5733
5734 case OP_KET:
5735 case OP_KETRMIN:
5736 case OP_KETRMAX:
5737 case OP_KETRPOS:
5738
5739 bracode = Fecode - GET(Fecode, 1);
5740
5741 /* Point N to the frame at the start of the most recent group.
5742 Remember the subject pointer at the start of the group. */
5743
5744 if (*bracode != OP_BRA && *bracode != OP_COND)
5745 {
5746 N = (heapframe *)((char *)match_data->heapframes + Flast_group_offset);
5747 P = (heapframe *)((char *)N - frame_size);
5748 Flast_group_offset = P->last_group_offset;
5749
5750#ifdef DEBUG_SHOW_RMATCH
5751 fprintf(stderr, "++ KET for frame=%d type=%x prev char offset=%lu\n",
5752 N->rdepth, N->group_frame_type,
5753 (char *)P->eptr - (char *)mb->start_subject);
5754#endif
5755
5756 /* If we are at the end of an assertion that is a condition, return a
5757 match, discarding any intermediate backtracking points. Copy back the
5758 mark setting and the captures into the frame before N so that they are
5759 set on return. Doing this for all assertions, both positive and negative,
5760 seems to match what Perl does. */
5761
5762 if (GF_IDMASK(N->group_frame_type) == GF_CONDASSERT)
5763 {
5764 memcpy((char *)P + offsetof(heapframe, ovector), Fovector,
5765 Foffset_top * sizeof(PCRE2_SIZE));
5766 P->offset_top = Foffset_top;
5767 P->mark = Fmark;
5768 Fback_frame = (char *)F - (char *)P;
5769 RRETURN(MATCH_MATCH);
5770 }
5771 }
5772 else P = NULL; /* Indicates starting frame not recorded */
5773
5774 /* The group was not a conditional assertion. */
5775
5776 switch (*bracode)
5777 {
5778 case OP_BRA: /* No need to do anything for these */
5779 case OP_COND:
5780 case OP_SCOND:
5781 break;
5782
5783 /* Non-atomic positive assertions are like OP_BRA, except that the
5784 subject pointer must be put back to where it was at the start of the
5785 assertion. */
5786
5787 case OP_ASSERT_NA:
5788 case OP_ASSERTBACK_NA:
5789 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
5790 Feptr = P->eptr;
5791 break;
5792
5793 /* Atomic positive assertions are like OP_ONCE, except that in addition
5794 the subject pointer must be put back to where it was at the start of the
5795 assertion. */
5796
5797 case OP_ASSERT:
5798 case OP_ASSERTBACK:
5799 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
5800 Feptr = P->eptr;
5801 /* Fall through */
5802
5803 /* For an atomic group, discard internal backtracking points. We must
5804 also ensure that any remaining branches within the top-level of the group
5805 are not tried. Do this by adjusting the code pointer within the backtrack
5806 frame so that it points to the final branch. */
5807
5808 case OP_ONCE:
5809 Fback_frame = ((char *)F - (char *)P);
5810 for (;;)
5811 {
5812 uint32_t y = GET(P->ecode,1);
5813 if ((P->ecode)[y] != OP_ALT) break;
5814 P->ecode += y;
5815 }
5816 break;
5817
5818 /* A matching negative assertion returns MATCH, which is turned into
5819 NOMATCH at the assertion level. */
5820
5821 case OP_ASSERT_NOT:
5822 case OP_ASSERTBACK_NOT:
5823 RRETURN(MATCH_MATCH);
5824
5825 /* At the end of a script run, apply the script-checking rules. This code
5826 will never be exercised if Unicode support is not compiled, because in
5827 that environment script runs cause an error at compile time. */
5828
5829 case OP_SCRIPT_RUN:
5830 if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH);
5831 break;
5832
5833 /* Whole-pattern recursion is coded as a recurse into group 0, so it
5834 won't be picked up here. Instead, we catch it when the OP_END is reached.
5835 Other recursion is handled here. */
5836
5837 case OP_CBRA:
5838 case OP_CBRAPOS:
5839 case OP_SCBRA:
5840 case OP_SCBRAPOS:
5841 number = GET2(bracode, 1+LINK_SIZE);
5842
5843 /* Handle a recursively called group. We reinstate the previous set of
5844 captures and then carry on after the recursion call. */
5845
5846 if (Fcurrent_recurse == number)
5847 {
5848 P = (heapframe *)((char *)N - frame_size);
5849 memcpy((char *)F + offsetof(heapframe, ovector), P->ovector,
5850 P->offset_top * sizeof(PCRE2_SIZE));
5851 Foffset_top = P->offset_top;
5852 Fcapture_last = P->capture_last;
5853 Fcurrent_recurse = P->current_recurse;
5854 Fecode = P->ecode + 1 + LINK_SIZE;
5855 continue; /* With next opcode */
5856 }
5857
5858 /* Deal with actual capturing. */
5859
5860 offset = (number << 1) - 2;
5861 Fcapture_last = number;
5862 Fovector[offset] = P->eptr - mb->start_subject;
5863 Fovector[offset+1] = Feptr - mb->start_subject;
5864 if (offset >= Foffset_top) Foffset_top = offset + 2;
5865 break;
5866 } /* End actions relating to the starting opcode */
5867
5868 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
5869 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
5870 at a time from the outer level. This must precede the empty string test -
5871 in this case that test is done at the outer level. */
5872
5873 if (*Fecode == OP_KETRPOS)
5874 {
5875 memcpy((char *)P + offsetof(heapframe, eptr),
5876 (char *)F + offsetof(heapframe, eptr),
5877 frame_copy_size);
5878 RRETURN(MATCH_KETRPOS);
5879 }
5880
5881 /* Handle the different kinds of closing brackets. A non-repeating ket
5882 needs no special action, just continuing at this level. This also happens
5883 for the repeating kets if the group matched no characters, in order to
5884 forcibly break infinite loops. Otherwise, the repeating kets try the rest
5885 of the pattern or restart from the preceding bracket, in the appropriate
5886 order. */
5887
5888 if (Fop != OP_KET && (P == NULL || Feptr != P->eptr))
5889 {
5890 if (Fop == OP_KETRMIN)
5891 {
5892 RMATCH(Fecode + 1 + LINK_SIZE, RM6);
5893 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5894 Fecode -= GET(Fecode, 1);
5895 break; /* End of ket processing */
5896 }
5897
5898 /* Repeat the maximum number of times (KETRMAX) */
5899
5900 RMATCH(bracode, RM7);
5901 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5902 }
5903
5904 /* Carry on at this level for a non-repeating ket, or after matching an
5905 empty string, or after repeating for a maximum number of times. */
5906
5907 Fecode += 1 + LINK_SIZE;
5908 break;
5909
5910
5911 /* ===================================================================== */
5912 /* Start and end of line assertions, not multiline mode. */
5913
5914 case OP_CIRC: /* Start of line, unless PCRE2_NOTBOL is set. */
5915 if (Feptr != mb->start_subject || (mb->moptions & PCRE2_NOTBOL) != 0)
5916 RRETURN(MATCH_NOMATCH);
5917 Fecode++;
5918 break;
5919
5920 case OP_SOD: /* Unconditional start of subject */
5921 if (Feptr != mb->start_subject) RRETURN(MATCH_NOMATCH);
5922 Fecode++;
5923 break;
5924
5925 /* When PCRE2_NOTEOL is unset, assert before the subject end, or a
5926 terminating newline unless PCRE2_DOLLAR_ENDONLY is set. */
5927
5928 case OP_DOLL:
5929 if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
5930 if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS;
5931
5932 /* Fall through */
5933 /* Unconditional end of subject assertion (\z) */
5934
5935 case OP_EOD:
5936 if (Feptr < mb->end_subject) RRETURN(MATCH_NOMATCH);
5937 if (mb->partial != 0)
5938 {
5939 mb->hitend = TRUE;
5940 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
5941 }
5942 Fecode++;
5943 break;
5944
5945 /* End of subject or ending \n assertion (\Z) */
5946
5947 case OP_EODN:
5948 ASSERT_NL_OR_EOS:
5949 if (Feptr < mb->end_subject &&
5950 (!IS_NEWLINE(Feptr) || Feptr != mb->end_subject - mb->nllen))
5951 {
5952 if (mb->partial != 0 &&
5953 Feptr + 1 >= mb->end_subject &&
5954 NLBLOCK->nltype == NLTYPE_FIXED &&
5955 NLBLOCK->nllen == 2 &&
5956 UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
5957 {
5958 mb->hitend = TRUE;
5959 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
5960 }
5961 RRETURN(MATCH_NOMATCH);
5962 }
5963
5964 /* Either at end of string or \n before end. */
5965
5966 if (mb->partial != 0)
5967 {
5968 mb->hitend = TRUE;
5969 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
5970 }
5971 Fecode++;
5972 break;
5973
5974
5975 /* ===================================================================== */
5976 /* Start and end of line assertions, multiline mode. */
5977
5978 /* Start of subject unless notbol, or after any newline except for one at
5979 the very end, unless PCRE2_ALT_CIRCUMFLEX is set. */
5980
5981 case OP_CIRCM:
5982 if ((mb->moptions & PCRE2_NOTBOL) != 0 && Feptr == mb->start_subject)
5983 RRETURN(MATCH_NOMATCH);
5984 if (Feptr != mb->start_subject &&
5985 ((Feptr == mb->end_subject &&
5986 (mb->poptions & PCRE2_ALT_CIRCUMFLEX) == 0) ||
5987 !WAS_NEWLINE(Feptr)))
5988 RRETURN(MATCH_NOMATCH);
5989 Fecode++;
5990 break;
5991
5992 /* Assert before any newline, or before end of subject unless noteol is
5993 set. */
5994
5995 case OP_DOLLM:
5996 if (Feptr < mb->end_subject)
5997 {
5998 if (!IS_NEWLINE(Feptr))
5999 {
6000 if (mb->partial != 0 &&
6001 Feptr + 1 >= mb->end_subject &&
6002 NLBLOCK->nltype == NLTYPE_FIXED &&
6003 NLBLOCK->nllen == 2 &&
6004 UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
6005 {
6006 mb->hitend = TRUE;
6007 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
6008 }
6009 RRETURN(MATCH_NOMATCH);
6010 }
6011 }
6012 else
6013 {
6014 if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
6015 SCHECK_PARTIAL();
6016 }
6017 Fecode++;
6018 break;
6019
6020
6021 /* ===================================================================== */
6022 /* Start of match assertion */
6023
6024 case OP_SOM:
6025 if (Feptr != mb->start_subject + mb->start_offset) RRETURN(MATCH_NOMATCH);
6026 Fecode++;
6027 break;
6028
6029
6030 /* ===================================================================== */
6031 /* Reset the start of match point */
6032
6033 case OP_SET_SOM:
6034 Fstart_match = Feptr;
6035 Fecode++;
6036 break;
6037
6038
6039 /* ===================================================================== */
6040 /* Word boundary assertions. Find out if the previous and current
6041 characters are "word" characters. It takes a bit more work in UTF mode.
6042 Characters > 255 are assumed to be "non-word" characters when PCRE2_UCP is
6043 not set. When it is set, use Unicode properties if available, even when not
6044 in UTF mode. Remember the earliest and latest consulted characters. */
6045
6046 case OP_NOT_WORD_BOUNDARY:
6047 case OP_WORD_BOUNDARY:
6048 if (Feptr == mb->check_subject) prev_is_word = FALSE; else
6049 {
6050 PCRE2_SPTR lastptr = Feptr - 1;
6051#ifdef SUPPORT_UNICODE
6052 if (utf)
6053 {
6054 BACKCHAR(lastptr);
6055 GETCHAR(fc, lastptr);
6056 }
6057 else
6058#endif /* SUPPORT_UNICODE */
6059 fc = *lastptr;
6060 if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr;
6061#ifdef SUPPORT_UNICODE
6062 if ((mb->poptions & PCRE2_UCP) != 0)
6063 {
6064 if (fc == '_') prev_is_word = TRUE; else
6065 {
6066 int cat = UCD_CATEGORY(fc);
6067 prev_is_word = (cat == ucp_L || cat == ucp_N);
6068 }
6069 }
6070 else
6071#endif /* SUPPORT_UNICODE */
6072 prev_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0;
6073 }
6074
6075 /* Get status of next character */
6076
6077 if (Feptr >= mb->end_subject)
6078 {
6079 SCHECK_PARTIAL();
6080 cur_is_word = FALSE;
6081 }
6082 else
6083 {
6084 PCRE2_SPTR nextptr = Feptr + 1;
6085#ifdef SUPPORT_UNICODE
6086 if (utf)
6087 {
6088 FORWARDCHARTEST(nextptr, mb->end_subject);
6089 GETCHAR(fc, Feptr);
6090 }
6091 else
6092#endif /* SUPPORT_UNICODE */
6093 fc = *Feptr;
6094 if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr;
6095#ifdef SUPPORT_UNICODE
6096 if ((mb->poptions & PCRE2_UCP) != 0)
6097 {
6098 if (fc == '_') cur_is_word = TRUE; else
6099 {
6100 int cat = UCD_CATEGORY(fc);
6101 cur_is_word = (cat == ucp_L || cat == ucp_N);
6102 }
6103 }
6104 else
6105#endif /* SUPPORT_UNICODE */
6106 cur_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0;
6107 }
6108
6109 /* Now see if the situation is what we want */
6110
6111 if ((*Fecode++ == OP_WORD_BOUNDARY)?
6112 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
6113 RRETURN(MATCH_NOMATCH);
6114 break;
6115
6116
6117 /* ===================================================================== */
6118 /* Backtracking (*VERB)s, with and without arguments. Note that if the
6119 pattern is successfully matched, we do not come back from RMATCH. */
6120
6121 case OP_MARK:
6122 Fmark = mb->nomatch_mark = Fecode + 2;
6123 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM12);
6124
6125 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
6126 argument, and we must check whether that argument matches this MARK's
6127 argument. It is passed back in mb->verb_skip_ptr. If it does match, we
6128 return MATCH_SKIP with mb->verb_skip_ptr now pointing to the subject
6129 position that corresponds to this mark. Otherwise, pass back the return
6130 code unaltered. */
6131
6132 if (rrc == MATCH_SKIP_ARG &&
6133 PRIV(strcmp)(Fecode + 2, mb->verb_skip_ptr) == 0)
6134 {
6135 mb->verb_skip_ptr = Feptr; /* Pass back current position */
6136 RRETURN(MATCH_SKIP);
6137 }
6138 RRETURN(rrc);
6139
6140 case OP_FAIL:
6141 RRETURN(MATCH_NOMATCH);
6142
6143 /* Record the current recursing group number in mb->verb_current_recurse
6144 when a backtracking return such as MATCH_COMMIT is given. This enables the
6145 recurse processing to catch verbs from within the recursion. */
6146
6147 case OP_COMMIT:
6148 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM13);
6149 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6150 mb->verb_current_recurse = Fcurrent_recurse;
6151 RRETURN(MATCH_COMMIT);
6152
6153 case OP_COMMIT_ARG:
6154 Fmark = mb->nomatch_mark = Fecode + 2;
6155 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM36);
6156 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6157 mb->verb_current_recurse = Fcurrent_recurse;
6158 RRETURN(MATCH_COMMIT);
6159
6160 case OP_PRUNE:
6161 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM14);
6162 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6163 mb->verb_current_recurse = Fcurrent_recurse;
6164 RRETURN(MATCH_PRUNE);
6165
6166 case OP_PRUNE_ARG:
6167 Fmark = mb->nomatch_mark = Fecode + 2;
6168 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM15);
6169 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6170 mb->verb_current_recurse = Fcurrent_recurse;
6171 RRETURN(MATCH_PRUNE);
6172
6173 case OP_SKIP:
6174 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM16);
6175 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6176 mb->verb_skip_ptr = Feptr; /* Pass back current position */
6177 mb->verb_current_recurse = Fcurrent_recurse;
6178 RRETURN(MATCH_SKIP);
6179
6180 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
6181 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
6182 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
6183 that failed and any that precede it (either they also failed, or were not
6184 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
6185 SKIP_ARG gets to top level, the match is re-run with mb->ignore_skip_arg
6186 set to the count of the one that failed. */
6187
6188 case OP_SKIP_ARG:
6189 mb->skip_arg_count++;
6190 if (mb->skip_arg_count <= mb->ignore_skip_arg)
6191 {
6192 Fecode += PRIV(OP_lengths)[*Fecode] + Fecode[1];
6193 break;
6194 }
6195 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM17);
6196 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6197
6198 /* Pass back the current skip name and return the special MATCH_SKIP_ARG
6199 return code. This will either be caught by a matching MARK, or get to the
6200 top, where it causes a rematch with mb->ignore_skip_arg set to the value of
6201 mb->skip_arg_count. */
6202
6203 mb->verb_skip_ptr = Fecode + 2;
6204 mb->verb_current_recurse = Fcurrent_recurse;
6205 RRETURN(MATCH_SKIP_ARG);
6206
6207 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
6208 the branch in which it occurs can be determined. */
6209
6210 case OP_THEN:
6211 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM18);
6212 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6213 mb->verb_ecode_ptr = Fecode;
6214 mb->verb_current_recurse = Fcurrent_recurse;
6215 RRETURN(MATCH_THEN);
6216
6217 case OP_THEN_ARG:
6218 Fmark = mb->nomatch_mark = Fecode + 2;
6219 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM19);
6220 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6221 mb->verb_ecode_ptr = Fecode;
6222 mb->verb_current_recurse = Fcurrent_recurse;
6223 RRETURN(MATCH_THEN);
6224
6225
6226 /* ===================================================================== */
6227 /* There's been some horrible disaster. Arrival here can only mean there is
6228 something seriously wrong in the code above or the OP_xxx definitions. */
6229
6230 default:
6231 return PCRE2_ERROR_INTERNAL;
6232 }
6233
6234 /* Do not insert any code in here without much thought; it is assumed
6235 that "continue" in the code above comes out to here to repeat the main
6236 loop. */
6237
6238 } /* End of main loop */
6239/* Control never reaches here */
6240
6241
6242/* ========================================================================= */
6243/* The RRETURN() macro jumps here. The number that is saved in Freturn_id
6244indicates which label we actually want to return to. The value in Frdepth is
6245the index number of the frame in the vector. The return value has been placed
6246in rrc. */
6247
6248#define LBL(val) case val: goto L_RM##val;
6249
6250RETURN_SWITCH:
6251if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
6252if (Frdepth == 0) return rrc; /* Exit from the top level */
6253F = (heapframe *)((char *)F - Fback_frame); /* Backtrack */
6254mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */
6255
6256#ifdef DEBUG_SHOW_RMATCH
6257fprintf(stderr, "++ RETURN %d to %d\n", rrc, Freturn_id);
6258#endif
6259
6260switch (Freturn_id)
6261 {
6262 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6263 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)
6264 LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)
6265 LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)
6266 LBL(33) LBL(34) LBL(35) LBL(36)
6267
6268#ifdef SUPPORT_WIDE_CHARS
6269 LBL(100) LBL(101)
6270#endif
6271
6272#ifdef SUPPORT_UNICODE
6273 LBL(200) LBL(201) LBL(202) LBL(203) LBL(204) LBL(205) LBL(206)
6274 LBL(207) LBL(208) LBL(209) LBL(210) LBL(211) LBL(212) LBL(213)
6275 LBL(214) LBL(215) LBL(216) LBL(217) LBL(218) LBL(219) LBL(220)
6276 LBL(221) LBL(222) LBL(223) LBL(224) LBL(225)
6277#endif
6278
6279 default:
6280 return PCRE2_ERROR_INTERNAL;
6281 }
6282#undef LBL
6283}
6284
6285
6286/*************************************************
6287* Match a Regular Expression *
6288*************************************************/
6289
6290/* This function applies a compiled pattern to a subject string and picks out
6291portions of the string if it matches. Two elements in the vector are set for
6292each substring: the offsets to the start and end of the substring.
6293
6294Arguments:
6295 code points to the compiled expression
6296 subject points to the subject string
6297 length length of subject string (may contain binary zeros)
6298 start_offset where to start in the subject string
6299 options option bits
6300 match_data points to a match_data block
6301 mcontext points to a PCRE2 match context
6302
6303Returns: > 0 => success; value is the number of ovector pairs filled
6304 = 0 => success, but ovector is not big enough
6305 = -1 => failed to match (PCRE2_ERROR_NOMATCH)
6306 = -2 => partial match (PCRE2_ERROR_PARTIAL)
6307 < -2 => some kind of unexpected problem
6308*/
6309
6310PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
6311pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
6312 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
6313 pcre2_match_context *mcontext)
6314{
6315int rc;
6316int was_zero_terminated = 0;
6317const uint8_t *start_bits = NULL;
6318const pcre2_real_code *re = (const pcre2_real_code *)code;
6319
6320BOOL anchored;
6321BOOL firstline;
6322BOOL has_first_cu = FALSE;
6323BOOL has_req_cu = FALSE;
6324BOOL startline;
6325
6326#if PCRE2_CODE_UNIT_WIDTH == 8
6327PCRE2_SPTR memchr_found_first_cu;
6328PCRE2_SPTR memchr_found_first_cu2;
6329#endif
6330
6331PCRE2_UCHAR first_cu = 0;
6332PCRE2_UCHAR first_cu2 = 0;
6333PCRE2_UCHAR req_cu = 0;
6334PCRE2_UCHAR req_cu2 = 0;
6335
6336PCRE2_SPTR bumpalong_limit;
6337PCRE2_SPTR end_subject;
6338PCRE2_SPTR true_end_subject;
6339PCRE2_SPTR start_match;
6340PCRE2_SPTR req_cu_ptr;
6341PCRE2_SPTR start_partial;
6342PCRE2_SPTR match_partial;
6343
6344#ifdef SUPPORT_JIT
6345BOOL use_jit;
6346#endif
6347
6348/* This flag is needed even when Unicode is not supported for convenience
6349(it is used by the IS_NEWLINE macro). */
6350
6351BOOL utf = FALSE;
6352
6353#ifdef SUPPORT_UNICODE
6354BOOL ucp = FALSE;
6355BOOL allow_invalid;
6356uint32_t fragment_options = 0;
6357#ifdef SUPPORT_JIT
6358BOOL jit_checked_utf = FALSE;
6359#endif
6360#endif /* SUPPORT_UNICODE */
6361
6362PCRE2_SIZE frame_size;
6363PCRE2_SIZE heapframes_size;
6364
6365/* We need to have mb as a pointer to a match block, because the IS_NEWLINE
6366macro is used below, and it expects NLBLOCK to be defined as a pointer. */
6367
6368pcre2_callout_block cb;
6369match_block actual_match_block;
6370match_block *mb = &actual_match_block;
6371
6372/* Recognize NULL, length 0 as an empty string. */
6373
6374if (subject == NULL && length == 0) subject = (PCRE2_SPTR)"";
6375
6376/* Plausibility checks */
6377
6378if ((options & ~PUBLIC_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
6379if (code == NULL || subject == NULL || match_data == NULL)
6380 return PCRE2_ERROR_NULL;
6381
6382start_match = subject + start_offset;
6383req_cu_ptr = start_match - 1;
6384if (length == PCRE2_ZERO_TERMINATED)
6385 {
6386 length = PRIV(strlen)(subject);
6387 was_zero_terminated = 1;
6388 }
6389true_end_subject = end_subject = subject + length;
6390
6391if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
6392
6393/* Check that the first field in the block is the magic number. */
6394
6395if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
6396
6397/* Check the code unit width. */
6398
6399if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
6400 return PCRE2_ERROR_BADMODE;
6401
6402/* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
6403options variable for this function. Users of PCRE2 who are not calling the
6404function directly would like to have a way of setting these flags, in the same
6405way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
6406constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
6407(*NOTEMPTY_ATSTART) set bits in the pattern's "flags" field which we now
6408transfer to the options for this function. The bits are guaranteed to be
6409adjacent, but do not have the same values. This bit of Boolean trickery assumes
6410that the match-time bits are not more significant than the flag bits. If by
6411accident this is not the case, a compile-time division by zero error will
6412occur. */
6413
6414#define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
6415#define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
6416options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
6417#undef FF
6418#undef OO
6419
6420/* If the pattern was successfully studied with JIT support, we will run the
6421JIT executable instead of the rest of this function. Most options must be set
6422at compile time for the JIT code to be usable. */
6423
6424#ifdef SUPPORT_JIT
6425use_jit = (re->executable_jit != NULL &&
6426 (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0);
6427#endif
6428
6429/* Initialize UTF/UCP parameters. */
6430
6431#ifdef SUPPORT_UNICODE
6432utf = (re->overall_options & PCRE2_UTF) != 0;
6433allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0;
6434ucp = (re->overall_options & PCRE2_UCP) != 0;
6435#endif /* SUPPORT_UNICODE */
6436
6437/* Convert the partial matching flags into an integer. */
6438
6439mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 :
6440 ((options & PCRE2_PARTIAL_SOFT) != 0)? 1 : 0;
6441
6442/* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
6443time. */
6444
6445if (mb->partial != 0 &&
6446 ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
6447 return PCRE2_ERROR_BADOPTION;
6448
6449/* It is an error to set an offset limit without setting the flag at compile
6450time. */
6451
6452if (mcontext != NULL && mcontext->offset_limit != PCRE2_UNSET &&
6453 (re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
6454 return PCRE2_ERROR_BADOFFSETLIMIT;
6455
6456/* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,
6457free the memory that was obtained. Set the field to NULL for no match cases. */
6458
6459if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
6460 {
6461 match_data->memctl.free((void *)match_data->subject,
6462 match_data->memctl.memory_data);
6463 match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
6464 }
6465match_data->subject = NULL;
6466
6467/* Zero the error offset in case the first code unit is invalid UTF. */
6468
6469match_data->startchar = 0;
6470
6471
6472/* ============================= JIT matching ============================== */
6473
6474/* Prepare for JIT matching. Check a UTF string for validity unless no check is
6475requested or invalid UTF can be handled. We check only the portion of the
6476subject that might be inspected during matching - from the offset minus the
6477maximum lookbehind to the given length. This saves time when a small part of a
6478large subject is being matched by the use of a starting offset. Note that the
6479maximum lookbehind is a number of characters, not code units. */
6480
6481#ifdef SUPPORT_JIT
6482if (use_jit)
6483 {
6484#ifdef SUPPORT_UNICODE
6485 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0 && !allow_invalid)
6486 {
6487#if PCRE2_CODE_UNIT_WIDTH != 32
6488 unsigned int i;
6489#endif
6490
6491 /* For 8-bit and 16-bit UTF, check that the first code unit is a valid
6492 character start. */
6493
6494#if PCRE2_CODE_UNIT_WIDTH != 32
6495 if (start_match < end_subject && NOT_FIRSTCU(*start_match))
6496 {
6497 if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET;
6498#if PCRE2_CODE_UNIT_WIDTH == 8
6499 return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */
6500#else
6501 return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */
6502#endif
6503 }
6504#endif /* WIDTH != 32 */
6505
6506 /* Move back by the maximum lookbehind, just in case it happens at the very
6507 start of matching. */
6508
6509#if PCRE2_CODE_UNIT_WIDTH != 32
6510 for (i = re->max_lookbehind; i > 0 && start_match > subject; i--)
6511 {
6512 start_match--;
6513 while (start_match > subject &&
6514#if PCRE2_CODE_UNIT_WIDTH == 8
6515 (*start_match & 0xc0) == 0x80)
6516#else /* 16-bit */
6517 (*start_match & 0xfc00) == 0xdc00)
6518#endif
6519 start_match--;
6520 }
6521#else /* PCRE2_CODE_UNIT_WIDTH != 32 */
6522
6523 /* In the 32-bit library, one code unit equals one character. However,
6524 we cannot just subtract the lookbehind and then compare pointers, because
6525 a very large lookbehind could create an invalid pointer. */
6526
6527 if (start_offset >= re->max_lookbehind)
6528 start_match -= re->max_lookbehind;
6529 else
6530 start_match = subject;
6531#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
6532
6533 /* Validate the relevant portion of the subject. Adjust the offset of an
6534 invalid code point to be an absolute offset in the whole string. */
6535
6536 match_data->rc = PRIV(valid_utf)(start_match,
6537 length - (start_match - subject), &(match_data->startchar));
6538 if (match_data->rc != 0)
6539 {
6540 match_data->startchar += start_match - subject;
6541 return match_data->rc;
6542 }
6543 jit_checked_utf = TRUE;
6544 }
6545#endif /* SUPPORT_UNICODE */
6546
6547 /* If JIT returns BADOPTION, which means that the selected complete or
6548 partial matching mode was not compiled, fall through to the interpreter. */
6549
6550 rc = pcre2_jit_match(code, subject, length, start_offset, options,
6551 match_data, mcontext);
6552 if (rc != PCRE2_ERROR_JIT_BADOPTION)
6553 {
6554 if (rc >= 0 && (options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
6555 {
6556 length = CU2BYTES(length + was_zero_terminated);
6557 match_data->subject = match_data->memctl.malloc(length,
6558 match_data->memctl.memory_data);
6559 if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
6560 memcpy((void *)match_data->subject, subject, length);
6561 match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
6562 }
6563 return rc;
6564 }
6565 }
6566#endif /* SUPPORT_JIT */
6567
6568/* ========================= End of JIT matching ========================== */
6569
6570
6571/* Proceed with non-JIT matching. The default is to allow lookbehinds to the
6572start of the subject. A UTF check when there is a non-zero offset may change
6573this. */
6574
6575mb->check_subject = subject;
6576
6577/* If a UTF subject string was not checked for validity in the JIT code above,
6578check it here, and handle support for invalid UTF strings. The check above
6579happens only when invalid UTF is not supported and PCRE2_NO_UTF_CHECK is unset.
6580If we get here in those circumstances, it means the subject string is valid,
6581but for some reason JIT matching was not successful. There is no need to check
6582the subject again.
6583
6584We check only the portion of the subject that might be inspected during
6585matching - from the offset minus the maximum lookbehind to the given length.
6586This saves time when a small part of a large subject is being matched by the
6587use of a starting offset. Note that the maximum lookbehind is a number of
6588characters, not code units.
6589
6590Note also that support for invalid UTF forces a check, overriding the setting
6591of PCRE2_NO_UTF_CHECK. */
6592
6593#ifdef SUPPORT_UNICODE
6594if (utf &&
6595#ifdef SUPPORT_JIT
6596 !jit_checked_utf &&
6597#endif
6598 ((options & PCRE2_NO_UTF_CHECK) == 0 || allow_invalid))
6599 {
6600#if PCRE2_CODE_UNIT_WIDTH != 32
6601 BOOL skipped_bad_start = FALSE;
6602#endif
6603
6604 /* For 8-bit and 16-bit UTF, check that the first code unit is a valid
6605 character start. If we are handling invalid UTF, just skip over such code
6606 units. Otherwise, give an appropriate error. */
6607
6608#if PCRE2_CODE_UNIT_WIDTH != 32
6609 if (allow_invalid)
6610 {
6611 while (start_match < end_subject && NOT_FIRSTCU(*start_match))
6612 {
6613 start_match++;
6614 skipped_bad_start = TRUE;
6615 }
6616 }
6617 else if (start_match < end_subject && NOT_FIRSTCU(*start_match))
6618 {
6619 if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET;
6620#if PCRE2_CODE_UNIT_WIDTH == 8
6621 return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */
6622#else
6623 return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */
6624#endif
6625 }
6626#endif /* WIDTH != 32 */
6627
6628 /* The mb->check_subject field points to the start of UTF checking;
6629 lookbehinds can go back no further than this. */
6630
6631 mb->check_subject = start_match;
6632
6633 /* Move back by the maximum lookbehind, just in case it happens at the very
6634 start of matching, but don't do this if we skipped bad 8-bit or 16-bit code
6635 units above. */
6636
6637#if PCRE2_CODE_UNIT_WIDTH != 32
6638 if (!skipped_bad_start)
6639 {
6640 unsigned int i;
6641 for (i = re->max_lookbehind; i > 0 && mb->check_subject > subject; i--)
6642 {
6643 mb->check_subject--;
6644 while (mb->check_subject > subject &&
6645#if PCRE2_CODE_UNIT_WIDTH == 8
6646 (*mb->check_subject & 0xc0) == 0x80)
6647#else /* 16-bit */
6648 (*mb->check_subject & 0xfc00) == 0xdc00)
6649#endif
6650 mb->check_subject--;
6651 }
6652 }
6653#else /* PCRE2_CODE_UNIT_WIDTH != 32 */
6654
6655 /* In the 32-bit library, one code unit equals one character. However,
6656 we cannot just subtract the lookbehind and then compare pointers, because
6657 a very large lookbehind could create an invalid pointer. */
6658
6659 if (start_offset >= re->max_lookbehind)
6660 mb->check_subject -= re->max_lookbehind;
6661 else
6662 mb->check_subject = subject;
6663#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
6664
6665 /* Validate the relevant portion of the subject. There's a loop in case we
6666 encounter bad UTF in the characters preceding start_match which we are
6667 scanning because of a lookbehind. */
6668
6669 for (;;)
6670 {
6671 match_data->rc = PRIV(valid_utf)(mb->check_subject,
6672 length - (mb->check_subject - subject), &(match_data->startchar));
6673
6674 if (match_data->rc == 0) break; /* Valid UTF string */
6675
6676 /* Invalid UTF string. Adjust the offset to be an absolute offset in the
6677 whole string. If we are handling invalid UTF strings, set end_subject to
6678 stop before the bad code unit, and set the options to "not end of line".
6679 Otherwise return the error. */
6680
6681 match_data->startchar += mb->check_subject - subject;
6682 if (!allow_invalid || match_data->rc > 0) return match_data->rc;
6683 end_subject = subject + match_data->startchar;
6684
6685 /* If the end precedes start_match, it means there is invalid UTF in the
6686 extra code units we reversed over because of a lookbehind. Advance past the
6687 first bad code unit, and then skip invalid character starting code units in
6688 8-bit and 16-bit modes, and try again with the original end point. */
6689
6690 if (end_subject < start_match)
6691 {
6692 mb->check_subject = end_subject + 1;
6693#if PCRE2_CODE_UNIT_WIDTH != 32
6694 while (mb->check_subject < start_match && NOT_FIRSTCU(*mb->check_subject))
6695 mb->check_subject++;
6696#endif
6697 end_subject = true_end_subject;
6698 }
6699
6700 /* Otherwise, set the not end of line option, and do the match. */
6701
6702 else
6703 {
6704 fragment_options = PCRE2_NOTEOL;
6705 break;
6706 }
6707 }
6708 }
6709#endif /* SUPPORT_UNICODE */
6710
6711/* A NULL match context means "use a default context", but we take the memory
6712control functions from the pattern. */
6713
6714if (mcontext == NULL)
6715 {
6716 mcontext = (pcre2_match_context *)(&PRIV(default_match_context));
6717 mb->memctl = re->memctl;
6718 }
6719else mb->memctl = mcontext->memctl;
6720
6721anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0;
6722firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
6723startline = (re->flags & PCRE2_STARTLINE) != 0;
6724bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)?
6725 true_end_subject : subject + mcontext->offset_limit;
6726
6727/* Initialize and set up the fixed fields in the callout block, with a pointer
6728in the match block. */
6729
6730mb->cb = &cb;
6731cb.version = 2;
6732cb.subject = subject;
6733cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
6734cb.callout_flags = 0;
6735
6736/* Fill in the remaining fields in the match block, except for moptions, which
6737gets set later. */
6738
6739mb->callout = mcontext->callout;
6740mb->callout_data = mcontext->callout_data;
6741
6742mb->start_subject = subject;
6743mb->start_offset = start_offset;
6744mb->end_subject = end_subject;
6745mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;
6746mb->allowemptypartial = (re->max_lookbehind > 0) ||
6747 (re->flags & PCRE2_MATCH_EMPTY) != 0;
6748mb->poptions = re->overall_options; /* Pattern options */
6749mb->ignore_skip_arg = 0;
6750mb->mark = mb->nomatch_mark = NULL; /* In case never set */
6751
6752/* The name table is needed for finding all the numbers associated with a
6753given name, for condition testing. The code follows the name table. */
6754
6755mb->name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
6756mb->name_count = re->name_count;
6757mb->name_entry_size = re->name_entry_size;
6758mb->start_code = mb->name_table + re->name_count * re->name_entry_size;
6759
6760/* Process the \R and newline settings. */
6761
6762mb->bsr_convention = re->bsr_convention;
6763mb->nltype = NLTYPE_FIXED;
6764switch(re->newline_convention)
6765 {
6766 case PCRE2_NEWLINE_CR:
6767 mb->nllen = 1;
6768 mb->nl[0] = CHAR_CR;
6769 break;
6770
6771 case PCRE2_NEWLINE_LF:
6772 mb->nllen = 1;
6773 mb->nl[0] = CHAR_NL;
6774 break;
6775
6776 case PCRE2_NEWLINE_NUL:
6777 mb->nllen = 1;
6778 mb->nl[0] = CHAR_NUL;
6779 break;
6780
6781 case PCRE2_NEWLINE_CRLF:
6782 mb->nllen = 2;
6783 mb->nl[0] = CHAR_CR;
6784 mb->nl[1] = CHAR_NL;
6785 break;
6786
6787 case PCRE2_NEWLINE_ANY:
6788 mb->nltype = NLTYPE_ANY;
6789 break;
6790
6791 case PCRE2_NEWLINE_ANYCRLF:
6792 mb->nltype = NLTYPE_ANYCRLF;
6793 break;
6794
6795 default: return PCRE2_ERROR_INTERNAL;
6796 }
6797
6798/* The backtracking frames have fixed data at the front, and a PCRE2_SIZE
6799vector at the end, whose size depends on the number of capturing parentheses in
6800the pattern. It is not used at all if there are no capturing parentheses.
6801
6802 frame_size is the total size of each frame
6803 match_data->heapframes is the pointer to the frames vector
6804 match_data->heapframes_size is the total size of the vector
6805
6806We must pad the frame_size for alignment to ensure subsequent frames are as
6807aligned as heapframe. Whilst ovector is word-aligned due to being a PCRE2_SIZE
6808array, that does not guarantee it is suitably aligned for pointers, as some
6809architectures have pointers that are larger than a size_t. */
6810
6811frame_size = (offsetof(heapframe, ovector) +
6812 re->top_bracket * 2 * sizeof(PCRE2_SIZE) + HEAPFRAME_ALIGNMENT - 1) &
6813 ~(HEAPFRAME_ALIGNMENT - 1);
6814
6815/* Limits set in the pattern override the match context only if they are
6816smaller. */
6817
6818mb->heap_limit = ((mcontext->heap_limit < re->limit_heap)?
6819 mcontext->heap_limit : re->limit_heap) * 1024;
6820
6821mb->match_limit = (mcontext->match_limit < re->limit_match)?
6822 mcontext->match_limit : re->limit_match;
6823
6824mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)?
6825 mcontext->depth_limit : re->limit_depth;
6826
6827/* If a pattern has very many capturing parentheses, the frame size may be very
6828large. Set the initial frame vector size to ensure that there are at least 10
6829available frames, but enforce a minimum of START_FRAMES_SIZE. If this is
6830greater than the heap limit, get as large a vector as possible. Always round
6831the size to a multiple of the frame size. */
6832
6833heapframes_size = frame_size * 10;
6834if (heapframes_size < START_FRAMES_SIZE) heapframes_size = START_FRAMES_SIZE;
6835if (heapframes_size > mb->heap_limit)
6836 {
6837 if (frame_size > mb->heap_limit ) return PCRE2_ERROR_HEAPLIMIT;
6838 heapframes_size = mb->heap_limit;
6839 }
6840
6841/* If an existing frame vector in the match_data block is large enough, we can
6842use it. Otherwise, free any pre-existing vector and get a new one. */
6843
6844if (match_data->heapframes_size < heapframes_size)
6845 {
6846 match_data->memctl.free(match_data->heapframes,
6847 match_data->memctl.memory_data);
6848 match_data->heapframes = match_data->memctl.malloc(heapframes_size,
6849 match_data->memctl.memory_data);
6850 if (match_data->heapframes == NULL)
6851 {
6852 match_data->heapframes_size = 0;
6853 return PCRE2_ERROR_NOMEMORY;
6854 }
6855 match_data->heapframes_size = heapframes_size;
6856 }
6857
6858/* Write to the ovector within the first frame to mark every capture unset and
6859to avoid uninitialized memory read errors when it is copied to a new frame. */
6860
6861memset((char *)(match_data->heapframes) + offsetof(heapframe, ovector), 0xff,
6862 frame_size - offsetof(heapframe, ovector));
6863
6864/* Pointers to the individual character tables */
6865
6866mb->lcc = re->tables + lcc_offset;
6867mb->fcc = re->tables + fcc_offset;
6868mb->ctypes = re->tables + ctypes_offset;
6869
6870/* Set up the first code unit to match, if available. If there's no first code
6871unit there may be a bitmap of possible first characters. */
6872
6873if ((re->flags & PCRE2_FIRSTSET) != 0)
6874 {
6875 has_first_cu = TRUE;
6876 first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
6877 if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
6878 {
6879 first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
6880#ifdef SUPPORT_UNICODE
6881#if PCRE2_CODE_UNIT_WIDTH == 8
6882 if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu);
6883#else
6884 if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu);
6885#endif
6886#endif /* SUPPORT_UNICODE */
6887 }
6888 }
6889else
6890 if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
6891 start_bits = re->start_bitmap;
6892
6893/* There may also be a "last known required character" set. */
6894
6895if ((re->flags & PCRE2_LASTSET) != 0)
6896 {
6897 has_req_cu = TRUE;
6898 req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
6899 if ((re->flags & PCRE2_LASTCASELESS) != 0)
6900 {
6901 req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);
6902#ifdef SUPPORT_UNICODE
6903#if PCRE2_CODE_UNIT_WIDTH == 8
6904 if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu);
6905#else
6906 if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu);
6907#endif
6908#endif /* SUPPORT_UNICODE */
6909 }
6910 }
6911
6912
6913/* ==========================================================================*/
6914
6915/* Loop for handling unanchored repeated matching attempts; for anchored regexs
6916the loop runs just once. */
6917
6918#ifdef SUPPORT_UNICODE
6919FRAGMENT_RESTART:
6920#endif
6921
6922start_partial = match_partial = NULL;
6923mb->hitend = FALSE;
6924
6925#if PCRE2_CODE_UNIT_WIDTH == 8
6926memchr_found_first_cu = NULL;
6927memchr_found_first_cu2 = NULL;
6928#endif
6929
6930for(;;)
6931 {
6932 PCRE2_SPTR new_start_match;
6933
6934 /* ----------------- Start of match optimizations ---------------- */
6935
6936 /* There are some optimizations that avoid running the match if a known
6937 starting point is not found, or if a known later code unit is not present.
6938 However, there is an option (settable at compile time) that disables these,
6939 for testing and for ensuring that all callouts do actually occur. */
6940
6941 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
6942 {
6943 /* If firstline is TRUE, the start of the match is constrained to the first
6944 line of a multiline string. That is, the match must be before or at the
6945 first newline following the start of matching. Temporarily adjust
6946 end_subject so that we stop the scans for a first code unit at a newline.
6947 If the match fails at the newline, later code breaks the loop. */
6948
6949 if (firstline)
6950 {
6951 PCRE2_SPTR t = start_match;
6952#ifdef SUPPORT_UNICODE
6953 if (utf)
6954 {
6955 while (t < end_subject && !IS_NEWLINE(t))
6956 {
6957 t++;
6958 ACROSSCHAR(t < end_subject, t, t++);
6959 }
6960 }
6961 else
6962#endif
6963 while (t < end_subject && !IS_NEWLINE(t)) t++;
6964 end_subject = t;
6965 }
6966
6967 /* Anchored: check the first code unit if one is recorded. This may seem
6968 pointless but it can help in detecting a no match case without scanning for
6969 the required code unit. */
6970
6971 if (anchored)
6972 {
6973 if (has_first_cu || start_bits != NULL)
6974 {
6975 BOOL ok = start_match < end_subject;
6976 if (ok)
6977 {
6978 PCRE2_UCHAR c = UCHAR21TEST(start_match);
6979 ok = has_first_cu && (c == first_cu || c == first_cu2);
6980 if (!ok && start_bits != NULL)
6981 {
6982#if PCRE2_CODE_UNIT_WIDTH != 8
6983 if (c > 255) c = 255;
6984#endif
6985 ok = (start_bits[c/8] & (1u << (c&7))) != 0;
6986 }
6987 }
6988 if (!ok)
6989 {
6990 rc = MATCH_NOMATCH;
6991 break;
6992 }
6993 }
6994 }
6995
6996 /* Not anchored. Advance to a unique first code unit if there is one. */
6997
6998 else
6999 {
7000 if (has_first_cu)
7001 {
7002 if (first_cu != first_cu2) /* Caseless */
7003 {
7004 /* In 16-bit and 32-bit modes we have to do our own search, so can
7005 look for both cases at once. */
7006
7007#if PCRE2_CODE_UNIT_WIDTH != 8
7008 PCRE2_UCHAR smc;
7009 while (start_match < end_subject &&
7010 (smc = UCHAR21TEST(start_match)) != first_cu &&
7011 smc != first_cu2)
7012 start_match++;
7013#else
7014 /* In 8-bit mode, the use of memchr() gives a big speed up, even
7015 though we have to call it twice in order to find the earliest
7016 occurrence of the code unit in either of its cases. Caching is used
7017 to remember the positions of previously found code units. This can
7018 make a huge difference when the strings are very long and only one
7019 case is actually present. */
7020
7021 PCRE2_SPTR pp1 = NULL;
7022 PCRE2_SPTR pp2 = NULL;
7023 PCRE2_SIZE searchlength = end_subject - start_match;
7024
7025 /* If we haven't got a previously found position for first_cu, or if
7026 the current starting position is later, we need to do a search. If
7027 the code unit is not found, set it to the end. */
7028
7029 if (memchr_found_first_cu == NULL ||
7030 start_match > memchr_found_first_cu)
7031 {
7032 pp1 = memchr(start_match, first_cu, searchlength);
7033 memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
7034 }
7035
7036 /* If the start is before a previously found position, use the
7037 previous position, or NULL if a previous search failed. */
7038
7039 else pp1 = (memchr_found_first_cu == end_subject)? NULL :
7040 memchr_found_first_cu;
7041
7042 /* Do the same thing for the other case. */
7043
7044 if (memchr_found_first_cu2 == NULL ||
7045 start_match > memchr_found_first_cu2)
7046 {
7047 pp2 = memchr(start_match, first_cu2, searchlength);
7048 memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
7049 }
7050
7051 else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
7052 memchr_found_first_cu2;
7053
7054 /* Set the start to the end of the subject if neither case was found.
7055 Otherwise, use the earlier found point. */
7056
7057 if (pp1 == NULL)
7058 start_match = (pp2 == NULL)? end_subject : pp2;
7059 else
7060 start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
7061
7062#endif /* 8-bit handling */
7063 }
7064
7065 /* The caseful case is much simpler. */
7066
7067 else
7068 {
7069#if PCRE2_CODE_UNIT_WIDTH != 8
7070 while (start_match < end_subject && UCHAR21TEST(start_match) !=
7071 first_cu)
7072 start_match++;
7073#else
7074 start_match = memchr(start_match, first_cu, end_subject - start_match);
7075 if (start_match == NULL) start_match = end_subject;
7076#endif
7077 }
7078
7079 /* If we can't find the required first code unit, having reached the
7080 true end of the subject, break the bumpalong loop, to force a match
7081 failure, except when doing partial matching, when we let the next cycle
7082 run at the end of the subject. To see why, consider the pattern
7083 /(?<=abc)def/, which partially matches "abc", even though the string
7084 does not contain the starting character "d". If we have not reached the
7085 true end of the subject (PCRE2_FIRSTLINE caused end_subject to be
7086 temporarily modified) we also let the cycle run, because the matching
7087 string is legitimately allowed to start with the first code unit of a
7088 newline. */
7089
7090 if (mb->partial == 0 && start_match >= mb->end_subject)
7091 {
7092 rc = MATCH_NOMATCH;
7093 break;
7094 }
7095 }
7096
7097 /* If there's no first code unit, advance to just after a linebreak for a
7098 multiline match if required. */
7099
7100 else if (startline)
7101 {
7102 if (start_match > mb->start_subject + start_offset)
7103 {
7104#ifdef SUPPORT_UNICODE
7105 if (utf)
7106 {
7107 while (start_match < end_subject && !WAS_NEWLINE(start_match))
7108 {
7109 start_match++;
7110 ACROSSCHAR(start_match < end_subject, start_match, start_match++);
7111 }
7112 }
7113 else
7114#endif
7115 while (start_match < end_subject && !WAS_NEWLINE(start_match))
7116 start_match++;
7117
7118 /* If we have just passed a CR and the newline option is ANY or
7119 ANYCRLF, and we are now at a LF, advance the match position by one
7120 more code unit. */
7121
7122 if (start_match[-1] == CHAR_CR &&
7123 (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
7124 start_match < end_subject &&
7125 UCHAR21TEST(start_match) == CHAR_NL)
7126 start_match++;
7127 }
7128 }
7129
7130 /* If there's no first code unit or a requirement for a multiline line
7131 start, advance to a non-unique first code unit if any have been
7132 identified. The bitmap contains only 256 bits. When code units are 16 or
7133 32 bits wide, all code units greater than 254 set the 255 bit. */
7134
7135 else if (start_bits != NULL)
7136 {
7137 while (start_match < end_subject)
7138 {
7139 uint32_t c = UCHAR21TEST(start_match);
7140#if PCRE2_CODE_UNIT_WIDTH != 8
7141 if (c > 255) c = 255;
7142#endif
7143 if ((start_bits[c/8] & (1u << (c&7))) != 0) break;
7144 start_match++;
7145 }
7146
7147 /* See comment above in first_cu checking about the next few lines. */
7148
7149 if (mb->partial == 0 && start_match >= mb->end_subject)
7150 {
7151 rc = MATCH_NOMATCH;
7152 break;
7153 }
7154 }
7155 } /* End first code unit handling */
7156
7157 /* Restore fudged end_subject */
7158
7159 end_subject = mb->end_subject;
7160
7161 /* The following two optimizations must be disabled for partial matching. */
7162
7163 if (mb->partial == 0)
7164 {
7165 PCRE2_SPTR p;
7166
7167 /* The minimum matching length is a lower bound; no string of that length
7168 may actually match the pattern. Although the value is, strictly, in
7169 characters, we treat it as code units to avoid spending too much time in
7170 this optimization. */
7171
7172 if (end_subject - start_match < re->minlength)
7173 {
7174 rc = MATCH_NOMATCH;
7175 break;
7176 }
7177
7178 /* If req_cu is set, we know that that code unit must appear in the
7179 subject for the (non-partial) match to succeed. If the first code unit is
7180 set, req_cu must be later in the subject; otherwise the test starts at
7181 the match point. This optimization can save a huge amount of backtracking
7182 in patterns with nested unlimited repeats that aren't going to match.
7183 Writing separate code for caseful/caseless versions makes it go faster,
7184 as does using an autoincrement and backing off on a match. As in the case
7185 of the first code unit, using memchr() in the 8-bit library gives a big
7186 speed up. Unlike the first_cu check above, we do not need to call
7187 memchr() twice in the caseless case because we only need to check for the
7188 presence of the character in either case, not find the first occurrence.
7189
7190 The search can be skipped if the code unit was found later than the
7191 current starting point in a previous iteration of the bumpalong loop.
7192
7193 HOWEVER: when the subject string is very, very long, searching to its end
7194 can take a long time, and give bad performance on quite ordinary
7195 anchored patterns. This showed up when somebody was matching something
7196 like /^\d+C/ on a 32-megabyte string... so we don't do this when the
7197 string is sufficiently long, but it's worth searching a lot more for
7198 unanchored patterns. */
7199
7200 p = start_match + (has_first_cu? 1:0);
7201 if (has_req_cu && p > req_cu_ptr)
7202 {
7203 PCRE2_SIZE check_length = end_subject - start_match;
7204
7205 if (check_length < REQ_CU_MAX ||
7206 (!anchored && check_length < REQ_CU_MAX * 1000))
7207 {
7208 if (req_cu != req_cu2) /* Caseless */
7209 {
7210#if PCRE2_CODE_UNIT_WIDTH != 8
7211 while (p < end_subject)
7212 {
7213 uint32_t pp = UCHAR21INCTEST(p);
7214 if (pp == req_cu || pp == req_cu2) { p--; break; }
7215 }
7216#else /* 8-bit code units */
7217 PCRE2_SPTR pp = p;
7218 p = memchr(pp, req_cu, end_subject - pp);
7219 if (p == NULL)
7220 {
7221 p = memchr(pp, req_cu2, end_subject - pp);
7222 if (p == NULL) p = end_subject;
7223 }
7224#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
7225 }
7226
7227 /* The caseful case */
7228
7229 else
7230 {
7231#if PCRE2_CODE_UNIT_WIDTH != 8
7232 while (p < end_subject)
7233 {
7234 if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
7235 }
7236
7237#else /* 8-bit code units */
7238 p = memchr(p, req_cu, end_subject - p);
7239 if (p == NULL) p = end_subject;
7240#endif
7241 }
7242
7243 /* If we can't find the required code unit, break the bumpalong loop,
7244 forcing a match failure. */
7245
7246 if (p >= end_subject)
7247 {
7248 rc = MATCH_NOMATCH;
7249 break;
7250 }
7251
7252 /* If we have found the required code unit, save the point where we
7253 found it, so that we don't search again next time round the bumpalong
7254 loop if the start hasn't yet passed this code unit. */
7255
7256 req_cu_ptr = p;
7257 }
7258 }
7259 }
7260 }
7261
7262 /* ------------ End of start of match optimizations ------------ */
7263
7264 /* Give no match if we have passed the bumpalong limit. */
7265
7266 if (start_match > bumpalong_limit)
7267 {
7268 rc = MATCH_NOMATCH;
7269 break;
7270 }
7271
7272 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
7273 first starting point for which a partial match was found. */
7274
7275 cb.start_match = (PCRE2_SIZE)(start_match - subject);
7276 cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH;
7277
7278 mb->start_used_ptr = start_match;
7279 mb->last_used_ptr = start_match;
7280#ifdef SUPPORT_UNICODE
7281 mb->moptions = options | fragment_options;
7282#else
7283 mb->moptions = options;
7284#endif
7285 mb->match_call_count = 0;
7286 mb->end_offset_top = 0;
7287 mb->skip_arg_count = 0;
7288
7289 rc = match(start_match, mb->start_code, re->top_bracket, frame_size,
7290 match_data, mb);
7291
7292 if (mb->hitend && start_partial == NULL)
7293 {
7294 start_partial = mb->start_used_ptr;
7295 match_partial = start_match;
7296 }
7297
7298 switch(rc)
7299 {
7300 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
7301 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
7302 entirely. The only way we can do that is to re-do the match at the same
7303 point, with a flag to force SKIP with an argument to be ignored. Just
7304 treating this case as NOMATCH does not work because it does not check other
7305 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
7306
7307 case MATCH_SKIP_ARG:
7308 new_start_match = start_match;
7309 mb->ignore_skip_arg = mb->skip_arg_count;
7310 break;
7311
7312 /* SKIP passes back the next starting point explicitly, but if it is no
7313 greater than the match we have just done, treat it as NOMATCH. */
7314
7315 case MATCH_SKIP:
7316 if (mb->verb_skip_ptr > start_match)
7317 {
7318 new_start_match = mb->verb_skip_ptr;
7319 break;
7320 }
7321 /* Fall through */
7322
7323 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
7324 exactly like PRUNE. Unset ignore SKIP-with-argument. */
7325
7326 case MATCH_NOMATCH:
7327 case MATCH_PRUNE:
7328 case MATCH_THEN:
7329 mb->ignore_skip_arg = 0;
7330 new_start_match = start_match + 1;
7331#ifdef SUPPORT_UNICODE
7332 if (utf)
7333 ACROSSCHAR(new_start_match < end_subject, new_start_match,
7334 new_start_match++);
7335#endif
7336 break;
7337
7338 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
7339
7340 case MATCH_COMMIT:
7341 rc = MATCH_NOMATCH;
7342 goto ENDLOOP;
7343
7344 /* Any other return is either a match, or some kind of error. */
7345
7346 default:
7347 goto ENDLOOP;
7348 }
7349
7350 /* Control reaches here for the various types of "no match at this point"
7351 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
7352
7353 rc = MATCH_NOMATCH;
7354
7355 /* If PCRE2_FIRSTLINE is set, the match must happen before or at the first
7356 newline in the subject (though it may continue over the newline). Therefore,
7357 if we have just failed to match, starting at a newline, do not continue. */
7358
7359 if (firstline && IS_NEWLINE(start_match)) break;
7360
7361 /* Advance to new matching position */
7362
7363 start_match = new_start_match;
7364
7365 /* Break the loop if the pattern is anchored or if we have passed the end of
7366 the subject. */
7367
7368 if (anchored || start_match > end_subject) break;
7369
7370 /* If we have just passed a CR and we are now at a LF, and the pattern does
7371 not contain any explicit matches for \r or \n, and the newline option is CRLF
7372 or ANY or ANYCRLF, advance the match position by one more code unit. In
7373 normal matching start_match will always be greater than the first position at
7374 this stage, but a failed *SKIP can cause a return at the same point, which is
7375 why the first test exists. */
7376
7377 if (start_match > subject + start_offset &&
7378 start_match[-1] == CHAR_CR &&
7379 start_match < end_subject &&
7380 *start_match == CHAR_NL &&
7381 (re->flags & PCRE2_HASCRORLF) == 0 &&
7382 (mb->nltype == NLTYPE_ANY ||
7383 mb->nltype == NLTYPE_ANYCRLF ||
7384 mb->nllen == 2))
7385 start_match++;
7386
7387 mb->mark = NULL; /* Reset for start of next match attempt */
7388 } /* End of for(;;) "bumpalong" loop */
7389
7390/* ==========================================================================*/
7391
7392/* When we reach here, one of the following stopping conditions is true:
7393
7394(1) The match succeeded, either completely, or partially;
7395
7396(2) The pattern is anchored or the match was failed after (*COMMIT);
7397
7398(3) We are past the end of the subject or the bumpalong limit;
7399
7400(4) PCRE2_FIRSTLINE is set and we have failed to match at a newline, because
7401 this option requests that a match occur at or before the first newline in
7402 the subject.
7403
7404(5) Some kind of error occurred.
7405
7406*/
7407
7408ENDLOOP:
7409
7410/* If end_subject != true_end_subject, it means we are handling invalid UTF,
7411and have just processed a non-terminal fragment. If this resulted in no match
7412or a partial match we must carry on to the next fragment (a partial match is
7413returned to the caller only at the very end of the subject). A loop is used to
7414avoid trying to match against empty fragments; if the pattern can match an
7415empty string it would have done so already. */
7416
7417#ifdef SUPPORT_UNICODE
7418if (utf && end_subject != true_end_subject &&
7419 (rc == MATCH_NOMATCH || rc == PCRE2_ERROR_PARTIAL))
7420 {
7421 for (;;)
7422 {
7423 /* Advance past the first bad code unit, and then skip invalid character
7424 starting code units in 8-bit and 16-bit modes. */
7425
7426 start_match = end_subject + 1;
7427
7428#if PCRE2_CODE_UNIT_WIDTH != 32
7429 while (start_match < true_end_subject && NOT_FIRSTCU(*start_match))
7430 start_match++;
7431#endif
7432
7433 /* If we have hit the end of the subject, there isn't another non-empty
7434 fragment, so give up. */
7435
7436 if (start_match >= true_end_subject)
7437 {
7438 rc = MATCH_NOMATCH; /* In case it was partial */
7439 break;
7440 }
7441
7442 /* Check the rest of the subject */
7443
7444 mb->check_subject = start_match;
7445 rc = PRIV(valid_utf)(start_match, length - (start_match - subject),
7446 &(match_data->startchar));
7447
7448 /* The rest of the subject is valid UTF. */
7449
7450 if (rc == 0)
7451 {
7452 mb->end_subject = end_subject = true_end_subject;
7453 fragment_options = PCRE2_NOTBOL;
7454 goto FRAGMENT_RESTART;
7455 }
7456
7457 /* A subsequent UTF error has been found; if the next fragment is
7458 non-empty, set up to process it. Otherwise, let the loop advance. */
7459
7460 else if (rc < 0)
7461 {
7462 mb->end_subject = end_subject = start_match + match_data->startchar;
7463 if (end_subject > start_match)
7464 {
7465 fragment_options = PCRE2_NOTBOL|PCRE2_NOTEOL;
7466 goto FRAGMENT_RESTART;
7467 }
7468 }
7469 }
7470 }
7471#endif /* SUPPORT_UNICODE */
7472
7473/* Fill in fields that are always returned in the match data. */
7474
7475match_data->code = re;
7476match_data->mark = mb->mark;
7477match_data->matchedby = PCRE2_MATCHEDBY_INTERPRETER;
7478
7479/* Handle a fully successful match. Set the return code to the number of
7480captured strings, or 0 if there were too many to fit into the ovector, and then
7481set the remaining returned values before returning. Make a copy of the subject
7482string if requested. */
7483
7484if (rc == MATCH_MATCH)
7485 {
7486 match_data->rc = ((int)mb->end_offset_top >= 2 * match_data->oveccount)?
7487 0 : (int)mb->end_offset_top/2 + 1;
7488 match_data->startchar = start_match - subject;
7489 match_data->leftchar = mb->start_used_ptr - subject;
7490 match_data->rightchar = ((mb->last_used_ptr > mb->end_match_ptr)?
7491 mb->last_used_ptr : mb->end_match_ptr) - subject;
7492 if ((options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
7493 {
7494 length = CU2BYTES(length + was_zero_terminated);
7495 match_data->subject = match_data->memctl.malloc(length,
7496 match_data->memctl.memory_data);
7497 if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
7498 memcpy((void *)match_data->subject, subject, length);
7499 match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
7500 }
7501 else match_data->subject = subject;
7502 return match_data->rc;
7503 }
7504
7505/* Control gets here if there has been a partial match, an error, or if the
7506overall match attempt has failed at all permitted starting positions. Any mark
7507data is in the nomatch_mark field. */
7508
7509match_data->mark = mb->nomatch_mark;
7510
7511/* For anything other than nomatch or partial match, just return the code. */
7512
7513if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL) match_data->rc = rc;
7514
7515/* Handle a partial match. If a "soft" partial match was requested, searching
7516for a complete match will have continued, and the value of rc at this point
7517will be MATCH_NOMATCH. For a "hard" partial match, it will already be
7518PCRE2_ERROR_PARTIAL. */
7519
7520else if (match_partial != NULL)
7521 {
7522 match_data->subject = subject;
7523 match_data->ovector[0] = match_partial - subject;
7524 match_data->ovector[1] = end_subject - subject;
7525 match_data->startchar = match_partial - subject;
7526 match_data->leftchar = start_partial - subject;
7527 match_data->rightchar = end_subject - subject;
7528 match_data->rc = PCRE2_ERROR_PARTIAL;
7529 }
7530
7531/* Else this is the classic nomatch case. */
7532
7533else match_data->rc = PCRE2_ERROR_NOMATCH;
7534
7535return match_data->rc;
7536}
7537
7538/* These #undefs are here to enable unity builds with CMake. */
7539
7540#undef NLBLOCK /* Block containing newline information */
7541#undef PSSTART /* Field containing processed string start */
7542#undef PSEND /* Field containing processed string end */
7543
7544/* End of pcre2_match.c */
7545