1/*************************************************
2* Perl-Compatible Regular Expressions *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2018 University of Cambridge
10
11-----------------------------------------------------------------------------
12Redistribution and use in source and binary forms, with or without
13modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36POSSIBILITY OF SUCH DAMAGE.
37-----------------------------------------------------------------------------
38*/
39
40/* This module contains pcre_exec(), the externally visible function that does
41pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42possible. There are also some static supporting functions. */
43
/* Silence two MSVC diagnostics for this translation unit. "#pragma warning" is
an MSVC extension, so it is only emitted for compilers that identify
themselves as MSVC; other compilers would report it as an unknown pragma. */

#ifdef _MSC_VER
#pragma warning( disable : 4127)  /* conditional expression is constant */
#pragma warning( disable : 4244)  /* conversion from 'int' to 'unsigned short', possible loss of data */
#endif
46
47#include "pcre_config.h"
48
49#define NLBLOCK md /* Block containing newline information */
50#define PSSTART start_subject /* Field containing processed string start */
51#define PSEND end_subject /* Field containing processed string end */
52
53#include "pcre_internal.h"
54
55/* Undefine some potentially clashing cpp symbols */
56
57#undef min
58#undef max
59
60/* The md->capture_last field uses the lower 16 bits for the last captured
61substring (which can never be greater than 65535) and a bit in the top half
62to mean "capture vector overflowed". This odd way of doing things was
63implemented when it was realized that preserving and restoring the overflow bit
64whenever the last capture number was saved/restored made for a neater
65interface, and doing it this way saved on (a) another variable, which would
66have increased the stack frame size (a big NO-NO in PCRE) and (b) another
67separate set of save/restore instructions. The following defines are used in
68implementing this. */
69
70#define CAPLMASK 0x0000ffff /* The bits used for last_capture */
71#define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */
72#define OVFLBIT 0x00010000 /* The bit that is set for overflow */
73
74/* Values for setting in md->match_function_type to indicate two special types
75of call to match(). We do it this way to save on using another stack variable,
76as stack usage is to be discouraged. */
77
78#define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
79#define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
80
/* Ordinary (non-error) results of match(). Error results are the externally
defined PCRE_ERROR_xxx codes, all of which are negative. */

#define MATCH_MATCH          1
#define MATCH_NOMATCH        0

/* Internal-only results of match(). They are chosen to be far enough below
zero that they cannot collide with the external error codes. */

#define MATCH_ACCEPT         (-999)
#define MATCH_KETRPOS        (-998)
#define MATCH_ONCE           (-997)

/* The five backtracking-verb results form one contiguous, ascending run so
that "is this any of them?" can be answered with a single range test against
MATCH_BACKTRACK_MIN and MATCH_BACKTRACK_MAX. Each one is written relative to
MATCH_COMMIT to keep the run contiguous. */

#define MATCH_COMMIT         (-996)
#define MATCH_PRUNE          (MATCH_COMMIT + 1)
#define MATCH_SKIP           (MATCH_COMMIT + 2)
#define MATCH_SKIP_ARG       (MATCH_COMMIT + 3)
#define MATCH_THEN           (MATCH_COMMIT + 4)
#define MATCH_BACKTRACK_MIN  MATCH_COMMIT
#define MATCH_BACKTRACK_MAX  MATCH_THEN
102
103/* Maximum number of ints of offset to save on the stack for recursive calls.
104If the offset vector is bigger, malloc is used. This should be a multiple of 3,
105because the offset vector is always a multiple of 3 long. */
106
107#define REC_STACK_SAVE_MAX 30
108
/* Minimum and maximum repeat counts for the common repeats. The two tables are
parallel: entry n of rep_min and entry n of rep_max describe the same repeat.
In rep_max a value of 0 means "no upper limit".

   index:                       0  1  2  3  4  5  6  7  8  9  10            */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1 };
113
114#ifdef PCRE_DEBUG
115/*************************************************
116* Debugging function to print chars *
117*************************************************/
118
119/* Print a sequence of chars in printable format, stopping at the end of the
120subject if the requested.
121
122Arguments:
123 p points to characters
124 length number to print
125 is_subject TRUE if printing from within md->start_subject
126 md pointer to matching data block, if is_subject is TRUE
127
128Returns: nothing
129*/
130
131static void
132pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
133{
134pcre_uint32 c;
135BOOL utf = md->utf;
136if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
137while (length-- > 0)
138 if (isprint(c = UCHAR21INCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
139}
140#endif
141
142
143
144/*************************************************
145* Match a back-reference *
146*************************************************/
147
148/* Normally, if a back reference hasn't been set, the length that is passed is
149negative, so the match always fails. However, in JavaScript compatibility mode,
150the length passed is zero. Note that in caseless UTF-8 mode, the number of
151subject bytes matched may be different to the number of reference bytes.
152
153Arguments:
154 offset index into the offset vector
155 eptr pointer into the subject
156 length length of reference to be matched (number of bytes)
157 md points to match data block
158 caseless TRUE if caseless
159
160Returns: >= 0 the number of subject bytes matched
161 -1 no match
162 -2 partial match; always given if at end subject
163*/
164
165static int
166match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
167 BOOL caseless)
168{
169PCRE_PUCHAR eptr_start = eptr;
170register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
171#if defined SUPPORT_UTF && defined SUPPORT_UCP
172BOOL utf = md->utf;
173#endif
174
175#ifdef PCRE_DEBUG
176if (eptr >= md->end_subject)
177 printf("matching subject <null>");
178else
179 {
180 printf("matching subject ");
181 pchars(eptr, length, TRUE, md);
182 }
183printf(" against backref ");
184pchars(p, length, FALSE, md);
185printf("\n");
186#endif
187
188/* Always fail if reference not set (and not JavaScript compatible - in that
189case the length is passed as zero). */
190
191if (length < 0) return -1;
192
193/* Separate the caseless case for speed. In UTF-8 mode we can only do this
194properly if Unicode properties are supported. Otherwise, we can check only
195ASCII characters. */
196
197if (caseless)
198 {
199#if defined SUPPORT_UTF && defined SUPPORT_UCP
200 if (utf)
201 {
202 /* Match characters up to the end of the reference. NOTE: the number of
203 data units matched may differ, because in UTF-8 there are some characters
204 whose upper and lower case versions code have different numbers of bytes.
205 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
206 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
207 sequence of two of the latter. It is important, therefore, to check the
208 length along the reference, not along the subject (earlier code did this
209 wrong). */
210
211 PCRE_PUCHAR endptr = p + length;
212 while (p < endptr)
213 {
214 pcre_uint32 c, d;
215 const ucd_record *ur;
216 if (eptr >= md->end_subject) return -2; /* Partial match */
217 GETCHARINC(c, eptr);
218 GETCHARINC(d, p);
219 ur = GET_UCD(d);
220 if (c != d && c != d + ur->other_case)
221 {
222 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
223 for (;;)
224 {
225 if (c < *pp) return -1;
226 if (c == *pp++) break;
227 }
228 }
229 }
230 }
231 else
232#endif
233
234 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
235 is no UCP support. */
236 {
237 while (length-- > 0)
238 {
239 pcre_uint32 cc, cp;
240 if (eptr >= md->end_subject) return -2; /* Partial match */
241 cc = UCHAR21TEST(eptr);
242 cp = UCHAR21TEST(p);
243 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
244 p++;
245 eptr++;
246 }
247 }
248 }
249
250/* In the caseful case, we can just compare the bytes, whether or not we
251are in UTF-8 mode. */
252
253else
254 {
255 while (length-- > 0)
256 {
257 if (eptr >= md->end_subject) return -2; /* Partial match */
258 if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1;
259 }
260 }
261
262return (int)(eptr - eptr_start);
263}
264
265
266
267/***************************************************************************
268****************************************************************************
269 RECURSION IN THE match() FUNCTION
270
271The match() function is highly recursive, though not every recursive call
272increases the recursive depth. Nevertheless, some regular expressions can cause
273it to recurse to a great depth. I was writing for Unix, so I just let it call
274itself recursively. This uses the stack for saving everything that has to be
275saved for a recursive call. On Unix, the stack can be large, and this works
276fine.
277
278It turns out that on some non-Unix-like systems there are problems with
279programs that use a lot of stack. (This despite the fact that every last chip
280has oodles of memory these days, and techniques for extending the stack have
281been known for decades.) So....
282
283There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
284calls by keeping local variables that need to be preserved in blocks of memory
obtained from malloc() instead of on the stack. Macros are used to
286achieve this so that the actual code doesn't look very different to what it
287always used to.
288
289The original heap-recursive code used longjmp(). However, it seems that this
290can be very slow on some operating systems. Following a suggestion from Stan
291Switzer, the use of longjmp() has been abolished, at the cost of having to
292provide a unique number for each call to RMATCH. There is no way of generating
293a sequence of numbers at compile time in C. I have given them names, to make
294them stand out more clearly.
295
296Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
297FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
298tests. Furthermore, not using longjmp() means that local dynamic variables
299don't have indeterminate values; this has meant that the frame size can be
300reduced because the result can be "passed back" by straight setting of the
301variable instead of being passed in the frame.
302****************************************************************************
303***************************************************************************/
304
/* One identifying number for each RMATCH call site, counting up from 1. When
this list is changed, the code at HEAP_RETURN below must be updated to
match. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,
  RM9,     RM10, RM11, RM12, RM13, RM14, RM15, RM16,
  RM17,    RM18, RM19, RM20, RM21, RM22, RM23, RM24,
  RM25,    RM26, RM27, RM28, RM29, RM30, RM31, RM32,
  RM33,    RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41,    RM42, RM43, RM44, RM45, RM46, RM47, RM48,
  RM49,    RM50, RM51, RM52, RM53, RM54, RM55, RM56,
  RM57,    RM58, RM59, RM60, RM61, RM62, RM63, RM64,
  RM65,    RM66, RM67
};
315
316/* These versions of the macros use the stack, as normal. There are debugging
317versions and production versions. Note that the "rw" argument of RMATCH isn't
318actually used in this definition. */
319
320#ifndef NO_RECURSE
321#define REGISTER register
322
/* Stack-based RMATCH/RRETURN. RMATCH makes a genuine recursive call of match()
with the current mstart and a recursion depth one greater, leaving the result
in rrc; its "rw" argument is not used here. RRETURN is a plain return. The
debugging versions additionally trace each call and return. They are wrapped in
do { } while (0) so that they behave as a single statement wherever they are
used, and RRETURN evaluates its argument only once. */

#ifdef PCRE_DEBUG
#define RMATCH(ra,rb,rc,rd,re,rw) \
  do { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  } while (0)
#define RRETURN(ra) \
  do { \
  rrc = (ra); \
  printf("match() returned %d from line %d\n", rrc, __LINE__); \
  return rrc; \
  } while (0)
#else
#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
#define RRETURN(ra) return ra
#endif
340
341#else
342
343
/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes.

RMATCH reuses the frame chained after the current one, or allocates and chains
a new one if there is none. It then records the return point (rw) in the
current frame, fills in the new frame's arguments, makes the new frame current
and jumps to HEAP_RECURSE. The L_<rw> label is where control re-enters once the
"call" has returned. The order of the assignments matters: the argument
expressions are evaluated while "frame" still refers to the calling frame.

RRETURN steps back to the previous frame. If there is one, the result is left
in rrc and control goes to HEAP_RETURN; otherwise this was the outermost frame
and match() really returns.

Both macros are wrapped in do { } while (0) so that they behave as a single
statement wherever they are used. */

#define REGISTER

#define RMATCH(ra,rb,rc,rd,re,rw)\
  do {\
  heapframe *newframe = frame->Xnextframe;\
  if (newframe == NULL)\
    {\
    newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
    if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
    newframe->Xnextframe = NULL;\
    frame->Xnextframe = newframe;\
    }\
  frame->Xwhere = rw;\
  newframe->Xeptr = (ra);\
  newframe->Xecode = (rb);\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = (rc);\
  newframe->Xeptrb = (re);\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  } while (0)

#define RRETURN(ra)\
  do {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  if (frame != NULL)\
    {\
    rrc = (ra);\
    goto HEAP_RETURN;\
    }\
  return (ra);\
  } while (0)
386
387
388/* Structure for remembering the local variables in a private frame */
389
390typedef struct heapframe {
391 struct heapframe *Xprevframe;
392 struct heapframe *Xnextframe;
393
394 /* Function arguments that may change */
395
396 PCRE_PUCHAR Xeptr;
397 const pcre_uchar *Xecode;
398 PCRE_PUCHAR Xmstart;
399 int Xoffset_top;
400 eptrblock *Xeptrb;
401 unsigned int Xrdepth;
402
403 /* Function local variables */
404
405 PCRE_PUCHAR Xcallpat;
406#ifdef SUPPORT_UTF
407 PCRE_PUCHAR Xcharptr;
408#endif
409 PCRE_PUCHAR Xdata;
410 PCRE_PUCHAR Xnext;
411 PCRE_PUCHAR Xpp;
412 PCRE_PUCHAR Xprev;
413 PCRE_PUCHAR Xsaved_eptr;
414
415 recursion_info Xnew_recursive;
416
417 BOOL Xcur_is_word;
418 BOOL Xcondition;
419 BOOL Xprev_is_word;
420
421#ifdef SUPPORT_UCP
422 int Xprop_type;
423 unsigned int Xprop_value;
424 int Xprop_fail_result;
425 int Xoclength;
426 pcre_uchar Xocchars[6];
427#endif
428
429 int Xcodelink;
430 int Xctype;
431 unsigned int Xfc;
432 int Xfi;
433 int Xlength;
434 int Xmax;
435 int Xmin;
436 unsigned int Xnumber;
437 int Xoffset;
438 unsigned int Xop;
439 pcre_int32 Xsave_capture_last;
440 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
441 int Xstacksave[REC_STACK_SAVE_MAX];
442
443 eptrblock Xnewptrb;
444
445 /* Where to jump back to */
446
447 int Xwhere;
448
449} heapframe;
450
451#endif
452
453
454/***************************************************************************
455***************************************************************************/
456
457
458
459/*************************************************
460* Match from current position *
461*************************************************/
462
463/* This function is called recursively in many circumstances. Whenever it
464returns a negative (error) response, the outer incarnation must also return the
465same response. */
466
/* These macros pack up tests that are used for partial matching, and which
appear several times in the code. CHECK_PARTIAL() sets the "hit end" flag if
the pointer is at the end of the subject and also past md->start_used_ptr
(i.e. something has been inspected). For hard partial matching (md->partial
greater than 1) it then returns PCRE_ERROR_PARTIAL immediately.
SCHECK_PARTIAL() is the same without the end-of-subject test, for use when we
already know we are at the end of the subject.

Each macro is wrapped in do { } while (0) so that it behaves as a single
statement: a bare "if" here would capture a following "else" at the point of
use. */

#define CHECK_PARTIAL()\
  do {\
  if (md->partial != 0 && eptr >= md->end_subject && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    } \
  } while (0)

#define SCHECK_PARTIAL()\
  do {\
  if (md->partial != 0 && eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    } \
  } while (0)
488
489
490/* Performance note: It might be tempting to extract commonly used fields from
491the md structure (e.g. utf, end_subject) into individual variables to improve
492performance. Tests using gcc on a SPARC disproved this; in the first case, it
493made performance worse.
494
495Arguments:
496 eptr pointer to current character in subject
497 ecode pointer to current position in compiled code
498 mstart pointer to the current match start position (can be modified
499 by encountering \K)
500 offset_top current top pointer
501 md pointer to "static" info for the match
502 eptrb pointer to chain of blocks containing eptr at start of
503 brackets - for testing for empty matches
504 rdepth the recursion depth
505
506Returns: MATCH_MATCH if matched ) these values are >= 0
507 MATCH_NOMATCH if failed to match )
508 a negative MATCH_xxx value for PRUNE, SKIP, etc
509 a negative PCRE_ERROR_xxx value if aborted by an error condition
510 (e.g. stopped by repeated call or recursion limit)
511*/
512
513static int
514match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
515 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
516 unsigned int rdepth)
517{
518/* These variables do not need to be preserved over recursion in this function,
519so they can be ordinary variables in all cases. Mark some of them with
520"register" because they are used a lot in loops. */
521
522register int rrc; /* Returns from recursive calls */
523register int i; /* Used for loops not involving calls to RMATCH() */
524register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
525register BOOL utf; /* Local copy of UTF flag for speed */
526
527BOOL minimize, possessive; /* Quantifier options */
528BOOL caseless;
529int condcode;
530
531/* When recursion is not being used, all "local" variables that have to be
532preserved over calls to RMATCH() are part of a "frame". We set up the top-level
533frame on the stack here; subsequent instantiations are obtained from the heap
534whenever RMATCH() does a "recursion". See the macro definitions above. Putting
535the top-level on the stack rather than malloc-ing them all gives a performance
536boost in many cases where there is not much "recursion". */
537
538#ifdef NO_RECURSE
539heapframe *frame = (heapframe *)md->match_frames_base;
540
541/* Copy in the original argument variables */
542
543frame->Xeptr = eptr;
544frame->Xecode = ecode;
545frame->Xmstart = mstart;
546frame->Xoffset_top = offset_top;
547frame->Xeptrb = eptrb;
548frame->Xrdepth = rdepth;
549
/* This is where control jumps back to in order to effect "recursion" */
551
552HEAP_RECURSE:
553
554/* Macros make the argument variables come from the current frame */
555
556#define eptr frame->Xeptr
557#define ecode frame->Xecode
558#define mstart frame->Xmstart
559#define offset_top frame->Xoffset_top
560#define eptrb frame->Xeptrb
561#define rdepth frame->Xrdepth
562
563/* Ditto for the local variables */
564
565#ifdef SUPPORT_UTF
566#define charptr frame->Xcharptr
567#endif
568#define callpat frame->Xcallpat
569#define codelink frame->Xcodelink
570#define data frame->Xdata
571#define next frame->Xnext
572#define pp frame->Xpp
573#define prev frame->Xprev
574#define saved_eptr frame->Xsaved_eptr
575
576#define new_recursive frame->Xnew_recursive
577
578#define cur_is_word frame->Xcur_is_word
579#define condition frame->Xcondition
580#define prev_is_word frame->Xprev_is_word
581
582#ifdef SUPPORT_UCP
583#define prop_type frame->Xprop_type
584#define prop_value frame->Xprop_value
585#define prop_fail_result frame->Xprop_fail_result
586#define oclength frame->Xoclength
587#define occhars frame->Xocchars
588#endif
589
590#define ctype frame->Xctype
591#define fc frame->Xfc
592#define fi frame->Xfi
593#define length frame->Xlength
594#define max frame->Xmax
595#define min frame->Xmin
596#define number frame->Xnumber
597#define offset frame->Xoffset
598#define op frame->Xop
599#define save_capture_last frame->Xsave_capture_last
600#define save_offset1 frame->Xsave_offset1
601#define save_offset2 frame->Xsave_offset2
602#define save_offset3 frame->Xsave_offset3
603#define stacksave frame->Xstacksave
604
605#define newptrb frame->Xnewptrb
606
607/* When recursion is being used, local variables are allocated on the stack and
608get preserved during recursion in the normal way. In this environment, fi and
609i, and fc and c, can be the same variables. */
610
611#else /* NO_RECURSE not defined */
612#define fi i
613#define fc c
614
615/* Many of the following variables are used only in small blocks of the code.
616My normal style of coding would have declared them within each of those blocks.
617However, in order to accommodate the version of this code that uses an external
618"stack" implemented on the heap, it is easier to declare them all here, so the
619declarations can be cut out in a block. The only declarations within blocks
620below are for variables that do not have to be preserved over a recursive call
621to RMATCH(). */
622
623#ifdef SUPPORT_UTF
624const pcre_uchar *charptr;
625#endif
626const pcre_uchar *callpat;
627const pcre_uchar *data;
628const pcre_uchar *next;
629PCRE_PUCHAR pp;
630const pcre_uchar *prev;
631PCRE_PUCHAR saved_eptr;
632
633recursion_info new_recursive;
634
635BOOL cur_is_word;
636BOOL condition;
637BOOL prev_is_word;
638
639#ifdef SUPPORT_UCP
640int prop_type;
641unsigned int prop_value;
642int prop_fail_result;
643int oclength;
644pcre_uchar occhars[6];
645#endif
646
647int codelink;
648int ctype;
649int length;
650int max;
651int min;
652unsigned int number;
653int offset;
654unsigned int op;
655pcre_int32 save_capture_last;
656int save_offset1, save_offset2, save_offset3;
657int stacksave[REC_STACK_SAVE_MAX];
658
659eptrblock newptrb;
660
661/* There is a special fudge for calling match() in a way that causes it to
662measure the size of its basic stack frame when the stack is being used for
663recursion. The second argument (ecode) being NULL triggers this behaviour. It
664cannot normally ever be NULL. The return is the negated value of the frame
665size. */
666
667if (ecode == NULL)
668 {
669 if (rdepth == 0)
670 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
671 else
672 {
673 int len = (int)((char *)&rdepth - (char *)eptr);
674 return (len > 0)? -len : len;
675 }
676 }
677#endif /* NO_RECURSE */
678
679/* To save space on the stack and in the heap frame, I have doubled up on some
680of the local variables that are used only in localised parts of the code, but
681still need to be preserved over recursive calls of match(). These macros define
682the alternative names that are used. */
683
684#define allow_zero cur_is_word
685#define cbegroup condition
686#define code_offset codelink
687#define condassert condition
688#define matched_once prev_is_word
689#define foc number
690#define save_mark data
691
/* These statements are here to stop the compiler complaining about uninitialized
693variables. */
694
695#ifdef SUPPORT_UCP
696prop_value = 0;
697prop_fail_result = 0;
698#endif
699
700
701/* This label is used for tail recursion, which is used in a few cases even
702when NO_RECURSE is not defined, in order to reduce the amount of stack that is
703used. Thanks to Ian Taylor for noticing this possibility and sending the
704original patch. */
705
706TAIL_RECURSE:
707
708/* OK, now we can get on with the real code of the function. Recursive calls
709are specified by the macro RMATCH and RRETURN is used to return. When
710NO_RECURSE is *not* defined, these just turn into a recursive call to match()
711and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
712defined). However, RMATCH isn't like a function call because it's quite a
713complicated macro. It has to be used in one particular way. This shouldn't,
714however, impact performance when true recursion is being used. */
715
716#ifdef SUPPORT_UTF
717utf = md->utf; /* Local copy of the flag */
718#else
719utf = FALSE;
720#endif
721
722/* First check that we haven't called match() too many times, or that we
723haven't exceeded the recursive call limit. */
724
725if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
726if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
727
728/* At the start of a group with an unlimited repeat that may match an empty
729string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
730done this way to save having to use another function argument, which would take
731up space on the stack. See also MATCH_CONDASSERT below.
732
733When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
734such remembered pointers, to be checked when we hit the closing ket, in order
735to break infinite loops that match no characters. When match() is called in
736other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
737NOT be used with tail recursion, because the memory block that is used is on
738the stack, so a new one may be required for each match(). */
739
740if (md->match_function_type == MATCH_CBEGROUP)
741 {
742 newptrb.epb_saved_eptr = eptr;
743 newptrb.epb_prev = eptrb;
744 eptrb = &newptrb;
745 md->match_function_type = 0;
746 }
747
748/* Now start processing the opcodes. */
749
750for (;;)
751 {
752 minimize = possessive = FALSE;
753 op = *ecode;
754
755 switch(op)
756 {
757 case OP_MARK:
758 md->nomatch_mark = ecode + 2;
759 md->mark = NULL; /* In case previously set by assertion */
760 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
761 eptrb, RM55);
762 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
763 md->mark == NULL) md->mark = ecode + 2;
764
765 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
766 argument, and we must check whether that argument matches this MARK's
767 argument. It is passed back in md->start_match_ptr (an overloading of that
768 variable). If it does match, we reset that variable to the current subject
769 position and return MATCH_SKIP. Otherwise, pass back the return code
770 unaltered. */
771
772 else if (rrc == MATCH_SKIP_ARG &&
773 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
774 {
775 md->start_match_ptr = eptr;
776 RRETURN(MATCH_SKIP);
777 }
778 RRETURN(rrc);
779
780 case OP_FAIL:
781 RRETURN(MATCH_NOMATCH);
782
783 case OP_COMMIT:
784 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
785 eptrb, RM52);
786 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
787 RRETURN(MATCH_COMMIT);
788
789 case OP_PRUNE:
790 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
791 eptrb, RM51);
792 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
793 RRETURN(MATCH_PRUNE);
794
795 case OP_PRUNE_ARG:
796 md->nomatch_mark = ecode + 2;
797 md->mark = NULL; /* In case previously set by assertion */
798 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
799 eptrb, RM56);
800 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
801 md->mark == NULL) md->mark = ecode + 2;
802 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
803 RRETURN(MATCH_PRUNE);
804
805 case OP_SKIP:
806 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
807 eptrb, RM53);
808 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
809 md->start_match_ptr = eptr; /* Pass back current position */
810 RRETURN(MATCH_SKIP);
811
812 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
813 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
814 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
815 that failed and any that precede it (either they also failed, or were not
816 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
817 SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg
818 set to the count of the one that failed. */
819
820 case OP_SKIP_ARG:
821 md->skip_arg_count++;
822 if (md->skip_arg_count <= md->ignore_skip_arg)
823 {
824 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
825 break;
826 }
827 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
828 eptrb, RM57);
829 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
830
831 /* Pass back the current skip name by overloading md->start_match_ptr and
832 returning the special MATCH_SKIP_ARG return code. This will either be
833 caught by a matching MARK, or get to the top, where it causes a rematch
834 with md->ignore_skip_arg set to the value of md->skip_arg_count. */
835
836 md->start_match_ptr = ecode + 2;
837 RRETURN(MATCH_SKIP_ARG);
838
839 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
840 the branch in which it occurs can be determined. Overload the start of
841 match pointer to do this. */
842
843 case OP_THEN:
844 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
845 eptrb, RM54);
846 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
847 md->start_match_ptr = ecode;
848 RRETURN(MATCH_THEN);
849
850 case OP_THEN_ARG:
851 md->nomatch_mark = ecode + 2;
852 md->mark = NULL; /* In case previously set by assertion */
853 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
854 md, eptrb, RM58);
855 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
856 md->mark == NULL) md->mark = ecode + 2;
857 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
858 md->start_match_ptr = ecode;
859 RRETURN(MATCH_THEN);
860
861 /* Handle an atomic group that does not contain any capturing parentheses.
862 This can be handled like an assertion. Prior to 8.13, all atomic groups
863 were handled this way. In 8.13, the code was changed as below for ONCE, so
864 that backups pass through the group and thereby reset captured values.
865 However, this uses a lot more stack, so in 8.20, atomic groups that do not
866 contain any captures generate OP_ONCE_NC, which can be handled in the old,
867 less stack intensive way.
868
869 Check the alternative branches in turn - the matching won't pass the KET
870 for this kind of subpattern. If any one branch matches, we carry on as at
871 the end of a normal bracket, leaving the subject pointer, but resetting
872 the start-of-match value in case it was changed by \K. */
873
874 case OP_ONCE_NC:
875 prev = ecode;
876 saved_eptr = eptr;
877 save_mark = md->mark;
878 do
879 {
880 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
881 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
882 {
883 mstart = md->start_match_ptr;
884 break;
885 }
886 if (rrc == MATCH_THEN)
887 {
888 next = ecode + GET(ecode,1);
889 if (md->start_match_ptr < next &&
890 (*ecode == OP_ALT || *next == OP_ALT))
891 rrc = MATCH_NOMATCH;
892 }
893
894 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
895 ecode += GET(ecode,1);
896 md->mark = save_mark;
897 }
898 while (*ecode == OP_ALT);
899
900 /* If we hit the end of the group (which could be repeated), fail */
901
902 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
903
904 /* Continue as from after the group, updating the offsets high water
905 mark, since extracts may have been taken. */
906
907 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
908
909 offset_top = md->end_offset_top;
910 eptr = md->end_match_ptr;
911
912 /* For a non-repeating ket, just continue at this level. This also
913 happens for a repeating ket if no characters were matched in the group.
914 This is the forcible breaking of infinite loops as implemented in Perl
915 5.005. */
916
917 if (*ecode == OP_KET || eptr == saved_eptr)
918 {
919 ecode += 1+LINK_SIZE;
920 break;
921 }
922
923 /* The repeating kets try the rest of the pattern or restart from the
924 preceding bracket, in the appropriate order. The second "call" of match()
925 uses tail recursion, to avoid using another stack frame. */
926
927 if (*ecode == OP_KETRMIN)
928 {
929 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
930 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
931 ecode = prev;
932 goto TAIL_RECURSE;
933 }
934 else /* OP_KETRMAX */
935 {
936 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
937 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
938 ecode += 1 + LINK_SIZE;
939 goto TAIL_RECURSE;
940 }
941 /* Control never gets here */
942
943 /* Handle a capturing bracket, other than those that are possessive with an
944 unlimited repeat. If there is space in the offset vector, save the current
945 subject position in the working slot at the top of the vector. We mustn't
946 change the current values of the data slot, because they may be set from a
947 previous iteration of this group, and be referred to by a reference inside
948 the group. A failure to match might occur after the group has succeeded,
949 if something later on doesn't match. For this reason, we need to restore
950 the working value and also the values of the final offsets, in case they
951 were set by a previous iteration of the same bracket.
952
953 If there isn't enough space in the offset vector, treat this as if it were
954 a non-capturing bracket. Don't worry about setting the flag for the error
955 case here; that is handled in the code for KET. */
956
957 case OP_CBRA:
958 case OP_SCBRA:
959 number = GET2(ecode, 1+LINK_SIZE);
960 offset = number << 1;
961
962#ifdef PCRE_DEBUG
963 printf("start bracket %d\n", number);
964 printf("subject=");
965 pchars(eptr, 16, TRUE, md);
966 printf("\n");
967#endif
968
969 if (offset < md->offset_max)
970 {
971 save_offset1 = md->offset_vector[offset];
972 save_offset2 = md->offset_vector[offset+1];
973 save_offset3 = md->offset_vector[md->offset_end - number];
974 save_capture_last = md->capture_last;
975 save_mark = md->mark;
976
977 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
978 md->offset_vector[md->offset_end - number] =
979 (int)(eptr - md->start_subject);
980
981 for (;;)
982 {
983 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
984 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
985 eptrb, RM1);
986 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
987
988 /* If we backed up to a THEN, check whether it is within the current
989 branch by comparing the address of the THEN that is passed back with
990 the end of the branch. If it is within the current branch, and the
991 branch is one of two or more alternatives (it either starts or ends
992 with OP_ALT), we have reached the limit of THEN's action, so convert
993 the return code to NOMATCH, which will cause normal backtracking to
994 happen from now on. Otherwise, THEN is passed back to an outer
995 alternative. This implements Perl's treatment of parenthesized groups,
996 where a group not containing | does not affect the current alternative,
997 that is, (X) is NOT the same as (X|(*F)). */
998
999 if (rrc == MATCH_THEN)
1000 {
1001 next = ecode + GET(ecode,1);
1002 if (md->start_match_ptr < next &&
1003 (*ecode == OP_ALT || *next == OP_ALT))
1004 rrc = MATCH_NOMATCH;
1005 }
1006
1007 /* Anything other than NOMATCH is passed back. */
1008
1009 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1010 md->capture_last = save_capture_last;
1011 ecode += GET(ecode, 1);
1012 md->mark = save_mark;
1013 if (*ecode != OP_ALT) break;
1014 }
1015
1016 DPRINTF(("bracket %d failed\n", number));
1017 md->offset_vector[offset] = save_offset1;
1018 md->offset_vector[offset+1] = save_offset2;
1019 md->offset_vector[md->offset_end - number] = save_offset3;
1020
1021 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1022
1023 RRETURN(rrc);
1024 }
1025
1026 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1027 as a non-capturing bracket. */
1028
1029 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1030 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1031
1032 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1033
1034 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1035 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1036
1037 /* Non-capturing or atomic group, except for possessive with unlimited
1038 repeat and ONCE group with no captures. Loop for all the alternatives.
1039
1040 When we get to the final alternative within the brackets, we used to return
1041 the result of a recursive call to match() whatever happened so it was
1042 possible to reduce stack usage by turning this into a tail recursion,
1043 except in the case of a possibly empty group. However, now that there is
1044 the possibility of (*THEN) occurring in the final alternative, this
1045 optimization is no longer always possible.
1046
1047 We can optimize if we know there are no (*THEN)s in the pattern; at present
1048 this is the best that can be done.
1049
1050 MATCH_ONCE is returned when the end of an atomic group is successfully
1051 reached, but subsequent matching fails. It passes back up the tree (causing
1052 captured values to be reset) until the original atomic group level is
1053 reached. This is tested by comparing md->once_target with the start of the
1054 group. At this point, the return is converted into MATCH_NOMATCH so that
1055 previous backup points can be taken. */
1056
1057 case OP_ONCE:
1058 case OP_BRA:
1059 case OP_SBRA:
1060 DPRINTF(("start non-capturing bracket\n"));
1061
1062 for (;;)
1063 {
1064 if (op >= OP_SBRA || op == OP_ONCE)
1065 md->match_function_type = MATCH_CBEGROUP;
1066
1067 /* If this is not a possibly empty group, and there are no (*THEN)s in
1068 the pattern, and this is the final alternative, optimize as described
1069 above. */
1070
1071 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1072 {
1073 ecode += PRIV(OP_lengths)[*ecode];
1074 goto TAIL_RECURSE;
1075 }
1076
1077 /* In all other cases, we have to make another call to match(). */
1078
1079 save_mark = md->mark;
1080 save_capture_last = md->capture_last;
1081 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1082 RM2);
1083
1084 /* See comment in the code for capturing groups above about handling
1085 THEN. */
1086
1087 if (rrc == MATCH_THEN)
1088 {
1089 next = ecode + GET(ecode,1);
1090 if (md->start_match_ptr < next &&
1091 (*ecode == OP_ALT || *next == OP_ALT))
1092 rrc = MATCH_NOMATCH;
1093 }
1094
1095 if (rrc != MATCH_NOMATCH)
1096 {
1097 if (rrc == MATCH_ONCE)
1098 {
1099 const pcre_uchar *scode = ecode;
1100 if (*scode != OP_ONCE) /* If not at start, find it */
1101 {
1102 while (*scode == OP_ALT) scode += GET(scode, 1);
1103 scode -= GET(scode, 1);
1104 }
1105 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1106 }
1107 RRETURN(rrc);
1108 }
1109 ecode += GET(ecode, 1);
1110 md->mark = save_mark;
1111 if (*ecode != OP_ALT) break;
1112 md->capture_last = save_capture_last;
1113 }
1114
1115 RRETURN(MATCH_NOMATCH);
1116
1117 /* Handle possessive capturing brackets with an unlimited repeat. We come
1118 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1119 handled similarly to the normal case above. However, the matching is
1120 different. The end of these brackets will always be OP_KETRPOS, which
1121 returns MATCH_KETRPOS without going further in the pattern. By this means
1122 we can handle the group by iteration rather than recursion, thereby
1123 reducing the amount of stack needed. */
1124
1125 case OP_CBRAPOS:
1126 case OP_SCBRAPOS:
1127 allow_zero = FALSE;
1128
1129 POSSESSIVE_CAPTURE:
1130 number = GET2(ecode, 1+LINK_SIZE);
1131 offset = number << 1;
1132
1133#ifdef PCRE_DEBUG
1134 printf("start possessive bracket %d\n", number);
1135 printf("subject=");
1136 pchars(eptr, 16, TRUE, md);
1137 printf("\n");
1138#endif
1139
1140 if (offset >= md->offset_max) goto POSSESSIVE_NON_CAPTURE;
1141
1142 matched_once = FALSE;
1143 code_offset = (int)(ecode - md->start_code);
1144
1145 save_offset1 = md->offset_vector[offset];
1146 save_offset2 = md->offset_vector[offset+1];
1147 save_offset3 = md->offset_vector[md->offset_end - number];
1148 save_capture_last = md->capture_last;
1149
1150 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1151
1152 /* Each time round the loop, save the current subject position for use
1153 when the group matches. For MATCH_MATCH, the group has matched, so we
1154 restart it with a new subject starting position, remembering that we had
1155 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1156 usual. If we haven't matched any alternatives in any iteration, check to
1157 see if a previous iteration matched. If so, the group has matched;
1158 continue from afterwards. Otherwise it has failed; restore the previous
1159 capture values before returning NOMATCH. */
1160
1161 for (;;)
1162 {
1163 md->offset_vector[md->offset_end - number] =
1164 (int)(eptr - md->start_subject);
1165 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1166 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1167 eptrb, RM63);
1168 if (rrc == MATCH_KETRPOS)
1169 {
1170 offset_top = md->end_offset_top;
1171 ecode = md->start_code + code_offset;
1172 save_capture_last = md->capture_last;
1173 matched_once = TRUE;
1174 mstart = md->start_match_ptr; /* In case \K changed it */
1175 if (eptr == md->end_match_ptr) /* Matched an empty string */
1176 {
1177 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1178 break;
1179 }
1180 eptr = md->end_match_ptr;
1181 continue;
1182 }
1183
1184 /* See comment in the code for capturing groups above about handling
1185 THEN. */
1186
1187 if (rrc == MATCH_THEN)
1188 {
1189 next = ecode + GET(ecode,1);
1190 if (md->start_match_ptr < next &&
1191 (*ecode == OP_ALT || *next == OP_ALT))
1192 rrc = MATCH_NOMATCH;
1193 }
1194
1195 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1196 md->capture_last = save_capture_last;
1197 ecode += GET(ecode, 1);
1198 if (*ecode != OP_ALT) break;
1199 }
1200
1201 if (!matched_once)
1202 {
1203 md->offset_vector[offset] = save_offset1;
1204 md->offset_vector[offset+1] = save_offset2;
1205 md->offset_vector[md->offset_end - number] = save_offset3;
1206 }
1207
1208 if (allow_zero || matched_once)
1209 {
1210 ecode += 1 + LINK_SIZE;
1211 break;
1212 }
1213
1214 RRETURN(MATCH_NOMATCH);
1215
1216 /* Non-capturing possessive bracket with unlimited repeat. We come here
1217 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1218 without the capturing complication. It is written out separately for speed
1219 and cleanliness. */
1220
1221 case OP_BRAPOS:
1222 case OP_SBRAPOS:
1223 allow_zero = FALSE;
1224
1225 POSSESSIVE_NON_CAPTURE:
1226 matched_once = FALSE;
1227 code_offset = (int)(ecode - md->start_code);
1228 save_capture_last = md->capture_last;
1229
1230 for (;;)
1231 {
1232 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1233 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1234 eptrb, RM48);
1235 if (rrc == MATCH_KETRPOS)
1236 {
1237 offset_top = md->end_offset_top;
1238 ecode = md->start_code + code_offset;
1239 matched_once = TRUE;
1240 mstart = md->start_match_ptr; /* In case \K reset it */
1241 if (eptr == md->end_match_ptr) /* Matched an empty string */
1242 {
1243 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1244 break;
1245 }
1246 eptr = md->end_match_ptr;
1247 continue;
1248 }
1249
1250 /* See comment in the code for capturing groups above about handling
1251 THEN. */
1252
1253 if (rrc == MATCH_THEN)
1254 {
1255 next = ecode + GET(ecode,1);
1256 if (md->start_match_ptr < next &&
1257 (*ecode == OP_ALT || *next == OP_ALT))
1258 rrc = MATCH_NOMATCH;
1259 }
1260
1261 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1262 ecode += GET(ecode, 1);
1263 if (*ecode != OP_ALT) break;
1264 md->capture_last = save_capture_last;
1265 }
1266
1267 if (matched_once || allow_zero)
1268 {
1269 ecode += 1 + LINK_SIZE;
1270 break;
1271 }
1272 RRETURN(MATCH_NOMATCH);
1273
1274 /* Control never reaches here. */
1275
1276 /* Conditional group: compilation checked that there are no more than two
1277 branches. If the condition is false, skipping the first branch takes us
1278 past the end of the item if there is only one branch, but that's exactly
1279 what we want. */
1280
1281 case OP_COND:
1282 case OP_SCOND:
1283
1284 /* The variable codelink will be added to ecode when the condition is
1285 false, to get to the second branch. Setting it to the offset to the ALT
1286 or KET, then incrementing ecode achieves this effect. We now have ecode
1287 pointing to the condition or callout. */
1288
1289 codelink = GET(ecode, 1); /* Offset to the second branch */
1290 ecode += 1 + LINK_SIZE; /* From this opcode */
1291
1292 /* Because of the way auto-callout works during compile, a callout item is
1293 inserted between OP_COND and an assertion condition. */
1294
1295 if (*ecode == OP_CALLOUT)
1296 {
1297 if (PUBL(callout) != NULL)
1298 {
1299 PUBL(callout_block) cb;
1300 cb.version = 2; /* Version 1 of the callout block */
1301 cb.callout_number = ecode[1];
1302 cb.offset_vector = md->offset_vector;
1303#if defined COMPILE_PCRE8
1304 cb.subject = (PCRE_SPTR)md->start_subject;
1305#elif defined COMPILE_PCRE16
1306 cb.subject = (PCRE_SPTR16)md->start_subject;
1307#elif defined COMPILE_PCRE32
1308 cb.subject = (PCRE_SPTR32)md->start_subject;
1309#endif
1310 cb.subject_length = (int)(md->end_subject - md->start_subject);
1311 cb.start_match = (int)(mstart - md->start_subject);
1312 cb.current_position = (int)(eptr - md->start_subject);
1313 cb.pattern_position = GET(ecode, 2);
1314 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1315 cb.capture_top = offset_top/2;
1316 cb.capture_last = md->capture_last & CAPLMASK;
1317 /* Internal change requires this for API compatibility. */
1318 if (cb.capture_last == 0) cb.capture_last = -1;
1319 cb.callout_data = md->callout_data;
1320 cb.mark = md->nomatch_mark;
1321 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1322 if (rrc < 0) RRETURN(rrc);
1323 }
1324
1325 /* Advance ecode past the callout, so it now points to the condition. We
1326 must adjust codelink so that the value of ecode+codelink is unchanged. */
1327
1328 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1329 codelink -= PRIV(OP_lengths)[OP_CALLOUT];
1330 }
1331
1332 /* Test the various possible conditions */
1333
1334 condition = FALSE;
1335 switch(condcode = *ecode)
1336 {
1337 case OP_RREF: /* Numbered group recursion test */
1338 if (md->recursive != NULL) /* Not recursing => FALSE */
1339 {
1340 unsigned int recno = GET2(ecode, 1); /* Recursion group number*/
1341 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1342 }
1343 break;
1344
1345 case OP_DNRREF: /* Duplicate named group recursion test */
1346 if (md->recursive != NULL)
1347 {
1348 int count = GET2(ecode, 1 + IMM2_SIZE);
1349 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1350 while (count-- > 0)
1351 {
1352 unsigned int recno = GET2(slot, 0);
1353 condition = recno == md->recursive->group_num;
1354 if (condition) break;
1355 slot += md->name_entry_size;
1356 }
1357 }
1358 break;
1359
1360 case OP_CREF: /* Numbered group used test */
1361 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1362 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1363 break;
1364
1365 case OP_DNCREF: /* Duplicate named group used test */
1366 {
1367 int count = GET2(ecode, 1 + IMM2_SIZE);
1368 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1369 while (count-- > 0)
1370 {
1371 offset = GET2(slot, 0) << 1;
1372 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1373 if (condition) break;
1374 slot += md->name_entry_size;
1375 }
1376 }
1377 break;
1378
1379 case OP_DEF: /* DEFINE - always false */
1380 case OP_FAIL: /* From optimized (?!) condition */
1381 break;
1382
1383 /* The condition is an assertion. Call match() to evaluate it - setting
1384 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end
1385 of an assertion. */
1386
1387 default:
1388 md->match_function_type = MATCH_CONDASSERT;
1389 RMATCH(eptr, ecode, offset_top, md, NULL, RM3);
1390 if (rrc == MATCH_MATCH)
1391 {
1392 if (md->end_offset_top > offset_top)
1393 offset_top = md->end_offset_top; /* Captures may have happened */
1394 condition = TRUE;
1395
1396 /* Advance ecode past the assertion to the start of the first branch,
1397 but adjust it so that the general choosing code below works. If the
1398 assertion has a quantifier that allows zero repeats we must skip over
1399 the BRAZERO. This is a lunatic thing to do, but somebody did! */
1400
1401 if (*ecode == OP_BRAZERO) ecode++;
1402 ecode += GET(ecode, 1);
1403 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1404 ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode];
1405 }
1406
1407 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1408 assertion; it is therefore treated as NOMATCH. Any other return is an
1409 error. */
1410
1411 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1412 {
1413 RRETURN(rrc); /* Need braces because of following else */
1414 }
1415 break;
1416 }
1417
1418 /* Choose branch according to the condition */
1419
1420 ecode += condition? PRIV(OP_lengths)[condcode] : codelink;
1421
1422 /* We are now at the branch that is to be obeyed. As there is only one, we
1423 can use tail recursion to avoid using another stack frame, except when
1424 there is unlimited repeat of a possibly empty group. In the latter case, a
1425 recursive call to match() is always required, unless the second alternative
1426 doesn't exist, in which case we can just plough on. Note that, for
1427 compatibility with Perl, the | in a conditional group is NOT treated as
1428 creating two alternatives. If a THEN is encountered in the branch, it
1429 propagates out to the enclosing alternative (unless nested in a deeper set
1430 of alternatives, of course). */
1431
1432 if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT)
1433 {
1434 if (op != OP_SCOND)
1435 {
1436 goto TAIL_RECURSE;
1437 }
1438
1439 md->match_function_type = MATCH_CBEGROUP;
1440 RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
1441 RRETURN(rrc);
1442 }
1443
1444 /* Condition false & no alternative; continue after the group. */
1445
1446 else
1447 {
1448 }
1449 break;
1450
1451
1452 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1453 to close any currently open capturing brackets. */
1454
1455 case OP_CLOSE:
1456 number = GET2(ecode, 1); /* Must be less than 65536 */
1457 offset = number << 1;
1458
1459#ifdef PCRE_DEBUG
1460 printf("end bracket %d at *ACCEPT", number);
1461 printf("\n");
1462#endif
1463
1464 md->capture_last = (md->capture_last & OVFLMASK) | number;
1465 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1466 {
1467 md->offset_vector[offset] =
1468 md->offset_vector[md->offset_end - number];
1469 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1470
1471 /* If this group is at or above the current highwater mark, ensure that
1472 any groups between the current high water mark and this group are marked
1473 unset and then update the high water mark. */
1474
1475 if (offset >= offset_top)
1476 {
1477 register int *iptr = md->offset_vector + offset_top;
1478 register int *iend = md->offset_vector + offset;
1479 while (iptr < iend) *iptr++ = -1;
1480 offset_top = offset + 2;
1481 }
1482 }
1483 ecode += 1 + IMM2_SIZE;
1484 break;
1485
1486
1487 /* End of the pattern, either real or forced. */
1488
1489 case OP_END:
1490 case OP_ACCEPT:
1491 case OP_ASSERT_ACCEPT:
1492
1493 /* If we have matched an empty string, fail if not in an assertion and not
1494 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1495 is set and we have matched at the start of the subject. In both cases,
1496 backtracking will then try other alternatives, if any. */
1497
1498 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1499 md->recursive == NULL &&
1500 (md->notempty ||
1501 (md->notempty_atstart &&
1502 mstart == md->start_subject + md->start_offset)))
1503 RRETURN(MATCH_NOMATCH);
1504
1505 /* Otherwise, we have a match. */
1506
1507 md->end_match_ptr = eptr; /* Record where we ended */
1508 md->end_offset_top = offset_top; /* and how many extracts were taken */
1509 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1510
1511 /* For some reason, the macros don't work properly if an expression is
1512 given as the argument to RRETURN when the heap is in use. */
1513
1514 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1515 RRETURN(rrc);
1516
1517 /* Assertion brackets. Check the alternative branches in turn - the
1518 matching won't pass the KET for an assertion. If any one branch matches,
1519 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1520 start of each branch to move the current point backwards, so the code at
1521 this level is identical to the lookahead case. When the assertion is part
1522 of a condition, we want to return immediately afterwards. The caller of
1523 this incarnation of the match() function will have set MATCH_CONDASSERT in
1524 md->match_function_type, and one of these opcodes will be the first opcode
1525 that is processed. We use a local variable that is preserved over calls to
1526 match() to remember this case. */
1527
1528 case OP_ASSERT:
1529 case OP_ASSERTBACK:
1530 save_mark = md->mark;
1531 if (md->match_function_type == MATCH_CONDASSERT)
1532 {
1533 condassert = TRUE;
1534 md->match_function_type = 0;
1535 }
1536 else condassert = FALSE;
1537
1538 /* Loop for each branch */
1539
1540 do
1541 {
1542 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1543
1544 /* A match means that the assertion is true; break out of the loop
1545 that matches its alternatives. */
1546
1547 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1548 {
1549 mstart = md->start_match_ptr; /* In case \K reset it */
1550 break;
1551 }
1552
1553 /* If not matched, restore the previous mark setting. */
1554
1555 md->mark = save_mark;
1556
1557 /* See comment in the code for capturing groups above about handling
1558 THEN. */
1559
1560 if (rrc == MATCH_THEN)
1561 {
1562 next = ecode + GET(ecode,1);
1563 if (md->start_match_ptr < next &&
1564 (*ecode == OP_ALT || *next == OP_ALT))
1565 rrc = MATCH_NOMATCH;
1566 }
1567
1568 /* Anything other than NOMATCH causes the entire assertion to fail,
1569 passing back the return code. This includes COMMIT, SKIP, PRUNE and an
1570 uncaptured THEN, which means they take their normal effect. This
1571 consistent approach does not always have exactly the same effect as in
1572 Perl. */
1573
1574 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1575 ecode += GET(ecode, 1);
1576 }
1577 while (*ecode == OP_ALT); /* Continue for next alternative */
1578
1579 /* If we have tried all the alternative branches, the assertion has
1580 failed. If not, we broke out after a match. */
1581
1582 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1583
1584 /* If checking an assertion for a condition, return MATCH_MATCH. */
1585
1586 if (condassert) RRETURN(MATCH_MATCH);
1587
1588 /* Continue from after a successful assertion, updating the offsets high
1589 water mark, since extracts may have been taken during the assertion. */
1590
1591 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1592 ecode += 1 + LINK_SIZE;
1593 offset_top = md->end_offset_top;
1594 continue;
1595
1596 /* Negative assertion: all branches must fail to match for the assertion to
1597 succeed. */
1598
1599 case OP_ASSERT_NOT:
1600 case OP_ASSERTBACK_NOT:
1601 save_mark = md->mark;
1602 if (md->match_function_type == MATCH_CONDASSERT)
1603 {
1604 condassert = TRUE;
1605 md->match_function_type = 0;
1606 }
1607 else condassert = FALSE;
1608
1609 /* Loop for each alternative branch. */
1610
1611 do
1612 {
1613 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1614 md->mark = save_mark; /* Always restore the mark setting */
1615
1616 switch(rrc)
1617 {
1618 case MATCH_MATCH: /* A successful match means */
1619 case MATCH_ACCEPT: /* the assertion has failed. */
1620 RRETURN(MATCH_NOMATCH);
1621
1622 case MATCH_NOMATCH: /* Carry on with next branch */
1623 break;
1624
1625 /* See comment in the code for capturing groups above about handling
1626 THEN. */
1627
1628 case MATCH_THEN:
1629 next = ecode + GET(ecode,1);
1630 if (md->start_match_ptr < next &&
1631 (*ecode == OP_ALT || *next == OP_ALT))
1632 {
1633 rrc = MATCH_NOMATCH;
1634 break;
1635 }
1636 /* Otherwise fall through. */
1637
1638 /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole
1639 assertion to fail to match, without considering any more alternatives.
1640 Failing to match means the assertion is true. This is a consistent
1641 approach, but does not always have the same effect as in Perl. */
1642
1643 case MATCH_COMMIT:
1644 case MATCH_SKIP:
1645 case MATCH_SKIP_ARG:
1646 case MATCH_PRUNE:
1647 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1648 goto NEG_ASSERT_TRUE; /* Break out of alternation loop */
1649
1650 /* Anything else is an error */
1651
1652 default:
1653 RRETURN(rrc);
1654 }
1655
1656 /* Continue with next branch */
1657
1658 ecode += GET(ecode,1);
1659 }
1660 while (*ecode == OP_ALT);
1661
1662 /* All branches in the assertion failed to match. */
1663
1664 NEG_ASSERT_TRUE:
1665 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1666 ecode += 1 + LINK_SIZE; /* Continue with current branch */
1667 continue;
1668
1669 /* Move the subject pointer back. This occurs only at the start of
1670 each branch of a lookbehind assertion. If we are too close to the start to
1671 move back, this match function fails. When working with UTF-8 we move
1672 back a number of characters, not bytes. */
1673
1674 case OP_REVERSE:
1675#ifdef SUPPORT_UTF
1676 if (utf)
1677 {
1678 i = GET(ecode, 1);
1679 while (i-- > 0)
1680 {
1681 eptr--;
1682 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1683 BACKCHAR(eptr);
1684 }
1685 }
1686 else
1687#endif
1688
1689 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1690
1691 {
1692 eptr -= GET(ecode, 1);
1693 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1694 }
1695
1696 /* Save the earliest consulted character, then skip to next op code */
1697
1698 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1699 ecode += 1 + LINK_SIZE;
1700 break;
1701
1702 /* The callout item calls an external function, if one is provided, passing
1703 details of the match so far. This is mainly for debugging, though the
1704 function is able to force a failure. */
1705
1706 case OP_CALLOUT:
1707 if (PUBL(callout) != NULL)
1708 {
1709 PUBL(callout_block) cb;
1710 cb.version = 2; /* Version 1 of the callout block */
1711 cb.callout_number = ecode[1];
1712 cb.offset_vector = md->offset_vector;
1713#if defined COMPILE_PCRE8
1714 cb.subject = (PCRE_SPTR)md->start_subject;
1715#elif defined COMPILE_PCRE16
1716 cb.subject = (PCRE_SPTR16)md->start_subject;
1717#elif defined COMPILE_PCRE32
1718 cb.subject = (PCRE_SPTR32)md->start_subject;
1719#endif
1720 cb.subject_length = (int)(md->end_subject - md->start_subject);
1721 cb.start_match = (int)(mstart - md->start_subject);
1722 cb.current_position = (int)(eptr - md->start_subject);
1723 cb.pattern_position = GET(ecode, 2);
1724 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1725 cb.capture_top = offset_top/2;
1726 cb.capture_last = md->capture_last & CAPLMASK;
1727 /* Internal change requires this for API compatibility. */
1728 if (cb.capture_last == 0) cb.capture_last = -1;
1729 cb.callout_data = md->callout_data;
1730 cb.mark = md->nomatch_mark;
1731 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1732 if (rrc < 0) RRETURN(rrc);
1733 }
1734 ecode += 2 + 2*LINK_SIZE;
1735 break;
1736
1737 /* Recursion either matches the current regex, or some subexpression. The
1738 offset data is the offset to the starting bracket from the start of the
1739 whole pattern. (This is so that it works from duplicated subpatterns.)
1740
1741 The state of the capturing groups is preserved over recursion, and
1742 re-instated afterwards. We don't know how many are started and not yet
1743 finished (offset_top records the completed total) so we just have to save
1744 all the potential data. There may be up to 65535 such values, which is too
1745 large to put on the stack, but using malloc for small numbers seems
1746 expensive. As a compromise, the stack is used when there are no more than
1747 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1748
1749 There are also other values that have to be saved. We use a chained
1750 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1751 for the original version of this logic. It has, however, been hacked around
1752 a lot, so he is not to blame for the current way it works. */
1753
1754 case OP_RECURSE:
1755 {
1756 recursion_info *ri;
1757 unsigned int recno;
1758
1759 callpat = md->start_code + GET(ecode, 1);
1760 recno = (callpat == md->start_code)? 0 :
1761 GET2(callpat, 1 + LINK_SIZE);
1762
1763 /* Check for repeating a recursion without advancing the subject pointer.
1764 This should catch convoluted mutual recursions. (Some simple cases are
1765 caught at compile time.) */
1766
1767 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1768 if (recno == ri->group_num && eptr == ri->subject_position)
1769 RRETURN(PCRE_ERROR_RECURSELOOP);
1770
1771 /* Add to "recursing stack" */
1772
1773 new_recursive.group_num = recno;
1774 new_recursive.saved_capture_last = md->capture_last;
1775 new_recursive.subject_position = eptr;
1776 new_recursive.prevrec = md->recursive;
1777 md->recursive = &new_recursive;
1778
1779 /* Where to continue from afterwards */
1780
1781 ecode += 1 + LINK_SIZE;
1782
1783 /* Now save the offset data */
1784
1785 new_recursive.saved_max = md->offset_end;
1786 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1787 new_recursive.offset_save = stacksave;
1788 else
1789 {
1790 new_recursive.offset_save =
1791 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1792 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1793 }
1794 memcpy(new_recursive.offset_save, md->offset_vector,
1795 new_recursive.saved_max * sizeof(int));
1796
1797 /* OK, now we can do the recursion. After processing each alternative,
1798 restore the offset data and the last captured value. If there were nested
1799 recursions, md->recursive might be changed, so reset it before looping.
1800 */
1801
1802 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1803 cbegroup = (*callpat >= OP_SBRA);
1804 do
1805 {
1806 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1807 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1808 md, eptrb, RM6);
1809 memcpy(md->offset_vector, new_recursive.offset_save,
1810 new_recursive.saved_max * sizeof(int));
1811 md->capture_last = new_recursive.saved_capture_last;
1812 md->recursive = new_recursive.prevrec;
1813 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1814 {
1815 DPRINTF(("Recursion matched\n"));
1816 if (new_recursive.offset_save != stacksave)
1817 (PUBL(free))(new_recursive.offset_save);
1818
1819 /* Set where we got to in the subject, and reset the start in case
1820 it was changed by \K. This *is* propagated back out of a recursion,
1821 for Perl compatibility. */
1822
1823 eptr = md->end_match_ptr;
1824 mstart = md->start_match_ptr;
1825 goto RECURSION_MATCHED; /* Exit loop; end processing */
1826 }
1827
1828 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
1829 recursion; they cause a NOMATCH for the entire recursion. These codes
1830 are defined in a range that can be tested for. */
1831
1832 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
1833 {
1834 if (new_recursive.offset_save != stacksave)
1835 (PUBL(free))(new_recursive.offset_save);
1836 RRETURN(MATCH_NOMATCH);
1837 }
1838
1839 /* Any return code other than NOMATCH is an error. */
1840
1841 if (rrc != MATCH_NOMATCH)
1842 {
1843 DPRINTF(("Recursion gave error %d\n", rrc));
1844 if (new_recursive.offset_save != stacksave)
1845 (PUBL(free))(new_recursive.offset_save);
1846 RRETURN(rrc);
1847 }
1848
1849 md->recursive = &new_recursive;
1850 callpat += GET(callpat, 1);
1851 }
1852 while (*callpat == OP_ALT);
1853
1854 DPRINTF(("Recursion didn't match\n"));
1855 md->recursive = new_recursive.prevrec;
1856 if (new_recursive.offset_save != stacksave)
1857 (PUBL(free))(new_recursive.offset_save);
1858 RRETURN(MATCH_NOMATCH);
1859 }
1860
1861 RECURSION_MATCHED:
1862 break;
1863
/* An alternation is the end of a branch; scan along to find the end of the
bracketed group and go to there. Each OP_ALT holds a link at offset 1 that is
added to ecode; the loop keeps following these links until the opcode reached
is no longer OP_ALT, and the main loop then carries on from that opcode. */

case OP_ALT:
do ecode += GET(ecode,1); while (*ecode == OP_ALT);
break;
1870
/* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
indicating that it may occur zero times. It may repeat infinitely, or not
at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
with fixed upper repeat limits are compiled as a number of copies, with the
optional ones preceded by BRAZERO or BRAMINZERO. */

/* BRAZERO tries the group first: the bracket that follows this opcode is
matched at a deeper level. Any result other than MATCH_NOMATCH is passed
straight back. On MATCH_NOMATCH the group is treated as matching zero times:
the chain of alternatives is followed to the end of the group and matching
continues just past the opcode found there. */

case OP_BRAZERO:
next = ecode + 1;                          /* The bracket after BRAZERO */
RMATCH(eptr, next, offset_top, md, eptrb, RM10);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
do next += GET(next, 1); while (*next == OP_ALT);
ecode = next + 1 + LINK_SIZE;              /* Resume after the group */
break;
1884
/* BRAMINZERO tries zero repetitions first: the end of the following group is
located by following its chain of alternatives, and the code just past that
point is matched at a deeper level. Any result other than MATCH_NOMATCH is
passed straight back. On MATCH_NOMATCH, ecode is advanced by one so that the
main loop processes the bracket itself. */

case OP_BRAMINZERO:
next = ecode + 1;
do next += GET(next, 1); while (*next == OP_ALT);
RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode++;                                   /* Now at the bracket opcode */
break;
1892
/* SKIPZERO never enters the following group. The chain of alternatives is
followed to the end of the group and matching continues just past the opcode
found there. No recursion and no subject movement is involved. */

case OP_SKIPZERO:
next = ecode+1;
do next += GET(next,1); while (*next == OP_ALT);
ecode = next + 1 + LINK_SIZE;
break;
1898
/* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
here; just jump to the group, with allow_zero set TRUE. The opcode that
follows BRAPOSZERO chooses the target: the capturing forms (OP_CBRAPOS and
OP_SCBRAPOS) go to POSSESSIVE_CAPTURE, everything else goes to
POSSESSIVE_NON_CAPTURE. Both labels are defined outside this arm, and ecode
is left pointing at the bracket opcode when the jump is taken. */

case OP_BRAPOSZERO:
op = *(++ecode);                /* Step onto, and fetch, the bracket opcode */
allow_zero = TRUE;
if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
goto POSSESSIVE_NON_CAPTURE;
1907
/* End of a group, repeated or non-repeating. The four ket opcodes share this
code. The steps below are order-dependent and several of them return early:
  1. find the opening opcode and recover the saved subject start, if any;
  2. assertions and OP_ONCE_NC return MATCH_MATCH immediately;
  3. capturing groups either return (recursive call) or record the capture;
  4. OP_KETRPOS returns MATCH_KETRPOS;
  5. OP_KET, or a repeat that matched nothing, continues after the group;
  6. OP_KETRMIN and OP_KETRMAX try "rest of pattern" and "group again" in
     opposite orders. */

case OP_KET:
case OP_KETRMIN:
case OP_KETRMAX:
case OP_KETRPOS:
prev = ecode - GET(ecode, 1);   /* Back to the opcode that opened the group */

/* If this was a group that remembered the subject start, in order to break
infinite repeats of empty string matches, retrieve the subject start from
the chain. Otherwise, set it NULL. */

if (*prev >= OP_SBRA || *prev == OP_ONCE)
  {
  saved_eptr = eptrb->epb_saved_eptr;   /* Value at start of group */
  eptrb = eptrb->epb_prev;              /* Backup to previous group */
  }
else saved_eptr = NULL;

/* If we are at the end of an assertion group or a non-capturing atomic
group, stop matching and return MATCH_MATCH, but record the current high
water mark for use by positive assertions. We also need to record the match
start in case it was changed by \K. */

if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
     *prev == OP_ONCE_NC)
  {
  md->end_match_ptr = eptr;      /* For ONCE_NC */
  md->end_offset_top = offset_top;
  md->start_match_ptr = mstart;
  RRETURN(MATCH_MATCH);         /* Sets md->mark */
  }

/* For capturing groups we have to check the group number back at the start
and if necessary complete handling an extraction by setting the offsets and
bumping the high water mark. Whole-pattern recursion is coded as a recurse
into group 0, so it won't be picked up here. Instead, we catch it when the
OP_END is reached. Other recursion is handled here. We just have to record
the current subject position and start match pointer and give a MATCH
return. */

if (*prev == OP_CBRA || *prev == OP_SCBRA ||
    *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
  {
  number = GET2(prev, 1+LINK_SIZE);   /* Group number, stored after the link */
  offset = number << 1;               /* Index of the group's offset pair */

#ifdef PCRE_DEBUG
  printf("end bracket %d", number);
  printf("\n");
#endif

  /* Handle a recursively called group. Only the innermost active recursion
  (md->recursive) is compared; when its group number is this group, the
  recursion has finished and no capture is recorded here. */

  if (md->recursive != NULL && md->recursive->group_num == number)
    {
    md->end_match_ptr = eptr;
    md->start_match_ptr = mstart;
    RRETURN(MATCH_MATCH);
    }

  /* Deal with capturing. The bits of capture_last selected by OVFLMASK are
  kept and the rest is replaced by this group's number. If the group's pair
  does not fit below offset_max, OVFLBIT is set and nothing is stored. */

  md->capture_last = (md->capture_last & OVFLMASK) | number;
  if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
    {
    /* If offset is greater than offset_top, it means that we are
    "skipping" a capturing group, and that group's offsets must be marked
    unset. In earlier versions of PCRE, all the offsets were unset at the
    start of matching, but this doesn't work because atomic groups and
    assertions can cause a value to be set that should later be unset.
    Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
    part of the atomic group, but this is not on the final matching path,
    so must be unset when 2 is set. (If there is no group 2, there is no
    problem, because offset_top will then be 2, indicating no capture.) */

    if (offset > offset_top)
      {
      register int *iptr = md->offset_vector + offset_top;
      register int *iend = md->offset_vector + offset;
      while (iptr < iend) *iptr++ = -1;
      }

    /* Now make the extraction. The start of the capture is copied from the
    slot at (offset_end - number); presumably it was stored there when the
    group was entered - that code is outside this arm, so confirm there. The
    end of the capture is the current subject position. */

    md->offset_vector[offset] =
      md->offset_vector[md->offset_end - number];
    md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
    if (offset_top <= offset) offset_top = offset + 2;
    }
  }

/* OP_KETRPOS is a possessive repeating ket. Remember the current position,
and return the MATCH_KETRPOS. This makes it possible to do the repeats one
at a time from the outer level, thus saving stack. This must precede the
empty string test - in this case that test is done at the outer level. */

if (*ecode == OP_KETRPOS)
  {
  md->start_match_ptr = mstart;    /* In case \K reset it */
  md->end_match_ptr = eptr;
  md->end_offset_top = offset_top;
  RRETURN(MATCH_KETRPOS);
  }

/* For an ordinary non-repeating ket, just continue at this level. This
also happens for a repeating ket if no characters were matched in the
group. This is the forcible breaking of infinite loops as implemented in
Perl 5.005. For a non-repeating atomic group that includes captures,
establish a backup point by processing the rest of the pattern at a lower
level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
original OP_ONCE level, thereby bypassing intermediate backup points, but
resetting any captures that happened along the way. */

if (*ecode == OP_KET || eptr == saved_eptr)
  {
  if (*prev == OP_ONCE)
    {
    RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
    if (rrc != MATCH_NOMATCH) RRETURN(rrc);
    md->once_target = prev;  /* Level at which to change to MATCH_NOMATCH */
    RRETURN(MATCH_ONCE);
    }
  ecode += 1 + LINK_SIZE;    /* Carry on at this level */
  break;
  }

/* The normal repeating kets try the rest of the pattern or restart from
the preceding bracket, in the appropriate order. In the second case, we can
use tail recursion to avoid using another stack frame, unless we have an
atomic group or an unlimited repeat of a group that can match an empty
string. */

if (*ecode == OP_KETRMIN)
  {
  /* Minimizing: the rest of the pattern is tried before another repeat. */

  RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
  if (*prev == OP_ONCE)
    {
    RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
    if (rrc != MATCH_NOMATCH) RRETURN(rrc);
    md->once_target = prev;  /* Level at which to change to MATCH_NOMATCH */
    RRETURN(MATCH_ONCE);
    }
  if (*prev >= OP_SBRA)    /* Could match an empty string */
    {
    RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
    RRETURN(rrc);
    }
  ecode = prev;            /* Repeat the group without a new frame */
  goto TAIL_RECURSE;
  }
else  /* OP_KETRMAX */
  {
  /* Maximizing: another repeat is tried before the rest of the pattern. A
  MATCH_ONCE result whose once_target is this group's opening opcode is
  converted to MATCH_NOMATCH here. */

  RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
  if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
  if (*prev == OP_ONCE)
    {
    RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
    if (rrc != MATCH_NOMATCH) RRETURN(rrc);
    md->once_target = prev;
    RRETURN(MATCH_ONCE);
    }
  ecode += 1 + LINK_SIZE;  /* Continue after the group without a new frame */
  goto TAIL_RECURSE;
  }
/* Control never gets here */
2076
/* Not multiline mode: start of subject assertion, unless notbol. When
md->notbol is set and eptr is at the start of the subject the assertion
fails; in every other case control deliberately falls through into the
OP_SOD test below, which has no break before it. */

case OP_CIRC:
if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);

/* Fall through */

/* Start of subject assertion: succeeds only when eptr is exactly at
md->start_subject. No subject characters are consumed. */

case OP_SOD:
if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
ecode++;
break;
2088
/* Multiline mode: start of subject unless notbol, or after any newline.
Away from the start of the subject the assertion fails if eptr is at the
very end of the subject, or if WAS_NEWLINE(eptr) is false. No subject
characters are consumed. */

case OP_CIRCM:
if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
if (eptr != md->start_subject &&
    (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
  RRETURN(MATCH_NOMATCH);
ecode++;
break;
2098
/* Start of match assertion: succeeds only when eptr is at the position
md->start_subject + md->start_offset. No subject characters are consumed. */

case OP_SOM:
if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
ecode++;
break;
2105
/* Reset the start of match point: mstart is moved to the current subject
position. This always succeeds and consumes nothing. */

case OP_SET_SOM:
mstart = eptr;
ecode++;
break;
2112
/* Multiline mode: assert before any newline, or before end of subject
unless noteol is set. No subject characters are consumed. */

case OP_DOLLM:
if (eptr < md->end_subject)
  {
  if (!IS_NEWLINE(eptr))
    {
    /* Not at a newline, so the assertion fails. Before failing, look for a
    newline that may have been cut off by the end of the subject: partial
    matching is enabled, eptr is on the last unit of the subject, the
    newline is a fixed sequence of length 2, and the current unit equals
    the first unit of that sequence. If so, note that the end was hit, and
    return PCRE_ERROR_PARTIAL at once when md->partial is greater than 1. */

    if (md->partial != 0 &&
        eptr + 1 >= md->end_subject &&
        NLBLOCK->nltype == NLTYPE_FIXED &&
        NLBLOCK->nllen == 2 &&
        UCHAR21TEST(eptr) == NLBLOCK->nl[0])
      {
      md->hitend = TRUE;
      if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
      }
    RRETURN(MATCH_NOMATCH);
    }
  }
else
  {
  /* At the end of the subject */

  if (md->noteol) RRETURN(MATCH_NOMATCH);
  SCHECK_PARTIAL();
  }
ecode++;
break;
2140
/* Not multiline mode: assert before a terminating newline or before end of
subject unless noteol is set. With md->endonly unset this jumps to the code
shared with OP_EODN; with it set, control falls through to OP_EOD so that
only the very end of the subject is accepted. */

case OP_DOLL:
if (md->noteol) RRETURN(MATCH_NOMATCH);
if (!md->endonly) goto ASSERT_NL_OR_EOS;

/* ... else fall through for endonly */

/* End of subject assertion (\z) */

case OP_EOD:
if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
SCHECK_PARTIAL();
ecode++;
break;

/* End of subject or ending \n assertion (\Z). Also the target of the jump
from OP_DOLL. Before the end of the subject, the assertion holds only when
eptr is at a newline and that newline is the last thing in the subject
(eptr == end_subject - md->nllen). */

case OP_EODN:
ASSERT_NL_OR_EOS:
if (eptr < md->end_subject &&
    (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
  {
  /* The assertion fails. Before failing, look for a newline that may have
  been cut off by the end of the subject: partial matching is enabled, eptr
  is on the last unit of the subject, the newline is a fixed sequence of
  length 2, and the current unit equals the first unit of that sequence. */

  if (md->partial != 0 &&
      eptr + 1 >= md->end_subject &&
      NLBLOCK->nltype == NLTYPE_FIXED &&
      NLBLOCK->nllen == 2 &&
      UCHAR21TEST(eptr) == NLBLOCK->nl[0])
    {
    md->hitend = TRUE;
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
    }
  RRETURN(MATCH_NOMATCH);
  }

/* Either at end of string or \n before end. */

SCHECK_PARTIAL();
ecode++;
break;
2182
/* Word boundary assertions. Both opcodes compute two flags, prev_is_word
for the character before eptr and cur_is_word for the character at eptr,
and then compare them. No subject characters are consumed. */

case OP_NOT_WORD_BOUNDARY:
case OP_WORD_BOUNDARY:
  {

  /* Find out if the previous and current characters are "word" characters.
  It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
  be "non-word" characters, unless md->use_ucp is set, in which case an
  underscore and any character whose UCD_CATEGORY is ucp_L or ucp_N count
  as word characters. Remember the earliest consulted character for
  partial matching. */

#ifdef SUPPORT_UTF
  if (utf)
    {
    /* Get status of previous character. At the start of the subject there
    is none, so it counts as a non-word character. Otherwise lastptr is
    moved back to the start of the preceding character and, if that is
    earlier than md->start_used_ptr, start_used_ptr is lowered to it. */

    if (eptr == md->start_subject) prev_is_word = FALSE; else
      {
      PCRE_PUCHAR lastptr = eptr - 1;
      BACKCHAR(lastptr);
      if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
      GETCHAR(c, lastptr);
#ifdef SUPPORT_UCP
      if (md->use_ucp)
        {
        if (c == '_') prev_is_word = TRUE; else
          {
          int cat = UCD_CATEGORY(c);
          prev_is_word = (cat == ucp_L || cat == ucp_N);
          }
        }
      else
#endif
      prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
      }

    /* Get status of next character. At the end of the subject there is
    none; SCHECK_PARTIAL() is invoked and the character counts as a
    non-word character. */

    if (eptr >= md->end_subject)
      {
      SCHECK_PARTIAL();
      cur_is_word = FALSE;
      }
    else
      {
      GETCHAR(c, eptr);
#ifdef SUPPORT_UCP
      if (md->use_ucp)
        {
        if (c == '_') cur_is_word = TRUE; else
          {
          int cat = UCD_CATEGORY(c);
          cur_is_word = (cat == ucp_L || cat == ucp_N);
          }
        }
      else
#endif
      cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
      }
    }
  else
#endif

  /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
  consistency with the behaviour of \w we do use it in this case. */

    {
    /* Get status of previous character. Here the previous character is the
    single unit at eptr[-1]; start_used_ptr is lowered to eptr - 1 when eptr
    is at or before it. */

    if (eptr == md->start_subject) prev_is_word = FALSE; else
      {
      if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
#ifdef SUPPORT_UCP
      if (md->use_ucp)
        {
        c = eptr[-1];
        if (c == '_') prev_is_word = TRUE; else
          {
          int cat = UCD_CATEGORY(c);
          prev_is_word = (cat == ucp_L || cat == ucp_N);
          }
        }
      else
#endif
      prev_is_word = MAX_255(eptr[-1])
        && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
      }

    /* Get status of next character. Note that the "else" below binds either
    to the UCP test or, when SUPPORT_UCP is not defined, directly to the
    ctypes lookup. */

    if (eptr >= md->end_subject)
      {
      SCHECK_PARTIAL();
      cur_is_word = FALSE;
      }
    else
#ifdef SUPPORT_UCP
    if (md->use_ucp)
      {
      c = *eptr;
      if (c == '_') cur_is_word = TRUE; else
        {
        int cat = UCD_CATEGORY(c);
        cur_is_word = (cat == ucp_L || cat == ucp_N);
        }
      }
    else
#endif
    cur_is_word = MAX_255(*eptr)
      && ((md->ctypes[*eptr] & ctype_word) != 0);
    }

  /* Now see if the situation is what we want. OP_WORD_BOUNDARY fails when
  the two flags are equal; OP_NOT_WORD_BOUNDARY fails when they differ. The
  opcode is stepped over as part of the test. */

  if ((*ecode++ == OP_WORD_BOUNDARY)?
       cur_is_word == prev_is_word : cur_is_word != prev_is_word)
    RRETURN(MATCH_NOMATCH);
  }
break;
2302
/* Match any single character type except newline; have to take care with
CRLF newlines and partial matching. After the newline test, a possible
newline cut off by the end of the subject is looked for: partial matching is
enabled, eptr is on the last unit of the subject, the newline is a fixed
sequence of length 2, and the current unit equals the first unit of that
sequence. In that case the end of the subject is noted as hit, and
PCRE_ERROR_PARTIAL is returned at once when md->partial is greater than 1.
Otherwise control deliberately falls through into OP_ALLANY. */

case OP_ANY:
if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
if (md->partial != 0 &&
    eptr == md->end_subject - 1 &&
    NLBLOCK->nltype == NLTYPE_FIXED &&
    NLBLOCK->nllen == 2 &&
    UCHAR21TEST(eptr) == NLBLOCK->nl[0])
  {
  md->hitend = TRUE;
  if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
  }

/* Fall through */

/* Match any single character whatsoever. One unit is consumed and then, in
UTF mode, ACROSSCHAR advances eptr further while more units remain in the
subject (presumably over the rest of a multi-unit character - the macro is
defined elsewhere). */

case OP_ALLANY:
if (eptr >= md->end_subject)  /* DO NOT merge the eptr++ here; it must */
  {                           /* not be updated before SCHECK_PARTIAL. */
  SCHECK_PARTIAL();
  RRETURN(MATCH_NOMATCH);
  }
eptr++;
#ifdef SUPPORT_UTF
if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
#endif
ecode++;
break;
2334
/* Match a single byte, even in UTF-8 mode. This opcode really does match
any byte, even newline, independent of the setting of PCRE_DOTALL. Exactly
one unit of the subject is consumed; at the end of the subject the match
fails after SCHECK_PARTIAL() has been invoked. */

case OP_ANYBYTE:
if (eptr >= md->end_subject)   /* DO NOT merge the eptr++ here; it must */
  {                            /* not be updated before SCHECK_PARTIAL. */
  SCHECK_PARTIAL();
  RRETURN(MATCH_NOMATCH);
  }
eptr++;
ecode++;
break;
2347
/* Match one character that is not a digit. The character is fetched with
GETCHARINCTEST, which also advances eptr. It is rejected only when its
ctypes entry has the ctype_digit bit set. In builds where a character value
can exceed 255 (UTF support, or a library other than the 8-bit one) the
table is consulted only for values below 256, so larger values are always
accepted. */

case OP_NOT_DIGIT:
if (eptr >= md->end_subject)
  {
  SCHECK_PARTIAL();
  RRETURN(MATCH_NOMATCH);
  }
GETCHARINCTEST(c, eptr);
if (
#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
   c < 256 &&
#endif
   (md->ctypes[c] & ctype_digit) != 0
   )
  RRETURN(MATCH_NOMATCH);
ecode++;
break;
2364
/* Match one digit. The character is fetched with GETCHARINCTEST, which also
advances eptr. It is accepted only when its ctypes entry has the ctype_digit
bit set. In builds where a character value can exceed 255 (UTF support, or a
library other than the 8-bit one) values above 255 are rejected before the
table is consulted. */

case OP_DIGIT:
if (eptr >= md->end_subject)
  {
  SCHECK_PARTIAL();
  RRETURN(MATCH_NOMATCH);
  }
GETCHARINCTEST(c, eptr);
if (
#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
   c > 255 ||
#endif
   (md->ctypes[c] & ctype_digit) == 0
   )
  RRETURN(MATCH_NOMATCH);
ecode++;
break;
2381
/* Match one character that is not white space. The character is fetched
with GETCHARINCTEST, which also advances eptr. It is rejected only when its
ctypes entry has the ctype_space bit set. In builds where a character value
can exceed 255 the table is consulted only for values below 256, so larger
values are always accepted. */

case OP_NOT_WHITESPACE:
if (eptr >= md->end_subject)
  {
  SCHECK_PARTIAL();
  RRETURN(MATCH_NOMATCH);
  }
GETCHARINCTEST(c, eptr);
if (
#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
   c < 256 &&
#endif
   (md->ctypes[c] & ctype_space) != 0
   )
  RRETURN(MATCH_NOMATCH);
ecode++;
break;
2398
/* Match one white space character. The character is fetched with
GETCHARINCTEST, which also advances eptr. It is accepted only when its
ctypes entry has the ctype_space bit set. In builds where a character value
can exceed 255, values above 255 are rejected before the table is
consulted. */

case OP_WHITESPACE:
if (eptr >= md->end_subject)
  {
  SCHECK_PARTIAL();
  RRETURN(MATCH_NOMATCH);
  }
GETCHARINCTEST(c, eptr);
if (
#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
   c > 255 ||
#endif
   (md->ctypes[c] & ctype_space) == 0
   )
  RRETURN(MATCH_NOMATCH);
ecode++;
break;
2415
/* Match one character that is not a "word" character. The character is
fetched with GETCHARINCTEST, which also advances eptr. It is rejected only
when its ctypes entry has the ctype_word bit set. In builds where a
character value can exceed 255 the table is consulted only for values below
256, so larger values are always accepted. */

case OP_NOT_WORDCHAR:
if (eptr >= md->end_subject)
  {
  SCHECK_PARTIAL();
  RRETURN(MATCH_NOMATCH);
  }
GETCHARINCTEST(c, eptr);
if (
#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
   c < 256 &&
#endif
   (md->ctypes[c] & ctype_word) != 0
   )
  RRETURN(MATCH_NOMATCH);
ecode++;
break;
2432
/* Match one "word" character. The character is fetched with GETCHARINCTEST,
which also advances eptr. It is accepted only when its ctypes entry has the
ctype_word bit set. In builds where a character value can exceed 255, values
above 255 are rejected before the table is consulted. */

case OP_WORDCHAR:
if (eptr >= md->end_subject)
  {
  SCHECK_PARTIAL();
  RRETURN(MATCH_NOMATCH);
  }
GETCHARINCTEST(c, eptr);
if (
#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
   c > 255 ||
#endif
   (md->ctypes[c] & ctype_word) == 0
   )
  RRETURN(MATCH_NOMATCH);
ecode++;
break;
2449
/* Match one newline sequence. The accepted sequences are: CR optionally
followed by LF (both consumed), LF alone, and - unless md->bsr_anycrlf is
set - VT, FF, NEL and, when not compiling for EBCDIC, 0x2028 and 0x2029. Any
other character fails. */

case OP_ANYNL:
if (eptr >= md->end_subject)
  {
  SCHECK_PARTIAL();
  RRETURN(MATCH_NOMATCH);
  }
GETCHARINCTEST(c, eptr);
switch(c)
  {
  default: RRETURN(MATCH_NOMATCH);

  /* CR matches by itself. If it is the last thing in the subject,
  SCHECK_PARTIAL() is invoked because an LF might have followed; otherwise
  a following LF is consumed as part of the same newline. */

  case CHAR_CR:
  if (eptr >= md->end_subject)
    {
    SCHECK_PARTIAL();
    }
  else if (UCHAR21TEST(eptr) == CHAR_LF) eptr++;
  break;

  case CHAR_LF:
  break;

  /* The remaining newline characters are excluded when only CR, LF and
  CRLF are to be recognized (md->bsr_anycrlf set). */

  case CHAR_VT:
  case CHAR_FF:
  case CHAR_NEL:
#ifndef EBCDIC
  case 0x2028:
  case 0x2029:
#endif  /* Not EBCDIC */
  if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
  break;
  }
ecode++;
break;
2484
/* Match one character that is not horizontal white space. The character is
fetched with GETCHARINCTEST, which also advances eptr. HSPACE_CASES supplies
the case labels for the horizontal space characters; any of them fails, and
every other character is accepted. */

case OP_NOT_HSPACE:
if (eptr >= md->end_subject)
  {
  SCHECK_PARTIAL();
  RRETURN(MATCH_NOMATCH);
  }
GETCHARINCTEST(c, eptr);
switch(c)
  {
  HSPACE_CASES: RRETURN(MATCH_NOMATCH);  /* Byte and multibyte cases */
  default: break;
  }
ecode++;
break;
2499
/* Match one horizontal white space character. The character is fetched with
GETCHARINCTEST, which also advances eptr. HSPACE_CASES supplies the case
labels for the horizontal space characters; any of them is accepted, and
every other character fails. */

case OP_HSPACE:
if (eptr >= md->end_subject)
  {
  SCHECK_PARTIAL();
  RRETURN(MATCH_NOMATCH);
  }
GETCHARINCTEST(c, eptr);
switch(c)
  {
  HSPACE_CASES: break;  /* Byte and multibyte cases */
  default: RRETURN(MATCH_NOMATCH);
  }
ecode++;
break;
2514
/* Match one character that is not vertical white space. The character is
fetched with GETCHARINCTEST, which also advances eptr. VSPACE_CASES supplies
the case labels for the vertical space characters; any of them fails, and
every other character is accepted. */

case OP_NOT_VSPACE:
if (eptr >= md->end_subject)
  {
  SCHECK_PARTIAL();
  RRETURN(MATCH_NOMATCH);
  }
GETCHARINCTEST(c, eptr);
switch(c)
  {
  VSPACE_CASES: RRETURN(MATCH_NOMATCH);
  default: break;
  }
ecode++;
break;
2529
/* Match one vertical white space character. The character is fetched with
GETCHARINCTEST, which also advances eptr. VSPACE_CASES supplies the case
labels for the vertical space characters; any of them is accepted, and every
other character fails. */

case OP_VSPACE:
if (eptr >= md->end_subject)
  {
  SCHECK_PARTIAL();
  RRETURN(MATCH_NOMATCH);
  }
GETCHARINCTEST(c, eptr);
switch(c)
  {
  VSPACE_CASES: break;
  default: RRETURN(MATCH_NOMATCH);
  }
ecode++;
break;
2544
2545#ifdef SUPPORT_UCP
/* Check the next character by Unicode property. We will get here only
if the support is in the binary; otherwise a compile-time error occurs.

The item is three units long: the opcode, the property type in ecode[1] and
the property value in ecode[2]. Every test below has the same shape: a
boolean "the character has the property" (or its negation) is compared with
a boolean derived from the opcode, so that OP_PROP fails when the property
is absent and OP_NOTPROP fails when it is present. The tests rely on the
variable op holding the current opcode; it is set outside this arm. */

case OP_PROP:
case OP_NOTPROP:
if (eptr >= md->end_subject)
  {
  SCHECK_PARTIAL();
  RRETURN(MATCH_NOMATCH);
  }
GETCHARINCTEST(c, eptr);
  {
  const pcre_uint32 *cp;
  const ucd_record *prop = GET_UCD(c);   /* Unicode data record for c */

  switch(ecode[1])
    {
    /* Any character at all has this property. */

    case PT_ANY:
    if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
    break;

    /* Character type is one of Lu, Ll or Lt. */

    case PT_LAMP:
    if ((prop->chartype == ucp_Lu ||
         prop->chartype == ucp_Ll ||
         prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
      RRETURN(MATCH_NOMATCH);
    break;

    /* General category: ecode[2] is compared with the general type that
    the ucp_gentype table gives for the character's type. */

    case PT_GC:
    if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
      RRETURN(MATCH_NOMATCH);
    break;

    /* Particular character type: ecode[2] is compared with chartype. */

    case PT_PC:
    if ((ecode[2] != prop->chartype) == (op == OP_PROP))
      RRETURN(MATCH_NOMATCH);
    break;

    /* Script: ecode[2] is compared with the character's script. */

    case PT_SC:
    if ((ecode[2] != prop->script) == (op == OP_PROP))
      RRETURN(MATCH_NOMATCH);
    break;

    /* These are specials */

    /* Alphanumeric: general type is L or N. */

    case PT_ALNUM:
    if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
         PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
      RRETURN(MATCH_NOMATCH);
    break;

    /* Perl space used to exclude VT, but from Perl 5.18 it is included,
    which means that Perl space and POSIX space are now identical. PCRE
    was changed at release 8.34. A character is a space if it is one of
    the HSPACE_CASES or VSPACE_CASES characters, or if its general type is
    Z. */

    case PT_SPACE:    /* Perl space */
    case PT_PXSPACE:  /* POSIX space */
    switch(c)
      {
      HSPACE_CASES:
      VSPACE_CASES:
      if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
      break;

      default:
      if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
        (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
      break;
      }
    break;

    /* Word character: general type is L or N, or the character is an
    underscore. */

    case PT_WORD:
    if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
         PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
         c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
      RRETURN(MATCH_NOMATCH);
    break;

    /* Character list: ecode[2] is an index into ucd_caseless_sets. The
    list is scanned from that point. Reaching an entry greater than c means
    c is not in the list; reaching an entry equal to c means it is. The loop
    has no other exit, so it presumably relies on each list being in
    ascending order and ending with a value no character can exceed -
    confirm against the table's definition. */

    case PT_CLIST:
    cp = PRIV(ucd_caseless_sets) + ecode[2];
    for (;;)
      {
      if (c < *cp)
        { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
      if (c == *cp++)
        { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
      }
    break;

    /* The character is '$', '@', '`', in the range 0xa0 to 0xd7ff, or at
    or above 0xe000. */

    case PT_UCNC:
    if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
         c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
         c >= 0xe000) == (op == OP_NOTPROP))
      RRETURN(MATCH_NOMATCH);
    break;

    /* This should never occur */

    default:
    RRETURN(PCRE_ERROR_INTERNAL);
    }

  ecode += 3;   /* Opcode, property type, property value */
  }
break;
2651
/* Match an extended Unicode sequence. We will get here only if the support
is in the binary; otherwise a compile-time error occurs.

The first character is always consumed. After that, each following
character is examined without being consumed: its grapheme break property
(rgb) is looked up, and the sequence continues only if the bit for rgb is
set in the ucp_gbtable entry for the previous character's property (lgb).
When the bit is clear the sequence ends before that character. */

case OP_EXTUNI:
if (eptr >= md->end_subject)
  {
  SCHECK_PARTIAL();
  RRETURN(MATCH_NOMATCH);
  }
else
  {
  int lgb, rgb;
  GETCHARINCTEST(c, eptr);
  lgb = UCD_GRAPHBREAK(c);
  while (eptr < md->end_subject)
    {
    int len = 1;   /* Units in the next character; updated in UTF mode */
    if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
    rgb = UCD_GRAPHBREAK(c);
    if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
    lgb = rgb;
    eptr += len;
    }
  }
CHECK_PARTIAL();
ecode++;
break;
2679#endif /* SUPPORT_UCP */
2680
2681
/* Match a back reference, possibly repeatedly. Look past the end of the
item to see if there is repeat information following. The code is similar
to that for character classes, but repeated for efficiency. Then obey
similar code to character type repeats - written out again for speed.
However, if the referenced string is the empty string, always treat
it as matched, any number of times (otherwise there could be infinite
loops). If the reference is unset, there are two possibilities:

(a) In the default, Perl-compatible state, set the length negative;
this ensures that every attempt at a match fails. We can't just fail
here, because of the possibility of quantifiers with zero minima.

(b) If the JavaScript compatibility flag is set, set the length to zero
so that the back reference matches an empty string.

Otherwise, set the length to the length of what was matched by the
referenced subpattern.

The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
or to a non-duplicated named group. For a duplicated named group, OP_DNREF
and OP_DNREFI are used. In this case we must scan the list of groups to
which the name refers, and use the first one that is set. */

case OP_DNREF:
case OP_DNREFI:
caseless = op == OP_DNREFI;
  {
  /* The item holds two values: an index into the name table at offset 1
  and, after it, the number of consecutive entries that share the name. */

  int count = GET2(ecode, 1+IMM2_SIZE);
  pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
  ecode += 1 + 2*IMM2_SIZE;

  /* Setting the default length first and initializing 'offset' avoids
  compiler warnings in the REF_REPEAT code. */

  length = (md->jscript_compat)? 0 : -1;
  offset = 0;

  /* Each name table entry starts with a group number. Stop at the first
  group that lies below offset_top and has a non-negative start offset. If
  none qualifies, length keeps its "unset" default and offset is left at
  the last group examined. */

  while (count-- > 0)
    {
    offset = GET2(slot, 0) << 1;
    if (offset < offset_top && md->offset_vector[offset] >= 0)
      {
      length = md->offset_vector[offset+1] - md->offset_vector[offset];
      break;
      }
    slot += md->name_entry_size;
    }
  }
goto REF_REPEAT;

case OP_REF:
case OP_REFI:
caseless = op == OP_REFI;
offset = GET2(ecode, 1) << 1;               /* Doubled ref number */
ecode += 1 + IMM2_SIZE;
if (offset >= offset_top || md->offset_vector[offset] < 0)
  length = (md->jscript_compat)? 0 : -1;
else
  length = md->offset_vector[offset+1] - md->offset_vector[offset];

/* Set up for repetition, or handle the non-repeated case */

REF_REPEAT:
switch (*ecode)
  {
  case OP_CRSTAR:
  case OP_CRMINSTAR:
  case OP_CRPLUS:
  case OP_CRMINPLUS:
  case OP_CRQUERY:
  case OP_CRMINQUERY:
  c = *ecode++ - OP_CRSTAR;
  minimize = (c & 1) != 0;
  min = rep_min[c];                 /* Pick up values from tables; */
  max = rep_max[c];                 /* zero for max => infinity */
  if (max == 0) max = INT_MAX;
  break;

  case OP_CRRANGE:
  case OP_CRMINRANGE:
  minimize = (*ecode == OP_CRMINRANGE);
  min = GET2(ecode, 1);
  max = GET2(ecode, 1 + IMM2_SIZE);
  if (max == 0) max = INT_MAX;
  ecode += 1 + 2 * IMM2_SIZE;
  break;

  /* A single match of the reference. match_ref() returns the number of
  subject units matched, or a negative value on failure; -2 is treated as
  having run into the end of the subject. */

  default:               /* No repeat follows */
  if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
    {
    if (length == -2) eptr = md->end_subject;   /* Partial match */
    CHECK_PARTIAL();
    RRETURN(MATCH_NOMATCH);
    }
  eptr += length;
  continue;              /* With the main loop */
  }

/* Handle repeated back references. If the length of the reference is
zero, just continue with the main loop. If the length is negative, it
means the reference is unset in non-JavaScript-compatible mode. If the
minimum is zero, we can continue at the same level without recursion. For
any other minimum, carrying on will result in NOMATCH. */

if (length == 0) continue;
if (length < 0 && min == 0) continue;

/* First, ensure the minimum number of matches are present. We get back
the length of the reference string explicitly rather than passing the
address of eptr, so that eptr can be a register variable. */

for (i = 1; i <= min; i++)
  {
  int slength;
  if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
    {
    if (slength == -2) eptr = md->end_subject;   /* Partial match */
    CHECK_PARTIAL();
    RRETURN(MATCH_NOMATCH);
    }
  eptr += slength;
  }

/* If min = max, continue at the same level without recursion.
They are not both allowed to be zero. */

if (min == max) continue;

/* If minimizing, keep trying and advancing the pointer. The rest of the
pattern is tried first; only if it fails, and fewer than max repeats have
been used, is one more copy of the reference matched. */

if (minimize)
  {
  for (fi = min;; fi++)
    {
    int slength;
    RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
    if (rrc != MATCH_NOMATCH) RRETURN(rrc);
    if (fi >= max) RRETURN(MATCH_NOMATCH);
    if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
      {
      if (slength == -2) eptr = md->end_subject;   /* Partial match */
      CHECK_PARTIAL();
      RRETURN(MATCH_NOMATCH);
      }
    eptr += slength;
    }
  /* Control never gets here */
  }

/* If maximizing, find the longest string and work backwards */

else
  {
  pp = eptr;   /* Position after the minimum repeats; the back-off limit */
  for (i = min; i < max; i++)
    {
    int slength;
    if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
      {
      /* Can't use CHECK_PARTIAL because we don't want to update eptr in
      the soft partial matching case. */

      if (slength == -2 && md->partial != 0 &&
          md->end_subject > md->start_used_ptr)
        {
        md->hitend = TRUE;
        if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
        }
      break;
      }
    eptr += slength;
    }

  /* Try the rest of the pattern after each possible number of repeats,
  longest first. NOTE(review): the forward scan above advances by the
  slength that match_ref() returns, but this loop steps back by length, the
  length of the captured text. That is only exact when every repeat matched
  exactly length units - confirm whether match_ref() can return a different
  value (for example in caseless matching). */

  while (eptr >= pp)
    {
    RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
    if (rrc != MATCH_NOMATCH) RRETURN(rrc);
    eptr -= length;
    }
  RRETURN(MATCH_NOMATCH);
  }
/* Control never gets here */
2864
2865 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2866 used when all the characters in the class have values in the range 0-255,
2867 and either the matching is caseful, or the characters are in the range
2868 0-127 when UTF-8 processing is enabled. The only difference between
2869 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2870 encountered.
2871
2872 First, look past the end of the item to see if there is repeat information
2873 following. Then obey similar code to character type repeats - written out
2874 again for speed. */
2875
2876 case OP_NCLASS:
2877 case OP_CLASS:
2878 {
2879 /* The data variable is saved across frames, so the byte map needs to
2880 be stored there. */
2881#define BYTE_MAP ((pcre_uint8 *)data)
2882 data = ecode + 1; /* Save for matching */
2883 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2884
2885 switch (*ecode)
2886 {
2887 case OP_CRSTAR:
2888 case OP_CRMINSTAR:
2889 case OP_CRPLUS:
2890 case OP_CRMINPLUS:
2891 case OP_CRQUERY:
2892 case OP_CRMINQUERY:
2893 case OP_CRPOSSTAR:
2894 case OP_CRPOSPLUS:
2895 case OP_CRPOSQUERY:
2896 c = *ecode++ - OP_CRSTAR;
2897 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
2898 else possessive = TRUE;
2899 min = rep_min[c]; /* Pick up values from tables; */
2900 max = rep_max[c]; /* zero for max => infinity */
2901 if (max == 0) max = INT_MAX;
2902 break;
2903
2904 case OP_CRRANGE:
2905 case OP_CRMINRANGE:
2906 case OP_CRPOSRANGE:
2907 minimize = (*ecode == OP_CRMINRANGE);
2908 possessive = (*ecode == OP_CRPOSRANGE);
2909 min = GET2(ecode, 1);
2910 max = GET2(ecode, 1 + IMM2_SIZE);
2911 if (max == 0) max = INT_MAX;
2912 ecode += 1 + 2 * IMM2_SIZE;
2913 break;
2914
2915 default: /* No repeat follows */
2916 min = max = 1;
2917 break;
2918 }
2919
2920 /* First, ensure the minimum number of matches are present. */
2921
2922#ifdef SUPPORT_UTF
2923 if (utf)
2924 {
2925 for (i = 1; i <= min; i++)
2926 {
2927 if (eptr >= md->end_subject)
2928 {
2929 SCHECK_PARTIAL();
2930 RRETURN(MATCH_NOMATCH);
2931 }
2932 GETCHARINC(c, eptr);
2933 if (c > 255)
2934 {
2935 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2936 }
2937 else
2938 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2939 }
2940 }
2941 else
2942#endif
2943 /* Not UTF mode */
2944 {
2945 for (i = 1; i <= min; i++)
2946 {
2947 if (eptr >= md->end_subject)
2948 {
2949 SCHECK_PARTIAL();
2950 RRETURN(MATCH_NOMATCH);
2951 }
2952 c = *eptr++;
2953#ifndef COMPILE_PCRE8
2954 if (c > 255)
2955 {
2956 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2957 }
2958 else
2959#endif
2960 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2961 }
2962 }
2963
2964 /* If max == min we can continue with the main loop without the
2965 need to recurse. */
2966
2967 if (min == max) continue;
2968
2969 /* If minimizing, keep testing the rest of the expression and advancing
2970 the pointer while it matches the class. */
2971
2972 if (minimize)
2973 {
2974#ifdef SUPPORT_UTF
2975 if (utf)
2976 {
2977 for (fi = min;; fi++)
2978 {
2979 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2980 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2981 if (fi >= max) RRETURN(MATCH_NOMATCH);
2982 if (eptr >= md->end_subject)
2983 {
2984 SCHECK_PARTIAL();
2985 RRETURN(MATCH_NOMATCH);
2986 }
2987 GETCHARINC(c, eptr);
2988 if (c > 255)
2989 {
2990 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2991 }
2992 else
2993 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2994 }
2995 }
2996 else
2997#endif
2998 /* Not UTF mode */
2999 {
3000 for (fi = min;; fi++)
3001 {
3002 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
3003 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3004 if (fi >= max) RRETURN(MATCH_NOMATCH);
3005 if (eptr >= md->end_subject)
3006 {
3007 SCHECK_PARTIAL();
3008 RRETURN(MATCH_NOMATCH);
3009 }
3010 c = *eptr++;
3011#ifndef COMPILE_PCRE8
3012 if (c > 255)
3013 {
3014 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3015 }
3016 else
3017#endif
3018 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3019 }
3020 }
3021 /* Control never gets here */
3022 }
3023
3024 /* If maximizing, find the longest possible run, then work backwards. */
3025
3026 else
3027 {
3028 pp = eptr;
3029
3030#ifdef SUPPORT_UTF
3031 if (utf)
3032 {
3033 for (i = min; i < max; i++)
3034 {
3035 int len = 1;
3036 if (eptr >= md->end_subject)
3037 {
3038 SCHECK_PARTIAL();
3039 break;
3040 }
3041 GETCHARLEN(c, eptr, len);
3042 if (c > 255)
3043 {
3044 if (op == OP_CLASS) break;
3045 }
3046 else
3047 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3048 eptr += len;
3049 }
3050
3051 if (possessive) continue; /* No backtracking */
3052
3053 for (;;)
3054 {
3055 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3056 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3057 if (eptr-- <= pp) break; /* Stop if tried at original pos */
3058 BACKCHAR(eptr);
3059 }
3060 }
3061 else
3062#endif
3063 /* Not UTF mode */
3064 {
3065 for (i = min; i < max; i++)
3066 {
3067 if (eptr >= md->end_subject)
3068 {
3069 SCHECK_PARTIAL();
3070 break;
3071 }
3072 c = *eptr;
3073#ifndef COMPILE_PCRE8
3074 if (c > 255)
3075 {
3076 if (op == OP_CLASS) break;
3077 }
3078 else
3079#endif
3080 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3081 eptr++;
3082 }
3083
3084 if (possessive) continue; /* No backtracking */
3085
3086 while (eptr >= pp)
3087 {
3088 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3089 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3090 eptr--;
3091 }
3092 }
3093
3094 RRETURN(MATCH_NOMATCH);
3095 }
3096#undef BYTE_MAP
3097 }
3098 /* Control never gets here */
3099
3100
3101 /* Match an extended character class. In the 8-bit library, this opcode is
3102 encountered only when UTF-8 mode is supported. In the 16-bit and
3103 32-bit libraries, codepoints greater than 255 may be encountered even when
3104 UTF is not supported. */
3105
3106#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3107 case OP_XCLASS:
3108 {
3109 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3110 ecode += GET(ecode, 1); /* Advance past the item */
3111
3112 switch (*ecode)
3113 {
3114 case OP_CRSTAR:
3115 case OP_CRMINSTAR:
3116 case OP_CRPLUS:
3117 case OP_CRMINPLUS:
3118 case OP_CRQUERY:
3119 case OP_CRMINQUERY:
3120 case OP_CRPOSSTAR:
3121 case OP_CRPOSPLUS:
3122 case OP_CRPOSQUERY:
3123 c = *ecode++ - OP_CRSTAR;
3124 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
3125 else possessive = TRUE;
3126 min = rep_min[c]; /* Pick up values from tables; */
3127 max = rep_max[c]; /* zero for max => infinity */
3128 if (max == 0) max = INT_MAX;
3129 break;
3130
3131 case OP_CRRANGE:
3132 case OP_CRMINRANGE:
3133 case OP_CRPOSRANGE:
3134 minimize = (*ecode == OP_CRMINRANGE);
3135 possessive = (*ecode == OP_CRPOSRANGE);
3136 min = GET2(ecode, 1);
3137 max = GET2(ecode, 1 + IMM2_SIZE);
3138 if (max == 0) max = INT_MAX;
3139 ecode += 1 + 2 * IMM2_SIZE;
3140 break;
3141
3142 default: /* No repeat follows */
3143 min = max = 1;
3144 break;
3145 }
3146
3147 /* First, ensure the minimum number of matches are present. */
3148
3149 for (i = 1; i <= min; i++)
3150 {
3151 if (eptr >= md->end_subject)
3152 {
3153 SCHECK_PARTIAL();
3154 RRETURN(MATCH_NOMATCH);
3155 }
3156 GETCHARINCTEST(c, eptr);
3157 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3158 }
3159
3160 /* If max == min we can continue with the main loop without the
3161 need to recurse. */
3162
3163 if (min == max) continue;
3164
3165 /* If minimizing, keep testing the rest of the expression and advancing
3166 the pointer while it matches the class. */
3167
3168 if (minimize)
3169 {
3170 for (fi = min;; fi++)
3171 {
3172 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3173 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3174 if (fi >= max) RRETURN(MATCH_NOMATCH);
3175 if (eptr >= md->end_subject)
3176 {
3177 SCHECK_PARTIAL();
3178 RRETURN(MATCH_NOMATCH);
3179 }
3180 GETCHARINCTEST(c, eptr);
3181 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3182 }
3183 /* Control never gets here */
3184 }
3185
3186 /* If maximizing, find the longest possible run, then work backwards. */
3187
3188 else
3189 {
3190 pp = eptr;
3191 for (i = min; i < max; i++)
3192 {
3193 int len = 1;
3194 if (eptr >= md->end_subject)
3195 {
3196 SCHECK_PARTIAL();
3197 break;
3198 }
3199#ifdef SUPPORT_UTF
3200 GETCHARLENTEST(c, eptr, len);
3201#else
3202 c = *eptr;
3203#endif
3204 if (!PRIV(xclass)(c, data, utf)) break;
3205 eptr += len;
3206 }
3207
3208 if (possessive) continue; /* No backtracking */
3209
3210 for(;;)
3211 {
3212 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3213 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3214 if (eptr-- <= pp) break; /* Stop if tried at original pos */
3215#ifdef SUPPORT_UTF
3216 if (utf) BACKCHAR(eptr);
3217#endif
3218 }
3219 RRETURN(MATCH_NOMATCH);
3220 }
3221
3222 /* Control never gets here */
3223 }
3224#endif /* End of XCLASS */
3225
3226 /* Match a single character, casefully */
3227
3228 case OP_CHAR:
3229#ifdef SUPPORT_UTF
3230 if (utf)
3231 {
3232 length = 1;
3233 ecode++;
3234 GETCHARLEN(fc, ecode, length);
3235 if (length > md->end_subject - eptr)
3236 {
3237 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3238 RRETURN(MATCH_NOMATCH);
3239 }
3240 while (length-- > 0) if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH);
3241 }
3242 else
3243#endif
3244 /* Not UTF mode */
3245 {
3246 if (md->end_subject - eptr < 1)
3247 {
3248 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3249 RRETURN(MATCH_NOMATCH);
3250 }
3251 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3252 ecode += 2;
3253 }
3254 break;
3255
3256 /* Match a single character, caselessly. If we are at the end of the
3257 subject, give up immediately. */
3258
3259 case OP_CHARI:
3260 if (eptr >= md->end_subject)
3261 {
3262 SCHECK_PARTIAL();
3263 RRETURN(MATCH_NOMATCH);
3264 }
3265
3266#ifdef SUPPORT_UTF
3267 if (utf)
3268 {
3269 length = 1;
3270 ecode++;
3271 GETCHARLEN(fc, ecode, length);
3272
3273 /* If the pattern character's value is < 128, we have only one byte, and
3274 we know that its other case must also be one byte long, so we can use the
3275 fast lookup table. We know that there is at least one byte left in the
3276 subject. */
3277
3278 if (fc < 128)
3279 {
3280 pcre_uint32 cc = UCHAR21(eptr);
3281 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3282 ecode++;
3283 eptr++;
3284 }
3285
3286 /* Otherwise we must pick up the subject character. Note that we cannot
3287 use the value of "length" to check for sufficient bytes left, because the
3288 other case of the character may have more or fewer bytes. */
3289
3290 else
3291 {
3292 pcre_uint32 dc;
3293 GETCHARINC(dc, eptr);
3294 ecode += length;
3295
3296 /* If we have Unicode property support, we can use it to test the other
3297 case of the character, if there is one. */
3298
3299 if (fc != dc)
3300 {
3301#ifdef SUPPORT_UCP
3302 if (dc != UCD_OTHERCASE(fc))
3303#endif
3304 RRETURN(MATCH_NOMATCH);
3305 }
3306 }
3307 }
3308 else
3309#endif /* SUPPORT_UTF */
3310
3311 /* Not UTF mode */
3312 {
3313 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3314 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3315 eptr++;
3316 ecode += 2;
3317 }
3318 break;
3319
3320 /* Match a single character repeatedly. */
3321
3322 case OP_EXACT:
3323 case OP_EXACTI:
3324 min = max = GET2(ecode, 1);
3325 ecode += 1 + IMM2_SIZE;
3326 goto REPEATCHAR;
3327
3328 case OP_POSUPTO:
3329 case OP_POSUPTOI:
3330 possessive = TRUE;
3331 /* Fall through */
3332
3333 case OP_UPTO:
3334 case OP_UPTOI:
3335 case OP_MINUPTO:
3336 case OP_MINUPTOI:
3337 min = 0;
3338 max = GET2(ecode, 1);
3339 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3340 ecode += 1 + IMM2_SIZE;
3341 goto REPEATCHAR;
3342
3343 case OP_POSSTAR:
3344 case OP_POSSTARI:
3345 possessive = TRUE;
3346 min = 0;
3347 max = INT_MAX;
3348 ecode++;
3349 goto REPEATCHAR;
3350
3351 case OP_POSPLUS:
3352 case OP_POSPLUSI:
3353 possessive = TRUE;
3354 min = 1;
3355 max = INT_MAX;
3356 ecode++;
3357 goto REPEATCHAR;
3358
3359 case OP_POSQUERY:
3360 case OP_POSQUERYI:
3361 possessive = TRUE;
3362 min = 0;
3363 max = 1;
3364 ecode++;
3365 goto REPEATCHAR;
3366
3367 case OP_STAR:
3368 case OP_STARI:
3369 case OP_MINSTAR:
3370 case OP_MINSTARI:
3371 case OP_PLUS:
3372 case OP_PLUSI:
3373 case OP_MINPLUS:
3374 case OP_MINPLUSI:
3375 case OP_QUERY:
3376 case OP_QUERYI:
3377 case OP_MINQUERY:
3378 case OP_MINQUERYI:
3379 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3380 minimize = (c & 1) != 0;
3381 min = rep_min[c]; /* Pick up values from tables; */
3382 max = rep_max[c]; /* zero for max => infinity */
3383 if (max == 0) max = INT_MAX;
3384
3385 /* Common code for all repeated single-character matches. We first check
3386 for the minimum number of characters. If the minimum equals the maximum, we
3387 are done. Otherwise, if minimizing, check the rest of the pattern for a
3388 match; if there isn't one, advance up to the maximum, one character at a
3389 time.
3390
3391 If maximizing, advance up to the maximum number of matching characters,
3392 until eptr is past the end of the maximum run. If possessive, we are
3393 then done (no backing up). Otherwise, match at this position; anything
3394 other than no match is immediately returned. For nomatch, back up one
3395 character, unless we are matching \R and the last thing matched was
3396 \r\n, in which case, back up two bytes. When we reach the first optional
3397 character position, we can save stack by doing a tail recurse.
3398
3399 The various UTF/non-UTF and caseful/caseless cases are handled separately,
3400 for speed. */
3401
3402 REPEATCHAR:
3403#ifdef SUPPORT_UTF
3404 if (utf)
3405 {
3406 length = 1;
3407 charptr = ecode;
3408 GETCHARLEN(fc, ecode, length);
3409 ecode += length;
3410
3411 /* Handle multibyte character matching specially here. There is
3412 support for caseless matching if UCP support is present. */
3413
3414 if (length > 1)
3415 {
3416#ifdef SUPPORT_UCP
3417 pcre_uint32 othercase;
3418 if (op >= OP_STARI && /* Caseless */
3419 (othercase = UCD_OTHERCASE(fc)) != fc)
3420 oclength = PRIV(ord2utf)(othercase, occhars);
3421 else oclength = 0;
3422#endif /* SUPPORT_UCP */
3423
3424 for (i = 1; i <= min; i++)
3425 {
3426 if (eptr <= md->end_subject - length &&
3427 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3428#ifdef SUPPORT_UCP
3429 else if (oclength > 0 &&
3430 eptr <= md->end_subject - oclength &&
3431 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3432#endif /* SUPPORT_UCP */
3433 else
3434 {
3435 CHECK_PARTIAL();
3436 RRETURN(MATCH_NOMATCH);
3437 }
3438 }
3439
3440 if (min == max) continue;
3441
3442 if (minimize)
3443 {
3444 for (fi = min;; fi++)
3445 {
3446 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3447 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3448 if (fi >= max) RRETURN(MATCH_NOMATCH);
3449 if (eptr <= md->end_subject - length &&
3450 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3451#ifdef SUPPORT_UCP
3452 else if (oclength > 0 &&
3453 eptr <= md->end_subject - oclength &&
3454 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3455#endif /* SUPPORT_UCP */
3456 else
3457 {
3458 CHECK_PARTIAL();
3459 RRETURN(MATCH_NOMATCH);
3460 }
3461 }
3462 /* Control never gets here */
3463 }
3464
3465 else /* Maximize */
3466 {
3467 pp = eptr;
3468 for (i = min; i < max; i++)
3469 {
3470 if (eptr <= md->end_subject - length &&
3471 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3472#ifdef SUPPORT_UCP
3473 else if (oclength > 0 &&
3474 eptr <= md->end_subject - oclength &&
3475 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3476#endif /* SUPPORT_UCP */
3477 else
3478 {
3479 CHECK_PARTIAL();
3480 break;
3481 }
3482 }
3483
3484 if (possessive) continue; /* No backtracking */
3485 for(;;)
3486 {
3487 if (eptr <= pp) goto TAIL_RECURSE;
3488 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3489 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3490#ifdef SUPPORT_UCP
3491 eptr--;
3492 BACKCHAR(eptr);
3493#else /* without SUPPORT_UCP */
3494 eptr -= length;
3495#endif /* SUPPORT_UCP */
3496 }
3497 }
3498 /* Control never gets here */
3499 }
3500
3501 /* If the length of a UTF-8 character is 1, we fall through here, and
3502 obey the code as for non-UTF-8 characters below, though in this case the
3503 value of fc will always be < 128. */
3504 }
3505 else
3506#endif /* SUPPORT_UTF */
3507 /* When not in UTF-8 mode, load a single-byte character. */
3508 fc = *ecode++;
3509
3510 /* The value of fc at this point is always one character, though we may
3511 or may not be in UTF mode. The code is duplicated for the caseless and
3512 caseful cases, for speed, since matching characters is likely to be quite
3513 common. First, ensure the minimum number of matches are present. If min =
3514 max, continue at the same level without recursing. Otherwise, if
3515 minimizing, keep trying the rest of the expression and advancing one
3516 matching character if failing, up to the maximum. Alternatively, if
3517 maximizing, find the maximum number of characters and work backwards. */
3518
3519 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3520 max, (char *)eptr));
3521
3522 if (op >= OP_STARI) /* Caseless */
3523 {
3524#ifdef COMPILE_PCRE8
3525 /* fc must be < 128 if UTF is enabled. */
3526 foc = md->fcc[fc];
3527#else
3528#ifdef SUPPORT_UTF
3529#ifdef SUPPORT_UCP
3530 if (utf && fc > 127)
3531 foc = UCD_OTHERCASE(fc);
3532#else
3533 if (utf && fc > 127)
3534 foc = fc;
3535#endif /* SUPPORT_UCP */
3536 else
3537#endif /* SUPPORT_UTF */
3538 foc = TABLE_GET(fc, md->fcc, fc);
3539#endif /* COMPILE_PCRE8 */
3540
3541 for (i = 1; i <= min; i++)
3542 {
3543 pcre_uint32 cc; /* Faster than pcre_uchar */
3544 if (eptr >= md->end_subject)
3545 {
3546 SCHECK_PARTIAL();
3547 RRETURN(MATCH_NOMATCH);
3548 }
3549 cc = UCHAR21TEST(eptr);
3550 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3551 eptr++;
3552 }
3553 if (min == max) continue;
3554 if (minimize)
3555 {
3556 for (fi = min;; fi++)
3557 {
3558 pcre_uint32 cc; /* Faster than pcre_uchar */
3559 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3560 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3561 if (fi >= max) RRETURN(MATCH_NOMATCH);
3562 if (eptr >= md->end_subject)
3563 {
3564 SCHECK_PARTIAL();
3565 RRETURN(MATCH_NOMATCH);
3566 }
3567 cc = UCHAR21TEST(eptr);
3568 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3569 eptr++;
3570 }
3571 /* Control never gets here */
3572 }
3573 else /* Maximize */
3574 {
3575 pp = eptr;
3576 for (i = min; i < max; i++)
3577 {
3578 pcre_uint32 cc; /* Faster than pcre_uchar */
3579 if (eptr >= md->end_subject)
3580 {
3581 SCHECK_PARTIAL();
3582 break;
3583 }
3584 cc = UCHAR21TEST(eptr);
3585 if (fc != cc && foc != cc) break;
3586 eptr++;
3587 }
3588 if (possessive) continue; /* No backtracking */
3589 for (;;)
3590 {
3591 if (eptr == pp) goto TAIL_RECURSE;
3592 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3593 eptr--;
3594 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3595 }
3596 /* Control never gets here */
3597 }
3598 }
3599
3600 /* Caseful comparisons (includes all multi-byte characters) */
3601
3602 else
3603 {
3604 for (i = 1; i <= min; i++)
3605 {
3606 if (eptr >= md->end_subject)
3607 {
3608 SCHECK_PARTIAL();
3609 RRETURN(MATCH_NOMATCH);
3610 }
3611 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3612 }
3613
3614 if (min == max) continue;
3615
3616 if (minimize)
3617 {
3618 for (fi = min;; fi++)
3619 {
3620 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3621 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3622 if (fi >= max) RRETURN(MATCH_NOMATCH);
3623 if (eptr >= md->end_subject)
3624 {
3625 SCHECK_PARTIAL();
3626 RRETURN(MATCH_NOMATCH);
3627 }
3628 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3629 }
3630 /* Control never gets here */
3631 }
3632 else /* Maximize */
3633 {
3634 pp = eptr;
3635 for (i = min; i < max; i++)
3636 {
3637 if (eptr >= md->end_subject)
3638 {
3639 SCHECK_PARTIAL();
3640 break;
3641 }
3642 if (fc != UCHAR21TEST(eptr)) break;
3643 eptr++;
3644 }
3645 if (possessive) continue; /* No backtracking */
3646 for (;;)
3647 {
3648 if (eptr == pp) goto TAIL_RECURSE;
3649 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3650 eptr--;
3651 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3652 }
3653 /* Control never gets here */
3654 }
3655 }
3656 /* Control never gets here */
3657
3658 /* Match a negated single character. In UTF mode, both the pattern character
3659 and the subject character being checked can be multibyte. */
3660
3661 case OP_NOT:
3662 case OP_NOTI:
3663 if (eptr >= md->end_subject)
3664 {
3665 SCHECK_PARTIAL();
3666 RRETURN(MATCH_NOMATCH);
3667 }
3668#ifdef SUPPORT_UTF
3669 if (utf)
3670 {
3671 register pcre_uint32 ch, och;
3672
3673 ecode++;
3674 GETCHARINC(ch, ecode);
3675 GETCHARINC(c, eptr);
3676
3677 if (op == OP_NOT)
3678 {
3679 if (ch == c) RRETURN(MATCH_NOMATCH);
3680 }
3681 else
3682 {
3683#ifdef SUPPORT_UCP
3684 if (ch > 127)
3685 och = UCD_OTHERCASE(ch);
3686#else
3687 if (ch > 127)
3688 och = ch;
3689#endif /* SUPPORT_UCP */
3690 else
3691 och = TABLE_GET(ch, md->fcc, ch);
3692 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3693 }
3694 }
3695 else
3696#endif
3697 {
3698 register pcre_uint32 ch = ecode[1];
3699 c = *eptr++;
3700 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3701 RRETURN(MATCH_NOMATCH);
3702 ecode += 2;
3703 }
3704 break;
3705
3706 /* Match a negated single one-byte character repeatedly. This is almost a
3707 repeat of the code for a repeated single character, but I haven't found a
3708 nice way of commoning these up that doesn't require a test of the
3709 positive/negative option for each character match. Maybe that wouldn't add
3710 very much to the time taken, but character matching *is* what this is all
3711 about... */
3712
3713 case OP_NOTEXACT:
3714 case OP_NOTEXACTI:
3715 min = max = GET2(ecode, 1);
3716 ecode += 1 + IMM2_SIZE;
3717 goto REPEATNOTCHAR;
3718
3719 case OP_NOTUPTO:
3720 case OP_NOTUPTOI:
3721 case OP_NOTMINUPTO:
3722 case OP_NOTMINUPTOI:
3723 min = 0;
3724 max = GET2(ecode, 1);
3725 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3726 ecode += 1 + IMM2_SIZE;
3727 goto REPEATNOTCHAR;
3728
3729 case OP_NOTPOSSTAR:
3730 case OP_NOTPOSSTARI:
3731 possessive = TRUE;
3732 min = 0;
3733 max = INT_MAX;
3734 ecode++;
3735 goto REPEATNOTCHAR;
3736
3737 case OP_NOTPOSPLUS:
3738 case OP_NOTPOSPLUSI:
3739 possessive = TRUE;
3740 min = 1;
3741 max = INT_MAX;
3742 ecode++;
3743 goto REPEATNOTCHAR;
3744
3745 case OP_NOTPOSQUERY:
3746 case OP_NOTPOSQUERYI:
3747 possessive = TRUE;
3748 min = 0;
3749 max = 1;
3750 ecode++;
3751 goto REPEATNOTCHAR;
3752
3753 case OP_NOTPOSUPTO:
3754 case OP_NOTPOSUPTOI:
3755 possessive = TRUE;
3756 min = 0;
3757 max = GET2(ecode, 1);
3758 ecode += 1 + IMM2_SIZE;
3759 goto REPEATNOTCHAR;
3760
3761 case OP_NOTSTAR:
3762 case OP_NOTSTARI:
3763 case OP_NOTMINSTAR:
3764 case OP_NOTMINSTARI:
3765 case OP_NOTPLUS:
3766 case OP_NOTPLUSI:
3767 case OP_NOTMINPLUS:
3768 case OP_NOTMINPLUSI:
3769 case OP_NOTQUERY:
3770 case OP_NOTQUERYI:
3771 case OP_NOTMINQUERY:
3772 case OP_NOTMINQUERYI:
3773 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3774 minimize = (c & 1) != 0;
3775 min = rep_min[c]; /* Pick up values from tables; */
3776 max = rep_max[c]; /* zero for max => infinity */
3777 if (max == 0) max = INT_MAX;
3778
3779 /* Common code for all repeated single-byte matches. */
3780
3781 REPEATNOTCHAR:
3782 GETCHARINCTEST(fc, ecode);
3783
3784 /* The code is duplicated for the caseless and caseful cases, for speed,
3785 since matching characters is likely to be quite common. First, ensure the
3786 minimum number of matches are present. If min = max, continue at the same
3787 level without recursing. Otherwise, if minimizing, keep trying the rest of
3788 the expression and advancing one matching character if failing, up to the
3789 maximum. Alternatively, if maximizing, find the maximum number of
3790 characters and work backwards. */
3791
3792 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3793 max, (char *)eptr));
3794
3795 if (op >= OP_NOTSTARI) /* Caseless */
3796 {
3797#ifdef SUPPORT_UTF
3798#ifdef SUPPORT_UCP
3799 if (utf && fc > 127)
3800 foc = UCD_OTHERCASE(fc);
3801#else
3802 if (utf && fc > 127)
3803 foc = fc;
3804#endif /* SUPPORT_UCP */
3805 else
3806#endif /* SUPPORT_UTF */
3807 foc = TABLE_GET(fc, md->fcc, fc);
3808
3809#ifdef SUPPORT_UTF
3810 if (utf)
3811 {
3812 register pcre_uint32 d;
3813 for (i = 1; i <= min; i++)
3814 {
3815 if (eptr >= md->end_subject)
3816 {
3817 SCHECK_PARTIAL();
3818 RRETURN(MATCH_NOMATCH);
3819 }
3820 GETCHARINC(d, eptr);
3821 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3822 }
3823 }
3824 else
3825#endif /* SUPPORT_UTF */
3826 /* Not UTF mode */
3827 {
3828 for (i = 1; i <= min; i++)
3829 {
3830 if (eptr >= md->end_subject)
3831 {
3832 SCHECK_PARTIAL();
3833 RRETURN(MATCH_NOMATCH);
3834 }
3835 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3836 eptr++;
3837 }
3838 }
3839
3840 if (min == max) continue;
3841
3842 if (minimize)
3843 {
3844#ifdef SUPPORT_UTF
3845 if (utf)
3846 {
3847 register pcre_uint32 d;
3848 for (fi = min;; fi++)
3849 {
3850 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3851 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3852 if (fi >= max) RRETURN(MATCH_NOMATCH);
3853 if (eptr >= md->end_subject)
3854 {
3855 SCHECK_PARTIAL();
3856 RRETURN(MATCH_NOMATCH);
3857 }
3858 GETCHARINC(d, eptr);
3859 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3860 }
3861 }
3862 else
3863#endif /*SUPPORT_UTF */
3864 /* Not UTF mode */
3865 {
3866 for (fi = min;; fi++)
3867 {
3868 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3869 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3870 if (fi >= max) RRETURN(MATCH_NOMATCH);
3871 if (eptr >= md->end_subject)
3872 {
3873 SCHECK_PARTIAL();
3874 RRETURN(MATCH_NOMATCH);
3875 }
3876 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3877 eptr++;
3878 }
3879 }
3880 /* Control never gets here */
3881 }
3882
3883 /* Maximize case */
3884
3885 else
3886 {
3887 pp = eptr;
3888
3889#ifdef SUPPORT_UTF
3890 if (utf)
3891 {
3892 register pcre_uint32 d;
3893 for (i = min; i < max; i++)
3894 {
3895 int len = 1;
3896 if (eptr >= md->end_subject)
3897 {
3898 SCHECK_PARTIAL();
3899 break;
3900 }
3901 GETCHARLEN(d, eptr, len);
3902 if (fc == d || (unsigned int)foc == d) break;
3903 eptr += len;
3904 }
3905 if (possessive) continue; /* No backtracking */
3906 for(;;)
3907 {
3908 if (eptr <= pp) goto TAIL_RECURSE;
3909 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3910 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3911 eptr--;
3912 BACKCHAR(eptr);
3913 }
3914 }
3915 else
3916#endif /* SUPPORT_UTF */
3917 /* Not UTF mode */
3918 {
3919 for (i = min; i < max; i++)
3920 {
3921 if (eptr >= md->end_subject)
3922 {
3923 SCHECK_PARTIAL();
3924 break;
3925 }
3926 if (fc == *eptr || foc == *eptr) break;
3927 eptr++;
3928 }
3929 if (possessive) continue; /* No backtracking */
3930 for (;;)
3931 {
3932 if (eptr == pp) goto TAIL_RECURSE;
3933 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3934 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3935 eptr--;
3936 }
3937 }
3938 /* Control never gets here */
3939 }
3940 }
3941
3942 /* Caseful comparisons */
3943
3944 else
3945 {
3946#ifdef SUPPORT_UTF
3947 if (utf)
3948 {
3949 register pcre_uint32 d;
3950 for (i = 1; i <= min; i++)
3951 {
3952 if (eptr >= md->end_subject)
3953 {
3954 SCHECK_PARTIAL();
3955 RRETURN(MATCH_NOMATCH);
3956 }
3957 GETCHARINC(d, eptr);
3958 if (fc == d) RRETURN(MATCH_NOMATCH);
3959 }
3960 }
3961 else
3962#endif
3963 /* Not UTF mode */
3964 {
3965 for (i = 1; i <= min; i++)
3966 {
3967 if (eptr >= md->end_subject)
3968 {
3969 SCHECK_PARTIAL();
3970 RRETURN(MATCH_NOMATCH);
3971 }
3972 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3973 }
3974 }
3975
3976 if (min == max) continue;
3977
3978 if (minimize)
3979 {
3980#ifdef SUPPORT_UTF
3981 if (utf)
3982 {
3983 register pcre_uint32 d;
3984 for (fi = min;; fi++)
3985 {
3986 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3987 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3988 if (fi >= max) RRETURN(MATCH_NOMATCH);
3989 if (eptr >= md->end_subject)
3990 {
3991 SCHECK_PARTIAL();
3992 RRETURN(MATCH_NOMATCH);
3993 }
3994 GETCHARINC(d, eptr);
3995 if (fc == d) RRETURN(MATCH_NOMATCH);
3996 }
3997 }
3998 else
3999#endif
4000 /* Not UTF mode */
4001 {
4002 for (fi = min;; fi++)
4003 {
4004 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
4005 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4006 if (fi >= max) RRETURN(MATCH_NOMATCH);
4007 if (eptr >= md->end_subject)
4008 {
4009 SCHECK_PARTIAL();
4010 RRETURN(MATCH_NOMATCH);
4011 }
4012 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
4013 }
4014 }
4015 /* Control never gets here */
4016 }
4017
4018 /* Maximize case */
4019
4020 else
4021 {
4022 pp = eptr;
4023
4024#ifdef SUPPORT_UTF
4025 if (utf)
4026 {
4027 register pcre_uint32 d;
4028 for (i = min; i < max; i++)
4029 {
4030 int len = 1;
4031 if (eptr >= md->end_subject)
4032 {
4033 SCHECK_PARTIAL();
4034 break;
4035 }
4036 GETCHARLEN(d, eptr, len);
4037 if (fc == d) break;
4038 eptr += len;
4039 }
4040 if (possessive) continue; /* No backtracking */
4041 for(;;)
4042 {
4043 if (eptr <= pp) goto TAIL_RECURSE;
4044 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
4045 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4046 eptr--;
4047 BACKCHAR(eptr);
4048 }
4049 }
4050 else
4051#endif
4052 /* Not UTF mode */
4053 {
4054 for (i = min; i < max; i++)
4055 {
4056 if (eptr >= md->end_subject)
4057 {
4058 SCHECK_PARTIAL();
4059 break;
4060 }
4061 if (fc == *eptr) break;
4062 eptr++;
4063 }
4064 if (possessive) continue; /* No backtracking */
4065 for (;;)
4066 {
4067 if (eptr == pp) goto TAIL_RECURSE;
4068 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
4069 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4070 eptr--;
4071 }
4072 }
4073 /* Control never gets here */
4074 }
4075 }
4076 /* Control never gets here */
4077
4078 /* Match a single character type repeatedly; several different opcodes
4079 share code. This is very similar to the code for single characters, but we
4080 repeat it in the interests of efficiency. */
4081
4082 case OP_TYPEEXACT:
4083 min = max = GET2(ecode, 1);
4084 minimize = TRUE;
4085 ecode += 1 + IMM2_SIZE;
4086 goto REPEATTYPE;
4087
4088 case OP_TYPEUPTO:
4089 case OP_TYPEMINUPTO:
4090 min = 0;
4091 max = GET2(ecode, 1);
4092 minimize = *ecode == OP_TYPEMINUPTO;
4093 ecode += 1 + IMM2_SIZE;
4094 goto REPEATTYPE;
4095
4096 case OP_TYPEPOSSTAR:
4097 possessive = TRUE;
4098 min = 0;
4099 max = INT_MAX;
4100 ecode++;
4101 goto REPEATTYPE;
4102
4103 case OP_TYPEPOSPLUS:
4104 possessive = TRUE;
4105 min = 1;
4106 max = INT_MAX;
4107 ecode++;
4108 goto REPEATTYPE;
4109
4110 case OP_TYPEPOSQUERY:
4111 possessive = TRUE;
4112 min = 0;
4113 max = 1;
4114 ecode++;
4115 goto REPEATTYPE;
4116
4117 case OP_TYPEPOSUPTO:
4118 possessive = TRUE;
4119 min = 0;
4120 max = GET2(ecode, 1);
4121 ecode += 1 + IMM2_SIZE;
4122 goto REPEATTYPE;
4123
4124 case OP_TYPESTAR:
4125 case OP_TYPEMINSTAR:
4126 case OP_TYPEPLUS:
4127 case OP_TYPEMINPLUS:
4128 case OP_TYPEQUERY:
4129 case OP_TYPEMINQUERY:
4130 c = *ecode++ - OP_TYPESTAR;
4131 minimize = (c & 1) != 0;
4132 min = rep_min[c]; /* Pick up values from tables; */
4133 max = rep_max[c]; /* zero for max => infinity */
4134 if (max == 0) max = INT_MAX;
4135
4136 /* Common code for all repeated single character type matches. Note that
4137 in UTF-8 mode, '.' matches a character of any length, but for the other
4138 character types, the valid characters are all one-byte long. */
4139
4140 REPEATTYPE:
4141 ctype = *ecode++; /* Code for the character type */
4142
4143#ifdef SUPPORT_UCP
4144 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4145 {
4146 prop_fail_result = ctype == OP_NOTPROP;
4147 prop_type = *ecode++;
4148 prop_value = *ecode++;
4149 }
4150 else prop_type = -1;
4151#endif
4152
4153 /* First, ensure the minimum number of matches are present. Use inline
4154 code for maximizing the speed, and do the type test once at the start
4155 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4156 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4157 and single-bytes. */
4158
4159 if (min > 0)
4160 {
4161#ifdef SUPPORT_UCP
4162 if (prop_type >= 0)
4163 {
4164 switch(prop_type)
4165 {
4166 case PT_ANY:
4167 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4168 for (i = 1; i <= min; i++)
4169 {
4170 if (eptr >= md->end_subject)
4171 {
4172 SCHECK_PARTIAL();
4173 RRETURN(MATCH_NOMATCH);
4174 }
4175 GETCHARINCTEST(c, eptr);
4176 }
4177 break;
4178
4179 case PT_LAMP:
4180 for (i = 1; i <= min; i++)
4181 {
4182 int chartype;
4183 if (eptr >= md->end_subject)
4184 {
4185 SCHECK_PARTIAL();
4186 RRETURN(MATCH_NOMATCH);
4187 }
4188 GETCHARINCTEST(c, eptr);
4189 chartype = UCD_CHARTYPE(c);
4190 if ((chartype == ucp_Lu ||
4191 chartype == ucp_Ll ||
4192 chartype == ucp_Lt) == prop_fail_result)
4193 RRETURN(MATCH_NOMATCH);
4194 }
4195 break;
4196
4197 case PT_GC:
4198 for (i = 1; i <= min; i++)
4199 {
4200 if (eptr >= md->end_subject)
4201 {
4202 SCHECK_PARTIAL();
4203 RRETURN(MATCH_NOMATCH);
4204 }
4205 GETCHARINCTEST(c, eptr);
4206 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4207 RRETURN(MATCH_NOMATCH);
4208 }
4209 break;
4210
4211 case PT_PC:
4212 for (i = 1; i <= min; i++)
4213 {
4214 if (eptr >= md->end_subject)
4215 {
4216 SCHECK_PARTIAL();
4217 RRETURN(MATCH_NOMATCH);
4218 }
4219 GETCHARINCTEST(c, eptr);
4220 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4221 RRETURN(MATCH_NOMATCH);
4222 }
4223 break;
4224
4225 case PT_SC:
4226 for (i = 1; i <= min; i++)
4227 {
4228 if (eptr >= md->end_subject)
4229 {
4230 SCHECK_PARTIAL();
4231 RRETURN(MATCH_NOMATCH);
4232 }
4233 GETCHARINCTEST(c, eptr);
4234 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4235 RRETURN(MATCH_NOMATCH);
4236 }
4237 break;
4238
4239 case PT_ALNUM:
4240 for (i = 1; i <= min; i++)
4241 {
4242 int category;
4243 if (eptr >= md->end_subject)
4244 {
4245 SCHECK_PARTIAL();
4246 RRETURN(MATCH_NOMATCH);
4247 }
4248 GETCHARINCTEST(c, eptr);
4249 category = UCD_CATEGORY(c);
4250 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4251 RRETURN(MATCH_NOMATCH);
4252 }
4253 break;
4254
4255 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4256 which means that Perl space and POSIX space are now identical. PCRE
4257 was changed at release 8.34. */
4258
4259 case PT_SPACE: /* Perl space */
4260 case PT_PXSPACE: /* POSIX space */
4261 for (i = 1; i <= min; i++)
4262 {
4263 if (eptr >= md->end_subject)
4264 {
4265 SCHECK_PARTIAL();
4266 RRETURN(MATCH_NOMATCH);
4267 }
4268 GETCHARINCTEST(c, eptr);
4269 switch(c)
4270 {
4271 HSPACE_CASES:
4272 VSPACE_CASES:
4273 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4274 break;
4275
4276 default:
4277 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
4278 RRETURN(MATCH_NOMATCH);
4279 break;
4280 }
4281 }
4282 break;
4283
4284 case PT_WORD:
4285 for (i = 1; i <= min; i++)
4286 {
4287 int category;
4288 if (eptr >= md->end_subject)
4289 {
4290 SCHECK_PARTIAL();
4291 RRETURN(MATCH_NOMATCH);
4292 }
4293 GETCHARINCTEST(c, eptr);
4294 category = UCD_CATEGORY(c);
4295 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4296 == prop_fail_result)
4297 RRETURN(MATCH_NOMATCH);
4298 }
4299 break;
4300
4301 case PT_CLIST:
4302 for (i = 1; i <= min; i++)
4303 {
4304 const pcre_uint32 *cp;
4305 if (eptr >= md->end_subject)
4306 {
4307 SCHECK_PARTIAL();
4308 RRETURN(MATCH_NOMATCH);
4309 }
4310 GETCHARINCTEST(c, eptr);
4311 cp = PRIV(ucd_caseless_sets) + prop_value;
4312 for (;;)
4313 {
4314 if (c < *cp)
4315 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4316 if (c == *cp++)
4317 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4318 }
4319 }
4320 break;
4321
4322 case PT_UCNC:
4323 for (i = 1; i <= min; i++)
4324 {
4325 if (eptr >= md->end_subject)
4326 {
4327 SCHECK_PARTIAL();
4328 RRETURN(MATCH_NOMATCH);
4329 }
4330 GETCHARINCTEST(c, eptr);
4331 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4332 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4333 c >= 0xe000) == prop_fail_result)
4334 RRETURN(MATCH_NOMATCH);
4335 }
4336 break;
4337
4338 /* This should not occur */
4339
4340 default:
4341 RRETURN(PCRE_ERROR_INTERNAL);
4342 }
4343 }
4344
4345 /* Match extended Unicode grapheme clusters. We will get here only if the
4346 support is in the binary; otherwise a compile-time error occurs. */
4347
4348 else if (ctype == OP_EXTUNI)
4349 {
4350 for (i = 1; i <= min; i++)
4351 {
4352 if (eptr >= md->end_subject)
4353 {
4354 SCHECK_PARTIAL();
4355 RRETURN(MATCH_NOMATCH);
4356 }
4357 else
4358 {
4359 int lgb, rgb;
4360 GETCHARINCTEST(c, eptr);
4361 lgb = UCD_GRAPHBREAK(c);
4362 while (eptr < md->end_subject)
4363 {
4364 int len = 1;
4365 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4366 rgb = UCD_GRAPHBREAK(c);
4367 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4368 lgb = rgb;
4369 eptr += len;
4370 }
4371 }
4372 CHECK_PARTIAL();
4373 }
4374 }
4375
4376 else
4377#endif /* SUPPORT_UCP */
4378
4379/* Handle all other cases when in UTF mode (the utf flag is set) */
4380
4381#ifdef SUPPORT_UTF
4382 if (utf) switch(ctype)
4383 {
4384 case OP_ANY:
4385 for (i = 1; i <= min; i++)
4386 {
4387 if (eptr >= md->end_subject)
4388 {
4389 SCHECK_PARTIAL();
4390 RRETURN(MATCH_NOMATCH);
4391 }
4392 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4393 if (md->partial != 0 &&
4394 eptr + 1 >= md->end_subject &&
4395 NLBLOCK->nltype == NLTYPE_FIXED &&
4396 NLBLOCK->nllen == 2 &&
4397 UCHAR21(eptr) == NLBLOCK->nl[0])
4398 {
4399 md->hitend = TRUE;
4400 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4401 }
4402 eptr++;
4403 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4404 }
4405 break;
4406
4407 case OP_ALLANY:
4408 for (i = 1; i <= min; i++)
4409 {
4410 if (eptr >= md->end_subject)
4411 {
4412 SCHECK_PARTIAL();
4413 RRETURN(MATCH_NOMATCH);
4414 }
4415 eptr++;
4416 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4417 }
4418 break;
4419
4420 case OP_ANYBYTE:
4421 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4422 eptr += min;
4423 break;
4424
4425 case OP_ANYNL:
4426 for (i = 1; i <= min; i++)
4427 {
4428 if (eptr >= md->end_subject)
4429 {
4430 SCHECK_PARTIAL();
4431 RRETURN(MATCH_NOMATCH);
4432 }
4433 GETCHARINC(c, eptr);
4434 switch(c)
4435 {
4436 default: RRETURN(MATCH_NOMATCH);
4437
4438 case CHAR_CR:
4439 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
4440 break;
4441
4442 case CHAR_LF:
4443 break;
4444
4445 case CHAR_VT:
4446 case CHAR_FF:
4447 case CHAR_NEL:
4448#ifndef EBCDIC
4449 case 0x2028:
4450 case 0x2029:
4451#endif /* Not EBCDIC */
4452 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4453 break;
4454 }
4455 }
4456 break;
4457
4458 case OP_NOT_HSPACE:
4459 for (i = 1; i <= min; i++)
4460 {
4461 if (eptr >= md->end_subject)
4462 {
4463 SCHECK_PARTIAL();
4464 RRETURN(MATCH_NOMATCH);
4465 }
4466 GETCHARINC(c, eptr);
4467 switch(c)
4468 {
4469 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4470 default: break;
4471 }
4472 }
4473 break;
4474
4475 case OP_HSPACE:
4476 for (i = 1; i <= min; i++)
4477 {
4478 if (eptr >= md->end_subject)
4479 {
4480 SCHECK_PARTIAL();
4481 RRETURN(MATCH_NOMATCH);
4482 }
4483 GETCHARINC(c, eptr);
4484 switch(c)
4485 {
4486 HSPACE_CASES: break; /* Byte and multibyte cases */
4487 default: RRETURN(MATCH_NOMATCH);
4488 }
4489 }
4490 break;
4491
4492 case OP_NOT_VSPACE:
4493 for (i = 1; i <= min; i++)
4494 {
4495 if (eptr >= md->end_subject)
4496 {
4497 SCHECK_PARTIAL();
4498 RRETURN(MATCH_NOMATCH);
4499 }
4500 GETCHARINC(c, eptr);
4501 switch(c)
4502 {
4503 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4504 default: break;
4505 }
4506 }
4507 break;
4508
4509 case OP_VSPACE:
4510 for (i = 1; i <= min; i++)
4511 {
4512 if (eptr >= md->end_subject)
4513 {
4514 SCHECK_PARTIAL();
4515 RRETURN(MATCH_NOMATCH);
4516 }
4517 GETCHARINC(c, eptr);
4518 switch(c)
4519 {
4520 VSPACE_CASES: break;
4521 default: RRETURN(MATCH_NOMATCH);
4522 }
4523 }
4524 break;
4525
4526 case OP_NOT_DIGIT:
4527 for (i = 1; i <= min; i++)
4528 {
4529 if (eptr >= md->end_subject)
4530 {
4531 SCHECK_PARTIAL();
4532 RRETURN(MATCH_NOMATCH);
4533 }
4534 GETCHARINC(c, eptr);
4535 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4536 RRETURN(MATCH_NOMATCH);
4537 }
4538 break;
4539
4540 case OP_DIGIT:
4541 for (i = 1; i <= min; i++)
4542 {
4543 pcre_uint32 cc;
4544 if (eptr >= md->end_subject)
4545 {
4546 SCHECK_PARTIAL();
4547 RRETURN(MATCH_NOMATCH);
4548 }
4549 cc = UCHAR21(eptr);
4550 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4551 RRETURN(MATCH_NOMATCH);
4552 eptr++;
4553 /* No need to skip more bytes - we know it's a 1-byte character */
4554 }
4555 break;
4556
4557 case OP_NOT_WHITESPACE:
4558 for (i = 1; i <= min; i++)
4559 {
4560 pcre_uint32 cc;
4561 if (eptr >= md->end_subject)
4562 {
4563 SCHECK_PARTIAL();
4564 RRETURN(MATCH_NOMATCH);
4565 }
4566 cc = UCHAR21(eptr);
4567 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4568 RRETURN(MATCH_NOMATCH);
4569 eptr++;
4570 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4571 }
4572 break;
4573
4574 case OP_WHITESPACE:
4575 for (i = 1; i <= min; i++)
4576 {
4577 pcre_uint32 cc;
4578 if (eptr >= md->end_subject)
4579 {
4580 SCHECK_PARTIAL();
4581 RRETURN(MATCH_NOMATCH);
4582 }
4583 cc = UCHAR21(eptr);
4584 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4585 RRETURN(MATCH_NOMATCH);
4586 eptr++;
4587 /* No need to skip more bytes - we know it's a 1-byte character */
4588 }
4589 break;
4590
4591 case OP_NOT_WORDCHAR:
4592 for (i = 1; i <= min; i++)
4593 {
4594 pcre_uint32 cc;
4595 if (eptr >= md->end_subject)
4596 {
4597 SCHECK_PARTIAL();
4598 RRETURN(MATCH_NOMATCH);
4599 }
4600 cc = UCHAR21(eptr);
4601 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4602 RRETURN(MATCH_NOMATCH);
4603 eptr++;
4604 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4605 }
4606 break;
4607
4608 case OP_WORDCHAR:
4609 for (i = 1; i <= min; i++)
4610 {
4611 pcre_uint32 cc;
4612 if (eptr >= md->end_subject)
4613 {
4614 SCHECK_PARTIAL();
4615 RRETURN(MATCH_NOMATCH);
4616 }
4617 cc = UCHAR21(eptr);
4618 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4619 RRETURN(MATCH_NOMATCH);
4620 eptr++;
4621 /* No need to skip more bytes - we know it's a 1-byte character */
4622 }
4623 break;
4624
4625 default:
4626 RRETURN(PCRE_ERROR_INTERNAL);
4627 } /* End switch(ctype) */
4628
4629 else
4630#endif /* SUPPORT_UTF */
4631
4632 /* Code for the non-UTF case for minimum matching of operators other
4633 than OP_PROP and OP_NOTPROP. */
4634
4635 switch(ctype)
4636 {
4637 case OP_ANY:
4638 for (i = 1; i <= min; i++)
4639 {
4640 if (eptr >= md->end_subject)
4641 {
4642 SCHECK_PARTIAL();
4643 RRETURN(MATCH_NOMATCH);
4644 }
4645 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4646 if (md->partial != 0 &&
4647 eptr + 1 >= md->end_subject &&
4648 NLBLOCK->nltype == NLTYPE_FIXED &&
4649 NLBLOCK->nllen == 2 &&
4650 *eptr == NLBLOCK->nl[0])
4651 {
4652 md->hitend = TRUE;
4653 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4654 }
4655 eptr++;
4656 }
4657 break;
4658
4659 case OP_ALLANY:
4660 if (eptr > md->end_subject - min)
4661 {
4662 SCHECK_PARTIAL();
4663 RRETURN(MATCH_NOMATCH);
4664 }
4665 eptr += min;
4666 break;
4667
4668 case OP_ANYBYTE:
4669 if (eptr > md->end_subject - min)
4670 {
4671 SCHECK_PARTIAL();
4672 RRETURN(MATCH_NOMATCH);
4673 }
4674 eptr += min;
4675 break;
4676
4677 case OP_ANYNL:
4678 for (i = 1; i <= min; i++)
4679 {
4680 if (eptr >= md->end_subject)
4681 {
4682 SCHECK_PARTIAL();
4683 RRETURN(MATCH_NOMATCH);
4684 }
4685 switch(*eptr++)
4686 {
4687 default: RRETURN(MATCH_NOMATCH);
4688
4689 case CHAR_CR:
4690 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4691 break;
4692
4693 case CHAR_LF:
4694 break;
4695
4696 case CHAR_VT:
4697 case CHAR_FF:
4698 case CHAR_NEL:
4699#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4700 case 0x2028:
4701 case 0x2029:
4702#endif
4703 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4704 break;
4705 }
4706 }
4707 break;
4708
4709 case OP_NOT_HSPACE:
4710 for (i = 1; i <= min; i++)
4711 {
4712 if (eptr >= md->end_subject)
4713 {
4714 SCHECK_PARTIAL();
4715 RRETURN(MATCH_NOMATCH);
4716 }
4717 switch(*eptr++)
4718 {
4719 default: break;
4720 HSPACE_BYTE_CASES:
4721#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4722 HSPACE_MULTIBYTE_CASES:
4723#endif
4724 RRETURN(MATCH_NOMATCH);
4725 }
4726 }
4727 break;
4728
4729 case OP_HSPACE:
4730 for (i = 1; i <= min; i++)
4731 {
4732 if (eptr >= md->end_subject)
4733 {
4734 SCHECK_PARTIAL();
4735 RRETURN(MATCH_NOMATCH);
4736 }
4737 switch(*eptr++)
4738 {
4739 default: RRETURN(MATCH_NOMATCH);
4740 HSPACE_BYTE_CASES:
4741#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4742 HSPACE_MULTIBYTE_CASES:
4743#endif
4744 break;
4745 }
4746 }
4747 break;
4748
4749 case OP_NOT_VSPACE:
4750 for (i = 1; i <= min; i++)
4751 {
4752 if (eptr >= md->end_subject)
4753 {
4754 SCHECK_PARTIAL();
4755 RRETURN(MATCH_NOMATCH);
4756 }
4757 switch(*eptr++)
4758 {
4759 VSPACE_BYTE_CASES:
4760#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4761 VSPACE_MULTIBYTE_CASES:
4762#endif
4763 RRETURN(MATCH_NOMATCH);
4764 default: break;
4765 }
4766 }
4767 break;
4768
4769 case OP_VSPACE:
4770 for (i = 1; i <= min; i++)
4771 {
4772 if (eptr >= md->end_subject)
4773 {
4774 SCHECK_PARTIAL();
4775 RRETURN(MATCH_NOMATCH);
4776 }
4777 switch(*eptr++)
4778 {
4779 default: RRETURN(MATCH_NOMATCH);
4780 VSPACE_BYTE_CASES:
4781#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4782 VSPACE_MULTIBYTE_CASES:
4783#endif
4784 break;
4785 }
4786 }
4787 break;
4788
4789 case OP_NOT_DIGIT:
4790 for (i = 1; i <= min; i++)
4791 {
4792 if (eptr >= md->end_subject)
4793 {
4794 SCHECK_PARTIAL();
4795 RRETURN(MATCH_NOMATCH);
4796 }
4797 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4798 RRETURN(MATCH_NOMATCH);
4799 eptr++;
4800 }
4801 break;
4802
4803 case OP_DIGIT:
4804 for (i = 1; i <= min; i++)
4805 {
4806 if (eptr >= md->end_subject)
4807 {
4808 SCHECK_PARTIAL();
4809 RRETURN(MATCH_NOMATCH);
4810 }
4811 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4812 RRETURN(MATCH_NOMATCH);
4813 eptr++;
4814 }
4815 break;
4816
4817 case OP_NOT_WHITESPACE:
4818 for (i = 1; i <= min; i++)
4819 {
4820 if (eptr >= md->end_subject)
4821 {
4822 SCHECK_PARTIAL();
4823 RRETURN(MATCH_NOMATCH);
4824 }
4825 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4826 RRETURN(MATCH_NOMATCH);
4827 eptr++;
4828 }
4829 break;
4830
4831 case OP_WHITESPACE:
4832 for (i = 1; i <= min; i++)
4833 {
4834 if (eptr >= md->end_subject)
4835 {
4836 SCHECK_PARTIAL();
4837 RRETURN(MATCH_NOMATCH);
4838 }
4839 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4840 RRETURN(MATCH_NOMATCH);
4841 eptr++;
4842 }
4843 break;
4844
4845 case OP_NOT_WORDCHAR:
4846 for (i = 1; i <= min; i++)
4847 {
4848 if (eptr >= md->end_subject)
4849 {
4850 SCHECK_PARTIAL();
4851 RRETURN(MATCH_NOMATCH);
4852 }
4853 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4854 RRETURN(MATCH_NOMATCH);
4855 eptr++;
4856 }
4857 break;
4858
4859 case OP_WORDCHAR:
4860 for (i = 1; i <= min; i++)
4861 {
4862 if (eptr >= md->end_subject)
4863 {
4864 SCHECK_PARTIAL();
4865 RRETURN(MATCH_NOMATCH);
4866 }
4867 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4868 RRETURN(MATCH_NOMATCH);
4869 eptr++;
4870 }
4871 break;
4872
4873 default:
4874 RRETURN(PCRE_ERROR_INTERNAL);
4875 }
4876 }
4877
4878 /* If min = max, continue at the same level without recursing */
4879
4880 if (min == max) continue;
4881
4882 /* If minimizing, we have to test the rest of the pattern before each
4883 subsequent match. Again, separate the UTF case for speed, and also
4884 separate the UCP cases. */
4885
4886 if (minimize)
4887 {
4888#ifdef SUPPORT_UCP
4889 if (prop_type >= 0)
4890 {
4891 switch(prop_type)
4892 {
4893 case PT_ANY:
4894 for (fi = min;; fi++)
4895 {
4896 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4897 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4898 if (fi >= max) RRETURN(MATCH_NOMATCH);
4899 if (eptr >= md->end_subject)
4900 {
4901 SCHECK_PARTIAL();
4902 RRETURN(MATCH_NOMATCH);
4903 }
4904 GETCHARINCTEST(c, eptr);
4905 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4906 }
4907 /* Control never gets here */
4908
4909 case PT_LAMP:
4910 for (fi = min;; fi++)
4911 {
4912 int chartype;
4913 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4914 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4915 if (fi >= max) RRETURN(MATCH_NOMATCH);
4916 if (eptr >= md->end_subject)
4917 {
4918 SCHECK_PARTIAL();
4919 RRETURN(MATCH_NOMATCH);
4920 }
4921 GETCHARINCTEST(c, eptr);
4922 chartype = UCD_CHARTYPE(c);
4923 if ((chartype == ucp_Lu ||
4924 chartype == ucp_Ll ||
4925 chartype == ucp_Lt) == prop_fail_result)
4926 RRETURN(MATCH_NOMATCH);
4927 }
4928 /* Control never gets here */
4929
4930 case PT_GC:
4931 for (fi = min;; fi++)
4932 {
4933 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4934 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4935 if (fi >= max) RRETURN(MATCH_NOMATCH);
4936 if (eptr >= md->end_subject)
4937 {
4938 SCHECK_PARTIAL();
4939 RRETURN(MATCH_NOMATCH);
4940 }
4941 GETCHARINCTEST(c, eptr);
4942 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4943 RRETURN(MATCH_NOMATCH);
4944 }
4945 /* Control never gets here */
4946
4947 case PT_PC:
4948 for (fi = min;; fi++)
4949 {
4950 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4951 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4952 if (fi >= max) RRETURN(MATCH_NOMATCH);
4953 if (eptr >= md->end_subject)
4954 {
4955 SCHECK_PARTIAL();
4956 RRETURN(MATCH_NOMATCH);
4957 }
4958 GETCHARINCTEST(c, eptr);
4959 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4960 RRETURN(MATCH_NOMATCH);
4961 }
4962 /* Control never gets here */
4963
4964 case PT_SC:
4965 for (fi = min;; fi++)
4966 {
4967 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4968 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4969 if (fi >= max) RRETURN(MATCH_NOMATCH);
4970 if (eptr >= md->end_subject)
4971 {
4972 SCHECK_PARTIAL();
4973 RRETURN(MATCH_NOMATCH);
4974 }
4975 GETCHARINCTEST(c, eptr);
4976 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4977 RRETURN(MATCH_NOMATCH);
4978 }
4979 /* Control never gets here */
4980
4981 case PT_ALNUM:
4982 for (fi = min;; fi++)
4983 {
4984 int category;
4985 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4986 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4987 if (fi >= max) RRETURN(MATCH_NOMATCH);
4988 if (eptr >= md->end_subject)
4989 {
4990 SCHECK_PARTIAL();
4991 RRETURN(MATCH_NOMATCH);
4992 }
4993 GETCHARINCTEST(c, eptr);
4994 category = UCD_CATEGORY(c);
4995 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4996 RRETURN(MATCH_NOMATCH);
4997 }
4998 /* Control never gets here */
4999
5000 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5001 which means that Perl space and POSIX space are now identical. PCRE
5002 was changed at release 8.34. */
5003
5004 case PT_SPACE: /* Perl space */
5005 case PT_PXSPACE: /* POSIX space */
5006 for (fi = min;; fi++)
5007 {
5008 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
5009 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5010 if (fi >= max) RRETURN(MATCH_NOMATCH);
5011 if (eptr >= md->end_subject)
5012 {
5013 SCHECK_PARTIAL();
5014 RRETURN(MATCH_NOMATCH);
5015 }
5016 GETCHARINCTEST(c, eptr);
5017 switch(c)
5018 {
5019 HSPACE_CASES:
5020 VSPACE_CASES:
5021 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
5022 break;
5023
5024 default:
5025 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5026 RRETURN(MATCH_NOMATCH);
5027 break;
5028 }
5029 }
5030 /* Control never gets here */
5031
5032 case PT_WORD:
5033 for (fi = min;; fi++)
5034 {
5035 int category;
5036 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
5037 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5038 if (fi >= max) RRETURN(MATCH_NOMATCH);
5039 if (eptr >= md->end_subject)
5040 {
5041 SCHECK_PARTIAL();
5042 RRETURN(MATCH_NOMATCH);
5043 }
5044 GETCHARINCTEST(c, eptr);
5045 category = UCD_CATEGORY(c);
5046 if ((category == ucp_L ||
5047 category == ucp_N ||
5048 c == CHAR_UNDERSCORE)
5049 == prop_fail_result)
5050 RRETURN(MATCH_NOMATCH);
5051 }
5052 /* Control never gets here */
5053
5054 case PT_CLIST:
5055 for (fi = min;; fi++)
5056 {
5057 const pcre_uint32 *cp;
5058 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
5059 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5060 if (fi >= max) RRETURN(MATCH_NOMATCH);
5061 if (eptr >= md->end_subject)
5062 {
5063 SCHECK_PARTIAL();
5064 RRETURN(MATCH_NOMATCH);
5065 }
5066 GETCHARINCTEST(c, eptr);
5067 cp = PRIV(ucd_caseless_sets) + prop_value;
5068 for (;;)
5069 {
5070 if (c < *cp)
5071 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5072 if (c == *cp++)
5073 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5074 }
5075 }
5076 /* Control never gets here */
5077
5078 case PT_UCNC:
5079 for (fi = min;; fi++)
5080 {
5081 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
5082 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5083 if (fi >= max) RRETURN(MATCH_NOMATCH);
5084 if (eptr >= md->end_subject)
5085 {
5086 SCHECK_PARTIAL();
5087 RRETURN(MATCH_NOMATCH);
5088 }
5089 GETCHARINCTEST(c, eptr);
5090 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5091 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5092 c >= 0xe000) == prop_fail_result)
5093 RRETURN(MATCH_NOMATCH);
5094 }
5095 /* Control never gets here */
5096
5097 /* This should never occur */
5098 default:
5099 RRETURN(PCRE_ERROR_INTERNAL);
5100 }
5101 }
5102
5103 /* Match extended Unicode grapheme clusters. We will get here only if the
5104 support is in the binary; otherwise a compile-time error occurs. */
5105
5106 else if (ctype == OP_EXTUNI)
5107 {
5108 for (fi = min;; fi++)
5109 {
5110 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5111 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5112 if (fi >= max) RRETURN(MATCH_NOMATCH);
5113 if (eptr >= md->end_subject)
5114 {
5115 SCHECK_PARTIAL();
5116 RRETURN(MATCH_NOMATCH);
5117 }
5118 else
5119 {
5120 int lgb, rgb;
5121 GETCHARINCTEST(c, eptr);
5122 lgb = UCD_GRAPHBREAK(c);
5123 while (eptr < md->end_subject)
5124 {
5125 int len = 1;
5126 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5127 rgb = UCD_GRAPHBREAK(c);
5128 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5129 lgb = rgb;
5130 eptr += len;
5131 }
5132 }
5133 CHECK_PARTIAL();
5134 }
5135 }
5136 else
5137#endif /* SUPPORT_UCP */
5138
5139#ifdef SUPPORT_UTF
5140 if (utf)
5141 {
5142 for (fi = min;; fi++)
5143 {
5144 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5145 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5146 if (fi >= max) RRETURN(MATCH_NOMATCH);
5147 if (eptr >= md->end_subject)
5148 {
5149 SCHECK_PARTIAL();
5150 RRETURN(MATCH_NOMATCH);
5151 }
5152 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5153 RRETURN(MATCH_NOMATCH);
5154 GETCHARINC(c, eptr);
5155 switch(ctype)
5156 {
5157 case OP_ANY: /* This is the non-NL case */
5158 if (md->partial != 0 && /* Take care with CRLF partial */
5159 eptr >= md->end_subject &&
5160 NLBLOCK->nltype == NLTYPE_FIXED &&
5161 NLBLOCK->nllen == 2 &&
5162 c == NLBLOCK->nl[0])
5163 {
5164 md->hitend = TRUE;
5165 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5166 }
5167 break;
5168
5169 case OP_ALLANY:
5170 case OP_ANYBYTE:
5171 break;
5172
5173 case OP_ANYNL:
5174 switch(c)
5175 {
5176 default: RRETURN(MATCH_NOMATCH);
5177 case CHAR_CR:
5178 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
5179 break;
5180
5181 case CHAR_LF:
5182 break;
5183
5184 case CHAR_VT:
5185 case CHAR_FF:
5186 case CHAR_NEL:
5187#ifndef EBCDIC
5188 case 0x2028:
5189 case 0x2029:
5190#endif /* Not EBCDIC */
5191 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5192 break;
5193 }
5194 break;
5195
5196 case OP_NOT_HSPACE:
5197 switch(c)
5198 {
5199 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5200 default: break;
5201 }
5202 break;
5203
5204 case OP_HSPACE:
5205 switch(c)
5206 {
5207 HSPACE_CASES: break;
5208 default: RRETURN(MATCH_NOMATCH);
5209 }
5210 break;
5211
5212 case OP_NOT_VSPACE:
5213 switch(c)
5214 {
5215 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5216 default: break;
5217 }
5218 break;
5219
5220 case OP_VSPACE:
5221 switch(c)
5222 {
5223 VSPACE_CASES: break;
5224 default: RRETURN(MATCH_NOMATCH);
5225 }
5226 break;
5227
5228 case OP_NOT_DIGIT:
5229 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5230 RRETURN(MATCH_NOMATCH);
5231 break;
5232
5233 case OP_DIGIT:
5234 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5235 RRETURN(MATCH_NOMATCH);
5236 break;
5237
5238 case OP_NOT_WHITESPACE:
5239 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5240 RRETURN(MATCH_NOMATCH);
5241 break;
5242
5243 case OP_WHITESPACE:
5244 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5245 RRETURN(MATCH_NOMATCH);
5246 break;
5247
5248 case OP_NOT_WORDCHAR:
5249 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5250 RRETURN(MATCH_NOMATCH);
5251 break;
5252
5253 case OP_WORDCHAR:
5254 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5255 RRETURN(MATCH_NOMATCH);
5256 break;
5257
5258 default:
5259 RRETURN(PCRE_ERROR_INTERNAL);
5260 }
5261 }
5262 }
5263 else
5264#endif
5265 /* Not UTF mode */
5266 {
5267 for (fi = min;; fi++)
5268 {
5269 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5270 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5271 if (fi >= max) RRETURN(MATCH_NOMATCH);
5272 if (eptr >= md->end_subject)
5273 {
5274 SCHECK_PARTIAL();
5275 RRETURN(MATCH_NOMATCH);
5276 }
5277 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5278 RRETURN(MATCH_NOMATCH);
5279 c = *eptr++;
5280 switch(ctype)
5281 {
5282 case OP_ANY: /* This is the non-NL case */
5283 if (md->partial != 0 && /* Take care with CRLF partial */
5284 eptr >= md->end_subject &&
5285 NLBLOCK->nltype == NLTYPE_FIXED &&
5286 NLBLOCK->nllen == 2 &&
5287 c == NLBLOCK->nl[0])
5288 {
5289 md->hitend = TRUE;
5290 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5291 }
5292 break;
5293
5294 case OP_ALLANY:
5295 case OP_ANYBYTE:
5296 break;
5297
5298 case OP_ANYNL:
5299 switch(c)
5300 {
5301 default: RRETURN(MATCH_NOMATCH);
5302 case CHAR_CR:
5303 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5304 break;
5305
5306 case CHAR_LF:
5307 break;
5308
5309 case CHAR_VT:
5310 case CHAR_FF:
5311 case CHAR_NEL:
5312#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5313 case 0x2028:
5314 case 0x2029:
5315#endif
5316 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5317 break;
5318 }
5319 break;
5320
5321 case OP_NOT_HSPACE:
5322 switch(c)
5323 {
5324 default: break;
5325 HSPACE_BYTE_CASES:
5326#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5327 HSPACE_MULTIBYTE_CASES:
5328#endif
5329 RRETURN(MATCH_NOMATCH);
5330 }
5331 break;
5332
5333 case OP_HSPACE:
5334 switch(c)
5335 {
5336 default: RRETURN(MATCH_NOMATCH);
5337 HSPACE_BYTE_CASES:
5338#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5339 HSPACE_MULTIBYTE_CASES:
5340#endif
5341 break;
5342 }
5343 break;
5344
5345 case OP_NOT_VSPACE:
5346 switch(c)
5347 {
5348 default: break;
5349 VSPACE_BYTE_CASES:
5350#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5351 VSPACE_MULTIBYTE_CASES:
5352#endif
5353 RRETURN(MATCH_NOMATCH);
5354 }
5355 break;
5356
5357 case OP_VSPACE:
5358 switch(c)
5359 {
5360 default: RRETURN(MATCH_NOMATCH);
5361 VSPACE_BYTE_CASES:
5362#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5363 VSPACE_MULTIBYTE_CASES:
5364#endif
5365 break;
5366 }
5367 break;
5368
5369 case OP_NOT_DIGIT:
5370 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5371 break;
5372
5373 case OP_DIGIT:
5374 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5375 break;
5376
5377 case OP_NOT_WHITESPACE:
5378 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5379 break;
5380
5381 case OP_WHITESPACE:
5382 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5383 break;
5384
5385 case OP_NOT_WORDCHAR:
5386 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5387 break;
5388
5389 case OP_WORDCHAR:
5390 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5391 break;
5392
5393 default:
5394 RRETURN(PCRE_ERROR_INTERNAL);
5395 }
5396 }
5397 }
5398 /* Control never gets here */
5399 }
5400
5401 /* If maximizing, it is worth using inline code for speed, doing the type
5402 test once at the start (i.e. keep it out of the loop). Again, keep the
5403 UTF and UCP stuff separate. */
5404
5405 else
5406 {
5407 pp = eptr; /* Remember where we started */
5408
5409#ifdef SUPPORT_UCP
5410 if (prop_type >= 0)
5411 {
5412 switch(prop_type)
5413 {
5414 case PT_ANY:
5415 for (i = min; i < max; i++)
5416 {
5417 int len = 1;
5418 if (eptr >= md->end_subject)
5419 {
5420 SCHECK_PARTIAL();
5421 break;
5422 }
5423 GETCHARLENTEST(c, eptr, len);
5424 if (prop_fail_result) break;
5425 eptr+= len;
5426 }
5427 break;
5428
5429 case PT_LAMP:
5430 for (i = min; i < max; i++)
5431 {
5432 int chartype;
5433 int len = 1;
5434 if (eptr >= md->end_subject)
5435 {
5436 SCHECK_PARTIAL();
5437 break;
5438 }
5439 GETCHARLENTEST(c, eptr, len);
5440 chartype = UCD_CHARTYPE(c);
5441 if ((chartype == ucp_Lu ||
5442 chartype == ucp_Ll ||
5443 chartype == ucp_Lt) == prop_fail_result)
5444 break;
5445 eptr+= len;
5446 }
5447 break;
5448
5449 case PT_GC:
5450 for (i = min; i < max; i++)
5451 {
5452 int len = 1;
5453 if (eptr >= md->end_subject)
5454 {
5455 SCHECK_PARTIAL();
5456 break;
5457 }
5458 GETCHARLENTEST(c, eptr, len);
5459 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5460 eptr+= len;
5461 }
5462 break;
5463
5464 case PT_PC:
5465 for (i = min; i < max; i++)
5466 {
5467 int len = 1;
5468 if (eptr >= md->end_subject)
5469 {
5470 SCHECK_PARTIAL();
5471 break;
5472 }
5473 GETCHARLENTEST(c, eptr, len);
5474 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5475 eptr+= len;
5476 }
5477 break;
5478
5479 case PT_SC:
5480 for (i = min; i < max; i++)
5481 {
5482 int len = 1;
5483 if (eptr >= md->end_subject)
5484 {
5485 SCHECK_PARTIAL();
5486 break;
5487 }
5488 GETCHARLENTEST(c, eptr, len);
5489 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5490 eptr+= len;
5491 }
5492 break;
5493
5494 case PT_ALNUM:
5495 for (i = min; i < max; i++)
5496 {
5497 int category;
5498 int len = 1;
5499 if (eptr >= md->end_subject)
5500 {
5501 SCHECK_PARTIAL();
5502 break;
5503 }
5504 GETCHARLENTEST(c, eptr, len);
5505 category = UCD_CATEGORY(c);
5506 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5507 break;
5508 eptr+= len;
5509 }
5510 break;
5511
5512 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5513 which means that Perl space and POSIX space are now identical. PCRE
5514 was changed at release 8.34. */
5515
5516 case PT_SPACE: /* Perl space */
5517 case PT_PXSPACE: /* POSIX space */
5518 for (i = min; i < max; i++)
5519 {
5520 int len = 1;
5521 if (eptr >= md->end_subject)
5522 {
5523 SCHECK_PARTIAL();
5524 break;
5525 }
5526 GETCHARLENTEST(c, eptr, len);
5527 switch(c)
5528 {
5529 HSPACE_CASES:
5530 VSPACE_CASES:
5531 if (prop_fail_result) goto ENDLOOP99; /* Break the loop */
5532 break;
5533
5534 default:
5535 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5536 goto ENDLOOP99; /* Break the loop */
5537 break;
5538 }
5539 eptr+= len;
5540 }
5541 ENDLOOP99:
5542 break;
5543
5544 case PT_WORD:
5545 for (i = min; i < max; i++)
5546 {
5547 int category;
5548 int len = 1;
5549 if (eptr >= md->end_subject)
5550 {
5551 SCHECK_PARTIAL();
5552 break;
5553 }
5554 GETCHARLENTEST(c, eptr, len);
5555 category = UCD_CATEGORY(c);
5556 if ((category == ucp_L || category == ucp_N ||
5557 c == CHAR_UNDERSCORE) == prop_fail_result)
5558 break;
5559 eptr+= len;
5560 }
5561 break;
5562
5563 case PT_CLIST:
5564 for (i = min; i < max; i++)
5565 {
5566 const pcre_uint32 *cp;
5567 int len = 1;
5568 if (eptr >= md->end_subject)
5569 {
5570 SCHECK_PARTIAL();
5571 break;
5572 }
5573 GETCHARLENTEST(c, eptr, len);
5574 cp = PRIV(ucd_caseless_sets) + prop_value;
5575 for (;;)
5576 {
5577 if (c < *cp)
5578 { if (prop_fail_result) break; else goto GOT_MAX; }
5579 if (c == *cp++)
5580 { if (prop_fail_result) goto GOT_MAX; else break; }
5581 }
5582 eptr += len;
5583 }
5584 GOT_MAX:
5585 break;
5586
5587 case PT_UCNC:
5588 for (i = min; i < max; i++)
5589 {
5590 int len = 1;
5591 if (eptr >= md->end_subject)
5592 {
5593 SCHECK_PARTIAL();
5594 break;
5595 }
5596 GETCHARLENTEST(c, eptr, len);
5597 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5598 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5599 c >= 0xe000) == prop_fail_result)
5600 break;
5601 eptr += len;
5602 }
5603 break;
5604
5605 default:
5606 RRETURN(PCRE_ERROR_INTERNAL);
5607 }
5608
5609 /* eptr is now past the end of the maximum run */
5610
5611 if (possessive) continue; /* No backtracking */
5612 for(;;)
5613 {
5614 if (eptr <= pp) goto TAIL_RECURSE;
5615 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5616 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5617 eptr--;
5618 if (utf) BACKCHAR(eptr);
5619 }
5620 }
5621
5622 /* Match extended Unicode grapheme clusters. We will get here only if the
5623 support is in the binary; otherwise a compile-time error occurs. */
5624
5625 else if (ctype == OP_EXTUNI)
5626 {
5627 for (i = min; i < max; i++)
5628 {
5629 if (eptr >= md->end_subject)
5630 {
5631 SCHECK_PARTIAL();
5632 break;
5633 }
5634 else
5635 {
5636 int lgb, rgb;
5637 GETCHARINCTEST(c, eptr);
5638 lgb = UCD_GRAPHBREAK(c);
5639 while (eptr < md->end_subject)
5640 {
5641 int len = 1;
5642 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5643 rgb = UCD_GRAPHBREAK(c);
5644 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5645 lgb = rgb;
5646 eptr += len;
5647 }
5648 }
5649 CHECK_PARTIAL();
5650 }
5651
5652 /* eptr is now past the end of the maximum run */
5653
5654 if (possessive) continue; /* No backtracking */
5655
5656 /* We use <= pp rather than == pp to detect the start of the run while
5657 backtracking because the use of \C in UTF mode can cause BACKCHAR to
5658 move back past pp. This is just palliative; the use of \C in UTF mode
5659 is fraught with danger. */
5660
5661 for(;;)
5662 {
5663 int lgb, rgb;
5664 PCRE_PUCHAR fptr;
5665
5666 if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */
5667 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5668 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5669
5670 /* Backtracking over an extended grapheme cluster involves inspecting
5671 the previous two characters (if present) to see if a break is
5672 permitted between them. */
5673
5674 eptr--;
5675 if (!utf) c = *eptr; else
5676 {
5677 BACKCHAR(eptr);
5678 GETCHAR(c, eptr);
5679 }
5680 rgb = UCD_GRAPHBREAK(c);
5681
5682 for (;;)
5683 {
5684 if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */
5685 fptr = eptr - 1;
5686 if (!utf) c = *fptr; else
5687 {
5688 BACKCHAR(fptr);
5689 GETCHAR(c, fptr);
5690 }
5691 lgb = UCD_GRAPHBREAK(c);
5692 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5693 eptr = fptr;
5694 rgb = lgb;
5695 }
5696 }
5697 }
5698
5699 else
5700#endif /* SUPPORT_UCP */
5701
5702#ifdef SUPPORT_UTF
5703 if (utf)
5704 {
5705 switch(ctype)
5706 {
5707 case OP_ANY:
5708 for (i = min; i < max; i++)
5709 {
5710 if (eptr >= md->end_subject)
5711 {
5712 SCHECK_PARTIAL();
5713 break;
5714 }
5715 if (IS_NEWLINE(eptr)) break;
5716 if (md->partial != 0 && /* Take care with CRLF partial */
5717 eptr + 1 >= md->end_subject &&
5718 NLBLOCK->nltype == NLTYPE_FIXED &&
5719 NLBLOCK->nllen == 2 &&
5720 UCHAR21(eptr) == NLBLOCK->nl[0])
5721 {
5722 md->hitend = TRUE;
5723 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5724 }
5725 eptr++;
5726 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5727 }
5728 break;
5729
5730 case OP_ALLANY:
5731 if (max < INT_MAX)
5732 {
5733 for (i = min; i < max; i++)
5734 {
5735 if (eptr >= md->end_subject)
5736 {
5737 SCHECK_PARTIAL();
5738 break;
5739 }
5740 eptr++;
5741 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5742 }
5743 }
5744 else
5745 {
5746 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5747 SCHECK_PARTIAL();
5748 }
5749 break;
5750
5751 /* The byte case is the same as non-UTF8 */
5752
5753 case OP_ANYBYTE:
5754 c = max - min;
5755 if (c > (unsigned int)(md->end_subject - eptr))
5756 {
5757 eptr = md->end_subject;
5758 SCHECK_PARTIAL();
5759 }
5760 else eptr += c;
5761 break;
5762
5763 case OP_ANYNL:
5764 for (i = min; i < max; i++)
5765 {
5766 int len = 1;
5767 if (eptr >= md->end_subject)
5768 {
5769 SCHECK_PARTIAL();
5770 break;
5771 }
5772 GETCHARLEN(c, eptr, len);
5773 if (c == CHAR_CR)
5774 {
5775 if (++eptr >= md->end_subject) break;
5776 if (UCHAR21(eptr) == CHAR_LF) eptr++;
5777 }
5778 else
5779 {
5780 if (c != CHAR_LF &&
5781 (md->bsr_anycrlf ||
5782 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5783#ifndef EBCDIC
5784 && c != 0x2028 && c != 0x2029
5785#endif /* Not EBCDIC */
5786 )))
5787 break;
5788 eptr += len;
5789 }
5790 }
5791 break;
5792
5793 case OP_NOT_HSPACE:
5794 case OP_HSPACE:
5795 for (i = min; i < max; i++)
5796 {
5797 BOOL gotspace;
5798 int len = 1;
5799 if (eptr >= md->end_subject)
5800 {
5801 SCHECK_PARTIAL();
5802 break;
5803 }
5804 GETCHARLEN(c, eptr, len);
5805 switch(c)
5806 {
5807 HSPACE_CASES: gotspace = TRUE; break;
5808 default: gotspace = FALSE; break;
5809 }
5810 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5811 eptr += len;
5812 }
5813 break;
5814
5815 case OP_NOT_VSPACE:
5816 case OP_VSPACE:
5817 for (i = min; i < max; i++)
5818 {
5819 BOOL gotspace;
5820 int len = 1;
5821 if (eptr >= md->end_subject)
5822 {
5823 SCHECK_PARTIAL();
5824 break;
5825 }
5826 GETCHARLEN(c, eptr, len);
5827 switch(c)
5828 {
5829 VSPACE_CASES: gotspace = TRUE; break;
5830 default: gotspace = FALSE; break;
5831 }
5832 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5833 eptr += len;
5834 }
5835 break;
5836
5837 case OP_NOT_DIGIT:
5838 for (i = min; i < max; i++)
5839 {
5840 int len = 1;
5841 if (eptr >= md->end_subject)
5842 {
5843 SCHECK_PARTIAL();
5844 break;
5845 }
5846 GETCHARLEN(c, eptr, len);
5847 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5848 eptr+= len;
5849 }
5850 break;
5851
5852 case OP_DIGIT:
5853 for (i = min; i < max; i++)
5854 {
5855 int len = 1;
5856 if (eptr >= md->end_subject)
5857 {
5858 SCHECK_PARTIAL();
5859 break;
5860 }
5861 GETCHARLEN(c, eptr, len);
5862 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5863 eptr+= len;
5864 }
5865 break;
5866
5867 case OP_NOT_WHITESPACE:
5868 for (i = min; i < max; i++)
5869 {
5870 int len = 1;
5871 if (eptr >= md->end_subject)
5872 {
5873 SCHECK_PARTIAL();
5874 break;
5875 }
5876 GETCHARLEN(c, eptr, len);
5877 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5878 eptr+= len;
5879 }
5880 break;
5881
5882 case OP_WHITESPACE:
5883 for (i = min; i < max; i++)
5884 {
5885 int len = 1;
5886 if (eptr >= md->end_subject)
5887 {
5888 SCHECK_PARTIAL();
5889 break;
5890 }
5891 GETCHARLEN(c, eptr, len);
5892 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5893 eptr+= len;
5894 }
5895 break;
5896
5897 case OP_NOT_WORDCHAR:
5898 for (i = min; i < max; i++)
5899 {
5900 int len = 1;
5901 if (eptr >= md->end_subject)
5902 {
5903 SCHECK_PARTIAL();
5904 break;
5905 }
5906 GETCHARLEN(c, eptr, len);
5907 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5908 eptr+= len;
5909 }
5910 break;
5911
5912 case OP_WORDCHAR:
5913 for (i = min; i < max; i++)
5914 {
5915 int len = 1;
5916 if (eptr >= md->end_subject)
5917 {
5918 SCHECK_PARTIAL();
5919 break;
5920 }
5921 GETCHARLEN(c, eptr, len);
5922 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5923 eptr+= len;
5924 }
5925 break;
5926
5927 default:
5928 RRETURN(PCRE_ERROR_INTERNAL);
5929 }
5930
5931 if (possessive) continue; /* No backtracking */
5932 for(;;)
5933 {
5934 if (eptr <= pp) goto TAIL_RECURSE;
5935 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5936 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5937 eptr--;
5938 BACKCHAR(eptr);
5939 if (ctype == OP_ANYNL && eptr > pp && UCHAR21(eptr) == CHAR_NL &&
5940 UCHAR21(eptr - 1) == CHAR_CR) eptr--;
5941 }
5942 }
5943 else
5944#endif /* SUPPORT_UTF */
5945 /* Not UTF mode */
5946 {
5947 switch(ctype)
5948 {
5949 case OP_ANY:
5950 for (i = min; i < max; i++)
5951 {
5952 if (eptr >= md->end_subject)
5953 {
5954 SCHECK_PARTIAL();
5955 break;
5956 }
5957 if (IS_NEWLINE(eptr)) break;
5958 if (md->partial != 0 && /* Take care with CRLF partial */
5959 eptr + 1 >= md->end_subject &&
5960 NLBLOCK->nltype == NLTYPE_FIXED &&
5961 NLBLOCK->nllen == 2 &&
5962 *eptr == NLBLOCK->nl[0])
5963 {
5964 md->hitend = TRUE;
5965 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5966 }
5967 eptr++;
5968 }
5969 break;
5970
5971 case OP_ALLANY:
5972 case OP_ANYBYTE:
5973 c = max - min;
5974 if (c > (unsigned int)(md->end_subject - eptr))
5975 {
5976 eptr = md->end_subject;
5977 SCHECK_PARTIAL();
5978 }
5979 else eptr += c;
5980 break;
5981
5982 case OP_ANYNL:
5983 for (i = min; i < max; i++)
5984 {
5985 if (eptr >= md->end_subject)
5986 {
5987 SCHECK_PARTIAL();
5988 break;
5989 }
5990 c = *eptr;
5991 if (c == CHAR_CR)
5992 {
5993 if (++eptr >= md->end_subject) break;
5994 if (*eptr == CHAR_LF) eptr++;
5995 }
5996 else
5997 {
5998 if (c != CHAR_LF && (md->bsr_anycrlf ||
5999 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
6000#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6001 && c != 0x2028 && c != 0x2029
6002#endif
6003 ))) break;
6004 eptr++;
6005 }
6006 }
6007 break;
6008
6009 case OP_NOT_HSPACE:
6010 for (i = min; i < max; i++)
6011 {
6012 if (eptr >= md->end_subject)
6013 {
6014 SCHECK_PARTIAL();
6015 break;
6016 }
6017 switch(*eptr)
6018 {
6019 default: eptr++; break;
6020 HSPACE_BYTE_CASES:
6021#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6022 HSPACE_MULTIBYTE_CASES:
6023#endif
6024 goto ENDLOOP00;
6025 }
6026 }
6027 ENDLOOP00:
6028 break;
6029
6030 case OP_HSPACE:
6031 for (i = min; i < max; i++)
6032 {
6033 if (eptr >= md->end_subject)
6034 {
6035 SCHECK_PARTIAL();
6036 break;
6037 }
6038 switch(*eptr)
6039 {
6040 default: goto ENDLOOP01;
6041 HSPACE_BYTE_CASES:
6042#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6043 HSPACE_MULTIBYTE_CASES:
6044#endif
6045 eptr++; break;
6046 }
6047 }
6048 ENDLOOP01:
6049 break;
6050
6051 case OP_NOT_VSPACE:
6052 for (i = min; i < max; i++)
6053 {
6054 if (eptr >= md->end_subject)
6055 {
6056 SCHECK_PARTIAL();
6057 break;
6058 }
6059 switch(*eptr)
6060 {
6061 default: eptr++; break;
6062 VSPACE_BYTE_CASES:
6063#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6064 VSPACE_MULTIBYTE_CASES:
6065#endif
6066 goto ENDLOOP02;
6067 }
6068 }
6069 ENDLOOP02:
6070 break;
6071
6072 case OP_VSPACE:
6073 for (i = min; i < max; i++)
6074 {
6075 if (eptr >= md->end_subject)
6076 {
6077 SCHECK_PARTIAL();
6078 break;
6079 }
6080 switch(*eptr)
6081 {
6082 default: goto ENDLOOP03;
6083 VSPACE_BYTE_CASES:
6084#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6085 VSPACE_MULTIBYTE_CASES:
6086#endif
6087 eptr++; break;
6088 }
6089 }
6090 ENDLOOP03:
6091 break;
6092
6093 case OP_NOT_DIGIT:
6094 for (i = min; i < max; i++)
6095 {
6096 if (eptr >= md->end_subject)
6097 {
6098 SCHECK_PARTIAL();
6099 break;
6100 }
6101 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6102 eptr++;
6103 }
6104 break;
6105
6106 case OP_DIGIT:
6107 for (i = min; i < max; i++)
6108 {
6109 if (eptr >= md->end_subject)
6110 {
6111 SCHECK_PARTIAL();
6112 break;
6113 }
6114 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6115 eptr++;
6116 }
6117 break;
6118
6119 case OP_NOT_WHITESPACE:
6120 for (i = min; i < max; i++)
6121 {
6122 if (eptr >= md->end_subject)
6123 {
6124 SCHECK_PARTIAL();
6125 break;
6126 }
6127 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6128 eptr++;
6129 }
6130 break;
6131
6132 case OP_WHITESPACE:
6133 for (i = min; i < max; i++)
6134 {
6135 if (eptr >= md->end_subject)
6136 {
6137 SCHECK_PARTIAL();
6138 break;
6139 }
6140 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6141 eptr++;
6142 }
6143 break;
6144
6145 case OP_NOT_WORDCHAR:
6146 for (i = min; i < max; i++)
6147 {
6148 if (eptr >= md->end_subject)
6149 {
6150 SCHECK_PARTIAL();
6151 break;
6152 }
6153 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6154 eptr++;
6155 }
6156 break;
6157
6158 case OP_WORDCHAR:
6159 for (i = min; i < max; i++)
6160 {
6161 if (eptr >= md->end_subject)
6162 {
6163 SCHECK_PARTIAL();
6164 break;
6165 }
6166 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6167 eptr++;
6168 }
6169 break;
6170
6171 default:
6172 RRETURN(PCRE_ERROR_INTERNAL);
6173 }
6174
6175 if (possessive) continue; /* No backtracking */
6176 for (;;)
6177 {
6178 if (eptr == pp) goto TAIL_RECURSE;
6179 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6180 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6181 eptr--;
6182 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6183 eptr[-1] == CHAR_CR) eptr--;
6184 }
6185 }
6186
6187 /* Control never gets here */
6188 }
6189
6190 /* There's been some horrible disaster. Arrival here can only mean there is
6191 something seriously wrong in the code above or the OP_xxx definitions. */
6192
6193 default:
6194 DPRINTF(("Unknown opcode %d\n", *ecode));
6195 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6196 }
6197
6198 /* Do not stick any code in here without much thought; it is assumed
6199 that "continue" in the code above comes out to here to repeat the main
6200 loop. */
6201
6202 } /* End of main loop */
6203/* Control never reaches here */
6204
6205
6206/* When compiling to use the heap rather than the stack for recursive calls to
6207match(), the RRETURN() macro jumps here. The number that is saved in
6208frame->Xwhere indicates which label we actually want to return to. */
6209
6210#ifdef NO_RECURSE
6211#define LBL(val) case val: goto L_RM##val;
6212HEAP_RETURN:
6213switch (frame->Xwhere)
6214 {
6215 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6216 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6217 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6218 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6219 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6220 LBL(65) LBL(66)
6221#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6222 LBL(20) LBL(21)
6223#endif
6224#ifdef SUPPORT_UTF
6225 LBL(16) LBL(18)
6226 LBL(22) LBL(23) LBL(28) LBL(30)
6227 LBL(32) LBL(34) LBL(42) LBL(46)
6228#ifdef SUPPORT_UCP
6229 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6230 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67)
6231#endif /* SUPPORT_UCP */
6232#endif /* SUPPORT_UTF */
6233 default:
6234 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6235 return PCRE_ERROR_INTERNAL;
6236 }
6237#undef LBL
6238#endif /* NO_RECURSE */
6239}
6240
6241
6242/***************************************************************************
6243****************************************************************************
6244 RECURSION IN THE match() FUNCTION
6245
6246Undefine all the macros that were defined above to handle this. */
6247
6248#ifdef NO_RECURSE
6249#undef eptr
6250#undef ecode
6251#undef mstart
6252#undef offset_top
6253#undef eptrb
6254#undef flags
6255
6256#undef callpat
6257#undef charptr
6258#undef data
6259#undef next
6260#undef pp
6261#undef prev
6262#undef saved_eptr
6263
6264#undef new_recursive
6265
6266#undef cur_is_word
6267#undef condition
6268#undef prev_is_word
6269
6270#undef ctype
6271#undef length
6272#undef max
6273#undef min
6274#undef number
6275#undef offset
6276#undef op
6277#undef save_capture_last
6278#undef save_offset1
6279#undef save_offset2
6280#undef save_offset3
6281#undef stacksave
6282
6283#undef newptrb
6284
6285#endif
6286
6287/* These two are defined as macros in both cases */
6288
6289#undef fc
6290#undef fi
6291
6292/***************************************************************************
6293***************************************************************************/
6294
6295
6296#ifdef NO_RECURSE
6297/*************************************************
6298* Release allocated heap frames *
6299*************************************************/
6300
6301/* This function releases all the allocated frames. The base frame is on the
6302machine stack, and so must not be freed.
6303
6304Argument: the address of the base frame
6305Returns: nothing
6306*/
6307
6308static void
6309release_match_heapframes (heapframe *frame_base)
6310{
6311heapframe *nextframe = frame_base->Xnextframe;
6312while (nextframe != NULL)
6313 {
6314 heapframe *oldframe = nextframe;
6315 nextframe = nextframe->Xnextframe;
6316 (PUBL(stack_free))(oldframe);
6317 }
6318}
6319#endif
6320
6321
6322/*************************************************
6323* Execute a Regular Expression *
6324*************************************************/
6325
6326/* This function applies a compiled re to a subject string and picks out
6327portions of the string if it matches. Two elements in the vector are set for
6328each substring: the offsets to the start and end of the substring.
6329
6330Arguments:
6331 argument_re points to the compiled expression
6332 extra_data points to extra data or is NULL
6333 subject points to the subject string
6334 length length of subject string (may contain binary zeros)
6335 start_offset where to start in the subject string
6336 options option bits
6337 offsets points to a vector of ints to be filled in with offsets
6338 offsetcount the number of elements in the vector
6339
6340Returns: > 0 => success; value is the number of elements filled in
6341 = 0 => success, but offsets is not big enough
6342 -1 => failed to match
6343 < -1 => some kind of unexpected problem
6344*/
6345
6346#if defined COMPILE_PCRE8
6347PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6348pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6349 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6350 int offsetcount)
6351#elif defined COMPILE_PCRE16
6352PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6353pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6354 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6355 int offsetcount)
6356#elif defined COMPILE_PCRE32
6357PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6358pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6359 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6360 int offsetcount)
6361#endif
6362{
6363int rc, ocount, arg_offset_max;
6364int newline;
6365BOOL using_temporary_offsets = FALSE;
6366BOOL anchored;
6367BOOL startline;
6368BOOL firstline;
6369BOOL utf;
6370BOOL has_first_char = FALSE;
6371BOOL has_req_char = FALSE;
6372pcre_uchar first_char = 0;
6373pcre_uchar first_char2 = 0;
6374pcre_uchar req_char = 0;
6375pcre_uchar req_char2 = 0;
6376match_data match_block;
6377match_data *md = &match_block;
6378const pcre_uint8 *tables;
6379const pcre_uint8 *start_bits = NULL;
6380PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6381PCRE_PUCHAR end_subject;
6382PCRE_PUCHAR start_partial = NULL;
6383PCRE_PUCHAR match_partial = NULL;
6384PCRE_PUCHAR req_char_ptr = start_match - 1;
6385
6386const pcre_study_data *study;
6387const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6388
6389#ifdef NO_RECURSE
6390heapframe frame_zero;
6391frame_zero.Xprevframe = NULL; /* Marks the top level */
6392frame_zero.Xnextframe = NULL; /* None are allocated yet */
6393md->match_frames_base = &frame_zero;
6394#endif
6395
6396/* Check for the special magic call that measures the size of the stack used
6397per recursive call of match(). Without the funny casting for sizeof, a Windows
6398compiler gave this error: "unary minus operator applied to unsigned type,
6399result still unsigned". Hopefully the cast fixes that. */
6400
6401if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6402 start_offset == -999)
6403#ifdef NO_RECURSE
6404 return -((int)sizeof(heapframe));
6405#else
6406 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6407#endif
6408
6409/* Plausibility checks */
6410
6411if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6412if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6413 return PCRE_ERROR_NULL;
6414if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6415if (length < 0) return PCRE_ERROR_BADLENGTH;
6416if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6417
6418/* Check that the first field in the block is the magic number. If it is not,
6419return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6420REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6421means that the pattern is likely compiled with different endianness. */
6422
6423if (re->magic_number != MAGIC_NUMBER)
6424 return re->magic_number == REVERSED_MAGIC_NUMBER?
6425 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6426if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6427
6428/* These two settings are used in the code for checking a UTF-8 string that
6429follows immediately afterwards. Other values in the md block are used only
6430during "normal" pcre_exec() processing, not when the JIT support is in use,
6431so they are set up later. */
6432
6433/* PCRE_UTF16 has the same value as PCRE_UTF8. */
6434utf = md->utf = (re->options & PCRE_UTF8) != 0;
6435md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6436 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6437
6438/* Check a UTF-8 string if required. Pass back the character offset and error
6439code for an invalid string if a results vector is available. */
6440
6441#ifdef SUPPORT_UTF
6442if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6443 {
6444 int erroroffset;
6445 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6446 if (errorcode != 0)
6447 {
6448 if (offsetcount >= 2)
6449 {
6450 offsets[0] = erroroffset;
6451 offsets[1] = errorcode;
6452 }
6453#if defined COMPILE_PCRE8
6454 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6455 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6456#elif defined COMPILE_PCRE16
6457 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6458 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6459#elif defined COMPILE_PCRE32
6460 return PCRE_ERROR_BADUTF32;
6461#endif
6462 }
6463#if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6464 /* Check that a start_offset points to the start of a UTF character. */
6465 if (start_offset > 0 && start_offset < length &&
6466 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6467 return PCRE_ERROR_BADUTF8_OFFSET;
6468#endif
6469 }
6470#endif
6471
/* If the pattern was successfully studied with JIT support, run the JIT
executable instead of the rest of this function. Most options must be set at
compile time for the JIT code to be usable. Fall back to the normal code path
if an unsupported flag is set. */
6476
6477#ifdef SUPPORT_JIT
6478if (extra_data != NULL
6479 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6480 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6481 && extra_data->executable_jit != NULL
6482 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6483 {
6484 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6485 start_offset, options, offsets, offsetcount);
6486
  /* A return of PCRE_ERROR_JIT_BADOPTION means that the selected normal or
  partial matching mode is not compiled. In this case we simply fall back to
  the interpreter. */
6489
6490 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6491 }
6492#endif
6493
6494/* Carry on with non-JIT matching. This information is for finding all the
6495numbers associated with a given name, for condition testing. */
6496
6497md->name_table = (pcre_uchar *)re + re->name_table_offset;
6498md->name_count = re->name_count;
6499md->name_entry_size = re->name_entry_size;
6500
6501/* Fish out the optional data from the extra_data structure, first setting
6502the default values. */
6503
6504study = NULL;
6505md->match_limit = MATCH_LIMIT;
6506md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6507md->callout_data = NULL;
6508
6509/* The table pointer is always in native byte order. */
6510
6511tables = re->tables;
6512
6513/* The two limit values override the defaults, whatever their value. */
6514
6515if (extra_data != NULL)
6516 {
6517 unsigned long int flags = extra_data->flags;
6518 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6519 study = (const pcre_study_data *)extra_data->study_data;
6520 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6521 md->match_limit = extra_data->match_limit;
6522 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6523 md->match_limit_recursion = extra_data->match_limit_recursion;
6524 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6525 md->callout_data = extra_data->callout_data;
6526 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6527 }
6528
6529/* Limits in the regex override only if they are smaller. */
6530
6531if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit)
6532 md->match_limit = re->limit_match;
6533
6534if ((re->flags & PCRE_RLSET) != 0 &&
6535 re->limit_recursion < md->match_limit_recursion)
6536 md->match_limit_recursion = re->limit_recursion;
6537
6538/* If the exec call supplied NULL for tables, use the inbuilt ones. This
6539is a feature that makes it possible to save compiled regex and re-use them
6540in other programs later. */
6541
6542if (tables == NULL) tables = PRIV(default_tables);
6543
6544/* Set up other data */
6545
6546anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6547startline = (re->flags & PCRE_STARTLINE) != 0;
6548firstline = (re->options & PCRE_FIRSTLINE) != 0;
6549
6550/* The code starts after the real_pcre block and the capture name table. */
6551
6552md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6553 re->name_count * re->name_entry_size;
6554
6555md->start_subject = (PCRE_PUCHAR)subject;
6556md->start_offset = start_offset;
6557md->end_subject = md->start_subject + length;
6558end_subject = md->end_subject;
6559
6560md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6561md->use_ucp = (re->options & PCRE_UCP) != 0;
6562md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6563md->ignore_skip_arg = 0;
6564
6565/* Some options are unpacked into BOOL variables in the hope that testing
6566them will be faster than individual option bits. */
6567
6568md->notbol = (options & PCRE_NOTBOL) != 0;
6569md->noteol = (options & PCRE_NOTEOL) != 0;
6570md->notempty = (options & PCRE_NOTEMPTY) != 0;
6571md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6572
6573md->hitend = FALSE;
6574md->mark = md->nomatch_mark = NULL; /* In case never set */
6575
6576md->recursive = NULL; /* No recursion at top level */
6577md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6578
6579md->lcc = tables + lcc_offset;
6580md->fcc = tables + fcc_offset;
6581md->ctypes = tables + ctypes_offset;
6582
6583/* Handle different \R options. */
6584
6585switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6586 {
6587 case 0:
6588 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6589 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6590 else
6591#ifdef BSR_ANYCRLF
6592 md->bsr_anycrlf = TRUE;
6593#else
6594 md->bsr_anycrlf = FALSE;
6595#endif
6596 break;
6597
6598 case PCRE_BSR_ANYCRLF:
6599 md->bsr_anycrlf = TRUE;
6600 break;
6601
6602 case PCRE_BSR_UNICODE:
6603 md->bsr_anycrlf = FALSE;
6604 break;
6605
6606 default: return PCRE_ERROR_BADNEWLINE;
6607 }
6608
6609/* Handle different types of newline. The three bits give eight cases. If
6610nothing is set at run time, whatever was used at compile time applies. */
6611
6612switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6613 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6614 {
6615 case 0: newline = NEWLINE; break; /* Compile-time default */
6616 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6617 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6618 case PCRE_NEWLINE_CR+
6619 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6620 case PCRE_NEWLINE_ANY: newline = -1; break;
6621 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6622 default: return PCRE_ERROR_BADNEWLINE;
6623 }
6624
6625if (newline == -2)
6626 {
6627 md->nltype = NLTYPE_ANYCRLF;
6628 }
6629else if (newline < 0)
6630 {
6631 md->nltype = NLTYPE_ANY;
6632 }
6633else
6634 {
6635 md->nltype = NLTYPE_FIXED;
6636 if (newline > 255)
6637 {
6638 md->nllen = 2;
6639 md->nl[0] = (newline >> 8) & 255;
6640 md->nl[1] = newline & 255;
6641 }
6642 else
6643 {
6644 md->nllen = 1;
6645 md->nl[0] = newline;
6646 }
6647 }
6648
6649/* Partial matching was originally supported only for a restricted set of
6650regexes; from release 8.00 there are no restrictions, but the bits are still
6651defined (though never set). So there's no harm in leaving this code. */
6652
6653if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6654 return PCRE_ERROR_BADPARTIAL;
6655
6656/* If the expression has got more back references than the offsets supplied can
6657hold, we get a temporary chunk of working store to use during the matching.
6658Otherwise, we can use the vector supplied, rounding down its size to a multiple
6659of 3. */
6660
6661ocount = offsetcount - (offsetcount % 3);
6662arg_offset_max = (2*ocount)/3;
6663
6664if (re->top_backref > 0 && re->top_backref >= ocount/3)
6665 {
6666 ocount = re->top_backref * 3 + 3;
6667 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6668 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6669 using_temporary_offsets = TRUE;
6670 DPRINTF(("Got memory to hold back references\n"));
6671 }
6672else md->offset_vector = offsets;
6673md->offset_end = ocount;
6674md->offset_max = (2*ocount)/3;
6675md->capture_last = 0;
6676
6677/* Reset the working variable associated with each extraction. These should
6678never be used unless previously set, but they get saved and restored, and so we
6679initialize them to avoid reading uninitialized locations. Also, unset the
6680offsets for the matched string. This is really just for tidiness with callouts,
6681in case they inspect these fields. */
6682
6683if (md->offset_vector != NULL)
6684 {
6685 register int *iptr = md->offset_vector + ocount;
6686 register int *iend = iptr - re->top_bracket;
6687 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6688 while (--iptr >= iend) *iptr = -1;
6689 if (offsetcount > 0) md->offset_vector[0] = -1;
6690 if (offsetcount > 1) md->offset_vector[1] = -1;
6691 }
6692
6693/* Set up the first character to match, if available. The first_char value is
6694never set for an anchored regular expression, but the anchoring may be forced
6695at run time, so we have to test for anchoring. The first char may be unset for
6696an unanchored pattern, of course. If there's no first char and the pattern was
6697studied, there may be a bitmap of possible first characters. */
6698
6699if (!anchored)
6700 {
6701 if ((re->flags & PCRE_FIRSTSET) != 0)
6702 {
6703 has_first_char = TRUE;
6704 first_char = first_char2 = (pcre_uchar)(re->first_char);
6705 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6706 {
6707 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6708#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6709 if (utf && first_char > 127)
6710 first_char2 = UCD_OTHERCASE(first_char);
6711#endif
6712 }
6713 }
6714 else
6715 if (!startline && study != NULL &&
6716 (study->flags & PCRE_STUDY_MAPPED) != 0)
6717 start_bits = study->start_bits;
6718 }
6719
6720/* For anchored or unanchored matches, there may be a "last known required
6721character" set. */
6722
6723if ((re->flags & PCRE_REQCHSET) != 0)
6724 {
6725 has_req_char = TRUE;
6726 req_char = req_char2 = (pcre_uchar)(re->req_char);
6727 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6728 {
6729 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6730#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6731 if (utf && req_char > 127)
6732 req_char2 = UCD_OTHERCASE(req_char);
6733#endif
6734 }
6735 }
6736
6737
6738/* ==========================================================================*/
6739
/* Loop for handling unanchored repeated matching attempts; for anchored
regexes the loop runs just once. */
6742
6743for(;;)
6744 {
6745 PCRE_PUCHAR save_end_subject = end_subject;
6746 PCRE_PUCHAR new_start_match;
6747
6748 /* If firstline is TRUE, the start of the match is constrained to the first
6749 line of a multiline string. That is, the match must be before or at the first
6750 newline. Implement this by temporarily adjusting end_subject so that we stop
6751 scanning at a newline. If the match fails at the newline, later code breaks
6752 this loop. */
6753
6754 if (firstline)
6755 {
6756 PCRE_PUCHAR t = start_match;
6757#ifdef SUPPORT_UTF
6758 if (utf)
6759 {
6760 while (t < md->end_subject && !IS_NEWLINE(t))
6761 {
6762 t++;
6763 ACROSSCHAR(t < end_subject, *t, t++);
6764 }
6765 }
6766 else
6767#endif
6768 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6769 end_subject = t;
6770 }
6771
6772 /* There are some optimizations that avoid running the match if a known
6773 starting point is not found, or if a known later character is not present.
6774 However, there is an option that disables these, for testing and for ensuring
6775 that all callouts do actually occur. The option can be set in the regex by
6776 (*NO_START_OPT) or passed in match-time options. */
6777
6778 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6779 {
6780 /* Advance to a unique first char if there is one. */
6781
6782 if (has_first_char)
6783 {
6784 pcre_uchar smc;
6785
6786 if (first_char != first_char2)
6787 while (start_match < end_subject &&
6788 (smc = UCHAR21TEST(start_match)) != first_char && smc != first_char2)
6789 start_match++;
6790 else
6791 while (start_match < end_subject && UCHAR21TEST(start_match) != first_char)
6792 start_match++;
6793 }
6794
6795 /* Or to just after a linebreak for a multiline match */
6796
6797 else if (startline)
6798 {
6799 if (start_match > md->start_subject + start_offset)
6800 {
6801#ifdef SUPPORT_UTF
6802 if (utf)
6803 {
6804 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6805 {
6806 start_match++;
6807 ACROSSCHAR(start_match < end_subject, *start_match,
6808 start_match++);
6809 }
6810 }
6811 else
6812#endif
6813 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6814 start_match++;
6815
6816 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6817 and we are now at a LF, advance the match position by one more character.
6818 */
6819
6820 if (start_match[-1] == CHAR_CR &&
6821 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6822 start_match < end_subject &&
6823 UCHAR21TEST(start_match) == CHAR_NL)
6824 start_match++;
6825 }
6826 }
6827
6828 /* Or to a non-unique first byte after study */
6829
6830 else if (start_bits != NULL)
6831 {
6832 while (start_match < end_subject)
6833 {
6834 register pcre_uint32 c = UCHAR21TEST(start_match);
6835#ifndef COMPILE_PCRE8
6836 if (c > 255) c = 255;
6837#endif
6838 if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
6839 start_match++;
6840 }
6841 }
6842 } /* Starting optimizations */
6843
6844 /* Restore fudged end_subject */
6845
6846 end_subject = save_end_subject;
6847
6848 /* The following two optimizations are disabled for partial matching or if
6849 disabling is explicitly requested. */
6850
6851 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6852 {
6853 /* If the pattern was studied, a minimum subject length may be set. This is
6854 a lower bound; there may be no string of that length that actually matches the
6855 pattern. Although the value is, strictly, in characters, we treat it as
6856 bytes to avoid spending too much time in this optimization. */
6857
6858 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6859 (pcre_uint32)(end_subject - start_match) < study->minlength)
6860 {
6861 rc = MATCH_NOMATCH;
6862 break;
6863 }
6864
6865 /* If req_char is set, we know that that character must appear in the
6866 subject for the match to succeed. If the first character is set, req_char
6867 must be later in the subject; otherwise the test starts at the match point.
6868 This optimization can save a huge amount of backtracking in patterns with
6869 nested unlimited repeats that aren't going to match. Writing separate code
6870 for cased/caseless versions makes it go faster, as does using an
6871 autoincrement and backing off on a match.
6872
6873 HOWEVER: when the subject string is very, very long, searching to its end
6874 can take a long time, and give bad performance on quite ordinary patterns.
6875 This showed up when somebody was matching something like /^\d+C/ on a
6876 32-megabyte string... so we don't do this when the string is sufficiently
6877 long. */
6878
6879 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6880 {
6881 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6882
6883 /* We don't need to repeat the search if we haven't yet reached the
6884 place we found it at last time. */
6885
6886 if (p > req_char_ptr)
6887 {
6888 if (req_char != req_char2)
6889 {
6890 while (p < end_subject)
6891 {
6892 register pcre_uint32 pp = UCHAR21INCTEST(p);
6893 if (pp == req_char || pp == req_char2) { p--; break; }
6894 }
6895 }
6896 else
6897 {
6898 while (p < end_subject)
6899 {
6900 if (UCHAR21INCTEST(p) == req_char) { p--; break; }
6901 }
6902 }
6903
6904 /* If we can't find the required character, break the matching loop,
6905 forcing a match failure. */
6906
6907 if (p >= end_subject)
6908 {
6909 rc = MATCH_NOMATCH;
6910 break;
6911 }
6912
6913 /* If we have found the required character, save the point where we
6914 found it, so that we don't search again next time round the loop if
6915 the start hasn't passed this character yet. */
6916
6917 req_char_ptr = p;
6918 }
6919 }
6920 }
6921
6922#ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6923 printf(">>>> Match against: ");
6924 pchars(start_match, end_subject - start_match, TRUE, md);
6925 printf("\n");
6926#endif
6927
6928 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6929 first starting point for which a partial match was found. */
6930
6931 md->start_match_ptr = start_match;
6932 md->start_used_ptr = start_match;
6933 md->match_call_count = 0;
6934 md->match_function_type = 0;
6935 md->end_offset_top = 0;
6936 md->skip_arg_count = 0;
6937 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
6938 if (md->hitend && start_partial == NULL)
6939 {
6940 start_partial = md->start_used_ptr;
6941 match_partial = start_match;
6942 }
6943
6944 switch(rc)
6945 {
6946 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6947 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6948 entirely. The only way we can do that is to re-do the match at the same
6949 point, with a flag to force SKIP with an argument to be ignored. Just
6950 treating this case as NOMATCH does not work because it does not check other
6951 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6952
6953 case MATCH_SKIP_ARG:
6954 new_start_match = start_match;
6955 md->ignore_skip_arg = md->skip_arg_count;
6956 break;
6957
6958 /* SKIP passes back the next starting point explicitly, but if it is no
6959 greater than the match we have just done, treat it as NOMATCH. */
6960
6961 case MATCH_SKIP:
6962 if (md->start_match_ptr > start_match)
6963 {
6964 new_start_match = md->start_match_ptr;
6965 break;
6966 }
6967 /* Fall through */
6968
6969 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6970 exactly like PRUNE. Unset ignore SKIP-with-argument. */
6971
6972 case MATCH_NOMATCH:
6973 case MATCH_PRUNE:
6974 case MATCH_THEN:
6975 md->ignore_skip_arg = 0;
6976 new_start_match = start_match + 1;
6977#ifdef SUPPORT_UTF
6978 if (utf)
6979 ACROSSCHAR(new_start_match < end_subject, *new_start_match,
6980 new_start_match++);
6981#endif
6982 break;
6983
6984 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6985
6986 case MATCH_COMMIT:
6987 rc = MATCH_NOMATCH;
6988 goto ENDLOOP;
6989
6990 /* Any other return is either a match, or some kind of error. */
6991
6992 default:
6993 goto ENDLOOP;
6994 }
6995
6996 /* Control reaches here for the various types of "no match at this point"
6997 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6998
6999 rc = MATCH_NOMATCH;
7000
7001 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
7002 newline in the subject (though it may continue over the newline). Therefore,
7003 if we have just failed to match, starting at a newline, do not continue. */
7004
7005 if (firstline && IS_NEWLINE(start_match)) break;
7006
7007 /* Advance to new matching position */
7008
7009 start_match = new_start_match;
7010
7011 /* Break the loop if the pattern is anchored or if we have passed the end of
7012 the subject. */
7013
7014 if (anchored || start_match > end_subject) break;
7015
7016 /* If we have just passed a CR and we are now at a LF, and the pattern does
7017 not contain any explicit matches for \r or \n, and the newline option is CRLF
7018 or ANY or ANYCRLF, advance the match position by one more character. In
7019 normal matching start_match will always be greater than the first position at
7020 this stage, but a failed *SKIP can cause a return at the same point, which is
7021 why the first test exists. */
7022
7023 if (start_match > (PCRE_PUCHAR)subject + start_offset &&
7024 start_match[-1] == CHAR_CR &&
7025 start_match < end_subject &&
7026 *start_match == CHAR_NL &&
7027 (re->flags & PCRE_HASCRORLF) == 0 &&
7028 (md->nltype == NLTYPE_ANY ||
7029 md->nltype == NLTYPE_ANYCRLF ||
7030 md->nllen == 2))
7031 start_match++;
7032
7033 md->mark = NULL; /* Reset for start of next match attempt */
7034 } /* End of for(;;) "bumpalong" loop */
7035
7036/* ==========================================================================*/
7037
7038/* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
7039conditions is true:
7040
7041(1) The pattern is anchored or the match was failed by (*COMMIT);
7042
7043(2) We are past the end of the subject;
7044
7045(3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
7046 this option requests that a match occur at or before the first newline in
7047 the subject.
7048
7049When we have a match and the offset vector is big enough to deal with any
7050backreferences, captured substring offsets will already be set up. In the case
7051where we had to get some local store to hold offsets for backreference
7052processing, copy those that we can. In this case there need not be overflow if
7053certain parts of the pattern were not used, even though there are more
7054capturing parentheses than vector slots. */
7055
7056ENDLOOP:
7057
7058if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
7059 {
7060 if (using_temporary_offsets)
7061 {
7062 if (arg_offset_max >= 4)
7063 {
7064 memcpy(offsets + 2, md->offset_vector + 2,
7065 (arg_offset_max - 2) * sizeof(int));
7066 DPRINTF(("Copied offsets from temporary memory\n"));
7067 }
7068 if (md->end_offset_top > arg_offset_max) md->capture_last |= OVFLBIT;
7069 DPRINTF(("Freeing temporary memory\n"));
7070 (PUBL(free))(md->offset_vector);
7071 }
7072
7073 /* Set the return code to the number of captured strings, or 0 if there were
7074 too many to fit into the vector. */
7075
7076 rc = ((md->capture_last & OVFLBIT) != 0 &&
7077 md->end_offset_top >= arg_offset_max)?
7078 0 : md->end_offset_top/2;
7079
7080 /* If there is space in the offset vector, set any unused pairs at the end of
7081 the pattern to -1 for backwards compatibility. It is documented that this
7082 happens. In earlier versions, the whole set of potential capturing offsets
7083 was set to -1 each time round the loop, but this is handled differently now.
7084 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
7085 those at the end that need unsetting here. We can't just unset them all at
7086 the start of the whole thing because they may get set in one branch that is
7087 not the final matching branch. */
7088
7089 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
7090 {
7091 register int *iptr, *iend;
7092 int resetcount = 2 + re->top_bracket * 2;
7093 if (resetcount > offsetcount) resetcount = offsetcount;
7094 iptr = offsets + md->end_offset_top;
7095 iend = offsets + resetcount;
7096 while (iptr < iend) *iptr++ = -1;
7097 }
7098
7099 /* If there is space, set up the whole thing as substring 0. The value of
7100 md->start_match_ptr might be modified if \K was encountered on the successful
7101 matching path. */
7102
7103 if (offsetcount < 2) rc = 0; else
7104 {
7105 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
7106 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
7107 }
7108
7109 /* Return MARK data if requested */
7110
7111 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
7112 *(extra_data->mark) = (pcre_uchar *)md->mark;
7113 DPRINTF((">>>> returning %d\n", rc));
7114#ifdef NO_RECURSE
7115 release_match_heapframes(&frame_zero);
7116#endif
7117 return rc;
7118 }
7119
7120/* Control gets here if there has been an error, or if the overall match
7121attempt has failed at all permitted starting positions. */
7122
7123if (using_temporary_offsets)
7124 {
7125 DPRINTF(("Freeing temporary memory\n"));
7126 (PUBL(free))(md->offset_vector);
7127 }
7128
7129/* For anything other than nomatch or partial match, just return the code. */
7130
7131if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
7132 {
7133 DPRINTF((">>>> error: returning %d\n", rc));
7134#ifdef NO_RECURSE
7135 release_match_heapframes(&frame_zero);
7136#endif
7137 return rc;
7138 }
7139
7140/* Handle partial matches - disable any mark data */
7141
7142if (match_partial != NULL)
7143 {
7144 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
7145 md->mark = NULL;
7146 if (offsetcount > 1)
7147 {
7148 offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject);
7149 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
7150 if (offsetcount > 2)
7151 offsets[2] = (int)(match_partial - (PCRE_PUCHAR)subject);
7152 }
7153 rc = PCRE_ERROR_PARTIAL;
7154 }
7155
7156/* This is the classic nomatch case */
7157
7158else
7159 {
7160 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
7161 rc = PCRE_ERROR_NOMATCH;
7162 }
7163
7164/* Return the MARK data if it has been requested. */
7165
7166if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
7167 *(extra_data->mark) = (pcre_uchar *)md->nomatch_mark;
7168#ifdef NO_RECURSE
7169 release_match_heapframes(&frame_zero);
7170#endif
7171return rc;
7172}
7173
7174/* End of pcre_exec.c */
7175