1/*************************************************
2* Perl-Compatible Regular Expressions *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2020 University of Cambridge
11
12-----------------------------------------------------------------------------
13Redistribution and use in source and binary forms, with or without
14modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE.
38-----------------------------------------------------------------------------
39*/
40
41
42/* This module contains the external function pcre2_dfa_match(), which is an
43alternative matching function that uses a sort of DFA algorithm (not a true
44FSM). This is NOT Perl-compatible, but it has advantages in certain
45applications. */
46
47
48/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49the performance of his patterns greatly. I could not use it as it stood, as it
50was not thread safe, and made assumptions about pattern sizes. Also, it caused
51test 7 to loop, and test 9 to crash with a segfault.
52
53The issue is the check for duplicate states, which is done by a simple linear
54search up the state list. (Grep for "duplicate" below to find the code.) For
55many patterns, there will never be many states active at one time, so a simple
56linear search is fine. In patterns that have many active states, it might be a
57bottleneck. The suggested code used an indexing scheme to remember which states
58had previously been used for each character, and avoided the linear search when
59it knew there was no chance of a duplicate. This was implemented when adding
60states to the state lists.
61
62I wrote some thread-safe, not-limited code to try something similar at the time
63of checking for duplicates (instead of when adding states), using index vectors
64on the stack. It did give a 13% improvement with one specially constructed
65pattern for certain subject strings, but on other strings and on many of the
66simpler patterns in the test suite it did worse. The major problem, I think,
67was the extra time to initialize the index. This had to be done for each call
68of internal_dfa_match(). (The supplied patch used a static vector, initialized
69only once - I suspect this was the cause of the problems with the tests.)
70
71Overall, I concluded that the gains in some cases did not outweigh the losses
72in others, so I abandoned this code. */
73
74
75#ifdef HAVE_CONFIG_H
76#include "config.h"
77#endif
78
79#define NLBLOCK mb /* Block containing newline information */
80#define PSSTART start_subject /* Field containing processed string start */
81#define PSEND end_subject /* Field containing processed string end */
82
83#include "pcre2_internal.h"
84
/* Bit mask formed by ORing together the PCRE2_xxx option bits named below.
NOTE(review): presumably this is the set of option bits that callers may pass
to pcre2_dfa_match(); the code that tests against the mask is not in this part
of the file - confirm there before adding or removing a bit. */

#define PUBLIC_DFA_MATCH_OPTIONS \
 (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
 PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
 PCRE2_PARTIAL_SOFT|PCRE2_DFA_SHORTEST|PCRE2_DFA_RESTART| \
 PCRE2_COPY_MATCHED_SUBJECT)
90
91
92/*************************************************
93* Code parameters and static tables *
94*************************************************/
95
/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
into others, under special conditions. A gap of 20 between the blocks should be
enough. The resulting opcodes don't have to be less than 256 because they are
never stored, so we push them well clear of the normal opcodes.

The offset is added to the local opcode value in internal_dfa_match() when the
argument of a type-repeat opcode is one of the items noted against each
definition below; the opcode switch then dispatches on the adjusted value. */

#define OP_PROP_EXTRA 300 /* Argument is OP_PROP or OP_NOTPROP */
#define OP_EXTUNI_EXTRA 320 /* Argument is OP_EXTUNI */
#define OP_ANYNL_EXTRA 340 /* Argument is OP_ANYNL */
#define OP_HSPACE_EXTRA 360 /* Argument is OP_HSPACE or OP_NOT_HSPACE */
#define OP_VSPACE_EXTRA 380 /* Argument is OP_VSPACE or OP_NOT_VSPACE */
106
107
/* This table identifies those opcodes that are followed immediately by a
character that is to be tested in some way. This makes it possible to
centralize the loading of these characters. In the case of Type * etc, the
"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
small value. Non-zero values in the table are the offsets from the opcode where
the character is to be found: 1 when the character directly follows the opcode,
and 1+IMM2_SIZE for the upto, minupto, exact and upto+ forms, where an
IMM2_SIZE-wide field (presumably the repeat count) comes first. A zero entry
means that the opcode has no such character, and none is loaded.

The table is indexed by opcode number and must contain exactly OP_TABLE_LENGTH
entries; the dummy case labels at the top of the opcode switch in
internal_dfa_match() turn a wrong length into a compile-time error.
***NOTE*** If the start of this table is modified, the three tables that follow
must also be modified. */

static const uint8_t coptable[] = {
 0, /* End */
 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
 0, 0, 0, /* Any, AllAny, Anybyte */
 0, 0, /* \P, \p */
 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
 0, /* \X */
 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
 1, /* Char */
 1, /* Chari */
 1, /* not */
 1, /* noti */
 /* Positive single-char repeats */
 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
 1+IMM2_SIZE, /* exact */
 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
 1+IMM2_SIZE, /* exact I */
 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
 /* Negative single-char repeats - only for chars < 256 */
 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
 1+IMM2_SIZE, /* NOT exact */
 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
 1+IMM2_SIZE, /* NOT exact I */
 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
 /* Positive type repeats */
 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
 1+IMM2_SIZE, /* Type exact */
 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
 /* Character class & ref repeats */
 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
 0, 0, /* CRRANGE, CRMINRANGE */
 0, 0, 0, 0, /* Possessive *+, ++, ?+, CRPOSRANGE */
 0, /* CLASS */
 0, /* NCLASS */
 0, /* XCLASS - variable length */
 0, /* REF */
 0, /* REFI */
 0, /* DNREF */
 0, /* DNREFI */
 0, /* RECURSE */
 0, /* CALLOUT */
 0, /* CALLOUT_STR */
 0, /* Alt */
 0, /* Ket */
 0, /* KetRmax */
 0, /* KetRmin */
 0, /* KetRpos */
 0, /* Reverse */
 0, /* Assert */
 0, /* Assert not */
 0, /* Assert behind */
 0, /* Assert behind not */
 0, /* NA assert */
 0, /* NA assert behind */
 0, /* ONCE */
 0, /* SCRIPT_RUN */
 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
 0, 0, /* CREF, DNCREF */
 0, 0, /* RREF, DNRREF */
 0, 0, /* FALSE, TRUE */
 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
 0, 0, /* COMMIT, COMMIT_ARG */
 0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
 0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */
};
192
/* This table identifies those opcodes that inspect a character. It is used to
remember the fact that a character could have been inspected when the end of
the subject is reached: when no subject character is left, internal_dfa_match()
sets its could_continue flag for any opcode whose entry here is non-zero.

The table is indexed by opcode number and must contain exactly OP_TABLE_LENGTH
entries; the dummy case labels at the top of the opcode switch in
internal_dfa_match() turn a wrong length into a compile-time error.
***NOTE*** If the start of this table is modified, the two tables that follow
must also be modified. */

static const uint8_t poptable[] = {
 0, /* End */
 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
 1, 1, 1, /* Any, AllAny, Anybyte */
 1, 1, /* \P, \p */
 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
 1, /* \X */
 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
 1, /* Char */
 1, /* Chari */
 1, /* not */
 1, /* noti */
 /* Positive single-char repeats */
 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
 1, 1, 1, /* upto, minupto, exact */
 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
 1, 1, 1, /* upto I, minupto I, exact I */
 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
 /* Negative single-char repeats - only for chars < 256 */
 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
 1, 1, 1, /* NOT upto, minupto, exact */
 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
 1, 1, 1, /* NOT upto I, minupto I, exact I */
 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
 /* Positive type repeats */
 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
 1, 1, 1, /* Type upto, minupto, exact */
 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
 /* Character class & ref repeats */
 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
 1, 1, /* CRRANGE, CRMINRANGE */
 1, 1, 1, 1, /* Possessive *+, ++, ?+, CRPOSRANGE */
 1, /* CLASS */
 1, /* NCLASS */
 1, /* XCLASS - variable length */
 0, /* REF */
 0, /* REFI */
 0, /* DNREF */
 0, /* DNREFI */
 0, /* RECURSE */
 0, /* CALLOUT */
 0, /* CALLOUT_STR */
 0, /* Alt */
 0, /* Ket */
 0, /* KetRmax */
 0, /* KetRmin */
 0, /* KetRpos */
 0, /* Reverse */
 0, /* Assert */
 0, /* Assert not */
 0, /* Assert behind */
 0, /* Assert behind not */
 0, /* NA assert */
 0, /* NA assert behind */
 0, /* ONCE */
 0, /* SCRIPT_RUN */
 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
 0, 0, /* CREF, DNCREF */
 0, 0, /* RREF, DNRREF */
 0, 0, /* FALSE, TRUE */
 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
 0, 0, /* COMMIT, COMMIT_ARG */
 0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
 0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */
};
269
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
and \w. Both have 14 entries laid out in the same order as the start of the
opcode-indexed tables above: six leading zeros, then the \D, \d, \S, \s, \W, \w
positions, then OP_ANY and OP_ALLANY.

toptable1 holds the same ctype_xxx bit for both members of each pair (negated
and plain), and zero everywhere else. NOTE(review): the expression that
combines toptable1 and toptable2 is outside this part of the file - check it
before changing any entry. */

static const uint8_t toptable1[] = {
 0, 0, 0, 0, 0, 0,
 ctype_digit, ctype_digit,
 ctype_space, ctype_space,
 ctype_word, ctype_word,
 0, 0 /* OP_ANY, OP_ALLANY */
};
280
/* Companion to toptable1, with the same 14-entry layout. Here the ctype_xxx
bit appears only in the first position of each pair (the \D, \S and \W slots);
the second position (\d, \s, \w) is zero, and the OP_ANY and OP_ALLANY slots
hold 1. NOTE(review): how the two tables are combined is not visible in this
part of the file. */

static const uint8_t toptable2[] = {
 0, 0, 0, 0, 0, 0,
 ctype_digit, 0,
 ctype_space, 0,
 ctype_word, 0,
 1, 1 /* OP_ANY, OP_ALLANY */
};
288
289
290/* Structure for holding data about a particular state, which is in effect the
291current data for an active path through the match tree. It must consist
292entirely of ints because the working vector we are passed, and which we put
293these structures in, is a vector of ints. */
294
typedef struct stateblock {
  int offset;                     /* Offset to opcode (-ve has meaning) */
  int count;                      /* Count for repeats */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints occupied by one stateblock. The whole expansion is enclosed
in parentheses so that it behaves as a single operand wherever it is used (for
example as the operand of sizeof or next to a postfix operator). */

#define INTS_PER_STATEBLOCK ((int)(sizeof(stateblock)/sizeof(int)))
302
303
304/* Before version 10.32 the recursive calls of internal_dfa_match() were passed
305local working space and output vectors that were created on the stack. This has
306caused issues for some patterns, especially in small-stack environments such as
307Windows. A new scheme is now in use which sets up a vector on the stack, but if
308this is too small, heap memory is used, up to the heap_limit. The main
309parameters are all numbers of ints because the workspace is a vector of ints.
310
311The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is
312defined in pcre2_internal.h so as to be available to pcre2test when it is
313finding the minimum heap requirement for a match. */
314
/* Number of ints needed to hold one PCRE2_SIZE value; used to express ovector
sizes in the same unit (ints) as the workspace. */

#define OVEC_UNIT (sizeof(PCRE2_SIZE)/sizeof(int))

#define RWS_BASE_SIZE (DFA_START_RWS_SIZE/sizeof(int)) /* Stack vector */
#define RWS_RSIZE 1000 /* Work size for recursion */
#define RWS_OVEC_RSIZE (1000*OVEC_UNIT) /* Ovector for recursion */
#define RWS_OVEC_OSIZE (2*OVEC_UNIT) /* Ovector in other cases */

/* This structure is at the start of each workspace block. Blocks are chained
through the next field; more_workspace() either moves on to an existing next
block or allocates a new one and appends it. */

typedef struct RWS_anchor {
 struct RWS_anchor *next; /* Following block in the chain, or NULL */
 uint32_t size; /* Number of ints */
 uint32_t free; /* Number of ints */
} RWS_anchor;

/* Size of the anchor itself, in ints. more_workspace() subtracts this from the
block size when it sets the free count of a block. */

#define RWS_ANCHOR_SIZE (sizeof(RWS_anchor)/sizeof(int))
331
332
333
334/*************************************************
335* Process a callout *
336*************************************************/
337
338/* This function is called to perform a callout.
339
340Arguments:
341 code current code pointer
342 offsets points to current capture offsets
343 current_subject start of current subject match
344 ptr current position in subject
345 mb the match block
346 extracode extra code offset when called from condition
347 lengthptr where to return the callout length
348
349Returns: the return from the callout
350*/
351
352static int
353do_callout(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
354 PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
355 PCRE2_SIZE *lengthptr)
356{
357pcre2_callout_block *cb = mb->cb;
358
359*lengthptr = (code[extracode] == OP_CALLOUT)?
360 (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
361 (PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode);
362
363if (mb->callout == NULL) return 0; /* No callout provided */
364
365/* Fixed fields in the callout block are set once and for all at the start of
366matching. */
367
368cb->offset_vector = offsets;
369cb->start_match = (PCRE2_SIZE)(current_subject - mb->start_subject);
370cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject);
371cb->pattern_position = GET(code, 1 + extracode);
372cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode);
373
374if (code[extracode] == OP_CALLOUT)
375 {
376 cb->callout_number = code[1 + 2*LINK_SIZE + extracode];
377 cb->callout_string_offset = 0;
378 cb->callout_string = NULL;
379 cb->callout_string_length = 0;
380 }
381else
382 {
383 cb->callout_number = 0;
384 cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode);
385 cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1;
386 cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2;
387 }
388
389return (mb->callout)(cb, mb->callout_data);
390}
391
392
393
394/*************************************************
395* Expand local workspace memory *
396*************************************************/
397
398/* This function is called when internal_dfa_match() is about to be called
399recursively and there is insufficient working space left in the current
400workspace block. If there's an existing next block, use it; otherwise get a new
401block unless the heap limit is reached.
402
403Arguments:
404 rwsptr pointer to block pointer (updated)
405 ovecsize space needed for an ovector
406 mb the match block
407
408Returns: 0 rwsptr has been updated
409 !0 an error code
410*/
411
412static int
413more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb)
414{
415RWS_anchor *rws = *rwsptr;
416RWS_anchor *new;
417
418if (rws->next != NULL)
419 {
420 new = rws->next;
421 }
422
423/* Sizes in the RWS_anchor blocks are in units of sizeof(int), but
424mb->heap_limit and mb->heap_used are in kibibytes. Play carefully, to avoid
425overflow. */
426
427else
428 {
429 uint32_t newsize = (rws->size >= UINT32_MAX/2)? UINT32_MAX/2 : rws->size * 2;
430 uint32_t newsizeK = newsize/(1024/sizeof(int));
431
432 if (newsizeK + mb->heap_used > mb->heap_limit)
433 newsizeK = (uint32_t)(mb->heap_limit - mb->heap_used);
434 newsize = newsizeK*(1024/sizeof(int));
435
436 if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE)
437 return PCRE2_ERROR_HEAPLIMIT;
438 new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data);
439 if (new == NULL) return PCRE2_ERROR_NOMEMORY;
440 mb->heap_used += newsizeK;
441 new->next = NULL;
442 new->size = newsize;
443 rws->next = new;
444 }
445
446new->free = new->size - RWS_ANCHOR_SIZE;
447*rwsptr = new;
448return 0;
449}
450
451
452
453/*************************************************
454* Match a Regular Expression - DFA engine *
455*************************************************/
456
457/* This internal function applies a compiled pattern to a subject string,
458starting at a given point, using a DFA engine. This function is called from the
459external one, possibly multiple times if the pattern is not anchored. The
460function calls itself recursively for some kinds of subpattern.
461
462Arguments:
463 mb the match_data block with fixed information
464 this_start_code the opening bracket of this subexpression's code
465 current_subject where we currently are in the subject string
466 start_offset start offset in the subject string
467 offsets vector to contain the matching string offsets
468 offsetcount size of same
469 workspace vector of workspace
470 wscount size of same
  rlevel            function call recursion level
  RWS               workspace vector for recursive calls (see the RWS_xxx
                      definitions above)
472
473Returns: > 0 => number of match offset pairs placed in offsets
474 = 0 => offsets overflowed; longest matches are present
475 -1 => failed to match
476 < -1 => some kind of unexpected problem
477
478The following macros are used for adding states to the two state vectors (one
479for the current character, one for the following character). */
480
/* Each macro first counts the state and makes the enclosing function return
PCRE2_ERROR_DFA_WSSIZE if the relevant list is already full (the count is
incremented even in that case). Otherwise the state is stored at the "next"
pointer for the list, which is then advanced. The _DATA variants also set the
data field. Every use must be followed by a semicolon. */

/* Append a state to the active list */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state++; \
    } \
  while (0)

/* Append a state with extra data to the active list */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->data = (z); \
    next_active_state++; \
    } \
  while (0)

/* Append a state to the new list (for the following character) */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state++; \
    } \
  while (0)

/* Append a state with extra data to the new list */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->data = (z); \
    next_new_state++; \
    } \
  while (0)
518
519/* And now, here is the code */
520
521static int
522internal_dfa_match(
523 dfa_match_block *mb,
524 PCRE2_SPTR this_start_code,
525 PCRE2_SPTR current_subject,
526 PCRE2_SIZE start_offset,
527 PCRE2_SIZE *offsets,
528 uint32_t offsetcount,
529 int *workspace,
530 int wscount,
531 uint32_t rlevel,
532 int *RWS)
533{
534stateblock *active_states, *new_states, *temp_states;
535stateblock *next_active_state, *next_new_state;
536const uint8_t *ctypes, *lcc, *fcc;
537PCRE2_SPTR ptr;
538PCRE2_SPTR end_code;
539dfa_recursion_info new_recursive;
540int active_count, new_count, match_count;
541
542/* Some fields in the mb block are frequently referenced, so we load them into
543independent variables in the hope that this will perform better. */
544
545PCRE2_SPTR start_subject = mb->start_subject;
546PCRE2_SPTR end_subject = mb->end_subject;
547PCRE2_SPTR start_code = mb->start_code;
548
549#ifdef SUPPORT_UNICODE
550BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
551BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;
552#else
553BOOL utf = FALSE;
554#endif
555
556BOOL reset_could_continue = FALSE;
557
558if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
559if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
560offsetcount &= (uint32_t)(-2); /* Round down */
561
562wscount -= 2;
563wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
564 (2 * INTS_PER_STATEBLOCK);
565
566ctypes = mb->tables + ctypes_offset;
567lcc = mb->tables + lcc_offset;
568fcc = mb->tables + fcc_offset;
569
570match_count = PCRE2_ERROR_NOMATCH; /* A negative number */
571
572active_states = (stateblock *)(workspace + 2);
573next_new_state = new_states = active_states + wscount;
574new_count = 0;
575
576/* The first thing in any (sub) pattern is a bracket of some sort. Push all
577the alternative states onto the list, and find out where the end is. This
makes it possible to use this function recursively, when we want to stop at a
579matching internal ket rather than at the end.
580
581If we are dealing with a backward assertion we have to find out the maximum
582amount to move back, and set up each alternative appropriately. */
583
584if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
585 {
586 size_t max_back = 0;
587 size_t gone_back;
588
589 end_code = this_start_code;
590 do
591 {
592 size_t back = (size_t)GET(end_code, 2+LINK_SIZE);
593 if (back > max_back) max_back = back;
594 end_code += GET(end_code, 1);
595 }
596 while (*end_code == OP_ALT);
597
598 /* If we can't go back the amount required for the longest lookbehind
599 pattern, go back as far as we can; some alternatives may still be viable. */
600
601#ifdef SUPPORT_UNICODE
602 /* In character mode we have to step back character by character */
603
604 if (utf)
605 {
606 for (gone_back = 0; gone_back < max_back; gone_back++)
607 {
608 if (current_subject <= start_subject) break;
609 current_subject--;
610 ACROSSCHAR(current_subject > start_subject, current_subject,
611 current_subject--);
612 }
613 }
614 else
615#endif
616
617 /* In byte-mode we can do this quickly. */
618
619 {
620 size_t current_offset = (size_t)(current_subject - start_subject);
621 gone_back = (current_offset < max_back)? current_offset : max_back;
622 current_subject -= gone_back;
623 }
624
625 /* Save the earliest consulted character */
626
627 if (current_subject < mb->start_used_ptr)
628 mb->start_used_ptr = current_subject;
629
630 /* Now we can process the individual branches. There will be an OP_REVERSE at
631 the start of each branch, except when the length of the branch is zero. */
632
633 end_code = this_start_code;
634 do
635 {
636 uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + LINK_SIZE : 0;
637 size_t back = (revlen == 0)? 0 : (size_t)GET(end_code, 2+LINK_SIZE);
638 if (back <= gone_back)
639 {
640 int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);
641 ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
642 }
643 end_code += GET(end_code, 1);
644 }
645 while (*end_code == OP_ALT);
646 }
647
648/* This is the code for a "normal" subpattern (not a backward assertion). The
649start of a whole pattern is always one of these. If we are at the top level,
650we may be asked to restart matching from the same point that we reached for a
651previous partial match. We still have to scan through the top-level branches to
652find the end state. */
653
654else
655 {
656 end_code = this_start_code;
657
658 /* Restarting */
659
660 if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
661 {
662 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
663 new_count = workspace[1];
664 if (!workspace[0])
665 memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
666 }
667
668 /* Not restarting */
669
670 else
671 {
672 int length = 1 + LINK_SIZE +
673 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
674 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
675 ? IMM2_SIZE:0);
676 do
677 {
678 ADD_NEW((int)(end_code - start_code + length), 0);
679 end_code += GET(end_code, 1);
680 length = 1 + LINK_SIZE;
681 }
682 while (*end_code == OP_ALT);
683 }
684 }
685
686workspace[0] = 0; /* Bit indicating which vector is current */
687
688/* Loop for scanning the subject */
689
690ptr = current_subject;
691for (;;)
692 {
693 int i, j;
694 int clen, dlen;
695 uint32_t c, d;
696 int forced_fail = 0;
697 BOOL partial_newline = FALSE;
698 BOOL could_continue = reset_could_continue;
699 reset_could_continue = FALSE;
700
701 if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
702
703 /* Make the new state list into the active state list and empty the
704 new state list. */
705
706 temp_states = active_states;
707 active_states = new_states;
708 new_states = temp_states;
709 active_count = new_count;
710 new_count = 0;
711
712 workspace[0] ^= 1; /* Remember for the restarting feature */
713 workspace[1] = active_count;
714
715 /* Set the pointers for adding new states */
716
717 next_active_state = active_states + active_count;
718 next_new_state = new_states;
719
720 /* Load the current character from the subject outside the loop, as many
721 different states may want to look at it, and we assume that at least one
722 will. */
723
724 if (ptr < end_subject)
725 {
726 clen = 1; /* Number of data items in the character */
727#ifdef SUPPORT_UNICODE
728 GETCHARLENTEST(c, ptr, clen);
729#else
730 c = *ptr;
731#endif /* SUPPORT_UNICODE */
732 }
733 else
734 {
735 clen = 0; /* This indicates the end of the subject */
736 c = NOTACHAR; /* This value should never actually be used */
737 }
738
739 /* Scan up the active states and act on each one. The result of an action
740 may be to add more states to the currently active list (e.g. on hitting a
741 parenthesis) or it may be to put states on the new list, for considering
742 when we move the character pointer on. */
743
744 for (i = 0; i < active_count; i++)
745 {
746 stateblock *current_state = active_states + i;
747 BOOL caseless = FALSE;
748 PCRE2_SPTR code;
749 uint32_t codevalue;
750 int state_offset = current_state->offset;
751 int rrc;
752 int count;
753
754 /* A negative offset is a special case meaning "hold off going to this
755 (negated) state until the number of characters in the data field have
756 been skipped". If the could_continue flag was passed over from a previous
 state, arrange for it to be passed on. */
758
759 if (state_offset < 0)
760 {
761 if (current_state->data > 0)
762 {
763 ADD_NEW_DATA(state_offset, current_state->count,
764 current_state->data - 1);
765 if (could_continue) reset_could_continue = TRUE;
766 continue;
767 }
768 else
769 {
770 current_state->offset = state_offset = -state_offset;
771 }
772 }
773
774 /* Check for a duplicate state with the same count, and skip if found.
775 See the note at the head of this module about the possibility of improving
776 performance here. */
777
778 for (j = 0; j < i; j++)
779 {
780 if (active_states[j].offset == state_offset &&
781 active_states[j].count == current_state->count)
782 goto NEXT_ACTIVE_STATE;
783 }
784
785 /* The state offset is the offset to the opcode */
786
787 code = start_code + state_offset;
788 codevalue = *code;
789
790 /* If this opcode inspects a character, but we are at the end of the
791 subject, remember the fact for use when testing for a partial match. */
792
793 if (clen == 0 && poptable[codevalue] != 0)
794 could_continue = TRUE;
795
796 /* If this opcode is followed by an inline character, load it. It is
797 tempting to test for the presence of a subject character here, but that
798 is wrong, because sometimes zero repetitions of the subject are
799 permitted.
800
801 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
802 argument that is not a data character - but is always one byte long because
803 the values are small. We have to take special action to deal with \P, \p,
804 \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
805 these ones to new opcodes. */
806
807 if (coptable[codevalue] > 0)
808 {
809 dlen = 1;
810#ifdef SUPPORT_UNICODE
811 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
812#endif /* SUPPORT_UNICODE */
813 d = code[coptable[codevalue]];
814 if (codevalue >= OP_TYPESTAR)
815 {
816 switch(d)
817 {
818 case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
819 case OP_NOTPROP:
820 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
821 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
822 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
823 case OP_NOT_HSPACE:
824 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
825 case OP_NOT_VSPACE:
826 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
827 default: break;
828 }
829 }
830 }
831 else
832 {
833 dlen = 0; /* Not strictly necessary, but compilers moan */
834 d = NOTACHAR; /* if these variables are not set. */
835 }
836
837
838 /* Now process the individual opcodes */
839
840 switch (codevalue)
841 {
842/* ========================================================================== */
843 /* These cases are never obeyed. This is a fudge that causes a compile-
844 time error if the vectors coptable or poptable, which are indexed by
845 opcode, are not the correct length. It seems to be the only way to do
846 such a check at compile time, as the sizeof() operator does not work
847 in the C preprocessor. */
848
849 case OP_TABLE_LENGTH:
850 case OP_TABLE_LENGTH +
851 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
852 (sizeof(poptable) == OP_TABLE_LENGTH)):
853 return 0;
854
855/* ========================================================================== */
856 /* Reached a closing bracket. If not at the end of the pattern, carry
857 on with the next opcode. For repeating opcodes, also add the repeat
858 state. Note that KETRPOS will always be encountered at the end of the
859 subpattern, because the possessive subpattern repeats are always handled
860 using recursive calls. Thus, it never adds any new states.
861
862 At the end of the (sub)pattern, unless we have an empty string and
863 PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
864 start of the subject, save the match data, shifting up all previous
865 matches so we always have the longest first. */
866
867 case OP_KET:
868 case OP_KETRMIN:
869 case OP_KETRMAX:
870 case OP_KETRPOS:
871 if (code != end_code)
872 {
873 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
874 if (codevalue != OP_KET)
875 {
876 ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);
877 }
878 }
879 else
880 {
881 if (ptr > current_subject ||
882 ((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
883 ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
884 current_subject > start_subject + mb->start_offset)))
885 {
886 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
887 else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
888 match_count = 0;
889 count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
890 if (count > 0) (void)memmove(offsets + 2, offsets,
891 (size_t)count * sizeof(PCRE2_SIZE));
892 if (offsetcount >= 2)
893 {
894 offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);
895 offsets[1] = (PCRE2_SIZE)(ptr - start_subject);
896 }
897 if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
898 }
899 }
900 break;
901
902/* ========================================================================== */
903 /* These opcodes add to the current list of states without looking
904 at the current character. */
905
906 /*-----------------------------------------------------------------*/
907 case OP_ALT:
908 do { code += GET(code, 1); } while (*code == OP_ALT);
909 ADD_ACTIVE((int)(code - start_code), 0);
910 break;
911
912 /*-----------------------------------------------------------------*/
913 case OP_BRA:
914 case OP_SBRA:
915 do
916 {
917 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
918 code += GET(code, 1);
919 }
920 while (*code == OP_ALT);
921 break;
922
923 /*-----------------------------------------------------------------*/
924 case OP_CBRA:
925 case OP_SCBRA:
926 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
927 code += GET(code, 1);
928 while (*code == OP_ALT)
929 {
930 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
931 code += GET(code, 1);
932 }
933 break;
934
935 /*-----------------------------------------------------------------*/
936 case OP_BRAZERO:
937 case OP_BRAMINZERO:
938 ADD_ACTIVE(state_offset + 1, 0);
939 code += 1 + GET(code, 2);
940 while (*code == OP_ALT) code += GET(code, 1);
941 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
942 break;
943
944 /*-----------------------------------------------------------------*/
945 case OP_SKIPZERO:
946 code += 1 + GET(code, 2);
947 while (*code == OP_ALT) code += GET(code, 1);
948 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
949 break;
950
951 /*-----------------------------------------------------------------*/
952 case OP_CIRC:
953 if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
954 { ADD_ACTIVE(state_offset + 1, 0); }
955 break;
956
957 /*-----------------------------------------------------------------*/
958 case OP_CIRCM:
959 if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
960 ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )
961 && WAS_NEWLINE(ptr)))
962 { ADD_ACTIVE(state_offset + 1, 0); }
963 break;
964
965 /*-----------------------------------------------------------------*/
966 case OP_EOD:
967 if (ptr >= end_subject)
968 {
969 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
970 return PCRE2_ERROR_PARTIAL;
971 else { ADD_ACTIVE(state_offset + 1, 0); }
972 }
973 break;
974
975 /*-----------------------------------------------------------------*/
976 case OP_SOD:
977 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
978 break;
979
980 /*-----------------------------------------------------------------*/
981 case OP_SOM:
982 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
983 break;
984
985
986/* ========================================================================== */
987 /* These opcodes inspect the next subject character, and sometimes
988 the previous one as well, but do not have an argument. The variable
989 clen contains the length of the current character and is zero if we are
990 at the end of the subject. */
991
992 /*-----------------------------------------------------------------*/
993 case OP_ANY:
994 if (clen > 0 && !IS_NEWLINE(ptr))
995 {
996 if (ptr + 1 >= mb->end_subject &&
997 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
998 NLBLOCK->nltype == NLTYPE_FIXED &&
999 NLBLOCK->nllen == 2 &&
1000 c == NLBLOCK->nl[0])
1001 {
1002 could_continue = partial_newline = TRUE;
1003 }
1004 else
1005 {
1006 ADD_NEW(state_offset + 1, 0);
1007 }
1008 }
1009 break;
1010
1011 /*-----------------------------------------------------------------*/
1012 case OP_ALLANY:
1013 if (clen > 0)
1014 { ADD_NEW(state_offset + 1, 0); }
1015 break;
1016
1017 /*-----------------------------------------------------------------*/
1018 case OP_EODN:
1019 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
1020 {
1021 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1022 return PCRE2_ERROR_PARTIAL;
1023 ADD_ACTIVE(state_offset + 1, 0);
1024 }
1025 break;
1026
1027 /*-----------------------------------------------------------------*/
1028 case OP_DOLL:
1029 if ((mb->moptions & PCRE2_NOTEOL) == 0)
1030 {
1031 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1032 could_continue = TRUE;
1033 else if (clen == 0 ||
1034 ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
1035 (ptr == end_subject - mb->nllen)
1036 ))
1037 { ADD_ACTIVE(state_offset + 1, 0); }
1038 else if (ptr + 1 >= mb->end_subject &&
1039 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1040 NLBLOCK->nltype == NLTYPE_FIXED &&
1041 NLBLOCK->nllen == 2 &&
1042 c == NLBLOCK->nl[0])
1043 {
1044 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1045 {
1046 reset_could_continue = TRUE;
1047 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1048 }
1049 else could_continue = partial_newline = TRUE;
1050 }
1051 }
1052 break;
1053
1054 /*-----------------------------------------------------------------*/
1055 case OP_DOLLM:
1056 if ((mb->moptions & PCRE2_NOTEOL) == 0)
1057 {
1058 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1059 could_continue = TRUE;
1060 else if (clen == 0 ||
1061 ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
1062 { ADD_ACTIVE(state_offset + 1, 0); }
1063 else if (ptr + 1 >= mb->end_subject &&
1064 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1065 NLBLOCK->nltype == NLTYPE_FIXED &&
1066 NLBLOCK->nllen == 2 &&
1067 c == NLBLOCK->nl[0])
1068 {
1069 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1070 {
1071 reset_could_continue = TRUE;
1072 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1073 }
1074 else could_continue = partial_newline = TRUE;
1075 }
1076 }
1077 else if (IS_NEWLINE(ptr))
1078 { ADD_ACTIVE(state_offset + 1, 0); }
1079 break;
1080
1081 /*-----------------------------------------------------------------*/
1082
1083 case OP_DIGIT:
1084 case OP_WHITESPACE:
1085 case OP_WORDCHAR:
1086 if (clen > 0 && c < 256 &&
1087 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
1088 { ADD_NEW(state_offset + 1, 0); }
1089 break;
1090
1091 /*-----------------------------------------------------------------*/
1092 case OP_NOT_DIGIT:
1093 case OP_NOT_WHITESPACE:
1094 case OP_NOT_WORDCHAR:
1095 if (clen > 0 && (c >= 256 ||
1096 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1097 { ADD_NEW(state_offset + 1, 0); }
1098 break;
1099
1100 /*-----------------------------------------------------------------*/
1101 case OP_WORD_BOUNDARY:
1102 case OP_NOT_WORD_BOUNDARY:
1103 {
1104 int left_word, right_word;
1105
1106 if (ptr > start_subject)
1107 {
1108 PCRE2_SPTR temp = ptr - 1;
1109 if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
1110#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1111 if (utf) { BACKCHAR(temp); }
1112#endif
1113 GETCHARTEST(d, temp);
1114#ifdef SUPPORT_UNICODE
1115 if ((mb->poptions & PCRE2_UCP) != 0)
1116 {
1117 if (d == '_') left_word = TRUE; else
1118 {
1119 uint32_t cat = UCD_CATEGORY(d);
1120 left_word = (cat == ucp_L || cat == ucp_N);
1121 }
1122 }
1123 else
1124#endif
1125 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1126 }
1127 else left_word = FALSE;
1128
1129 if (clen > 0)
1130 {
1131 if (ptr >= mb->last_used_ptr)
1132 {
1133 PCRE2_SPTR temp = ptr + 1;
1134#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1135 if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
1136#endif
1137 mb->last_used_ptr = temp;
1138 }
1139#ifdef SUPPORT_UNICODE
1140 if ((mb->poptions & PCRE2_UCP) != 0)
1141 {
1142 if (c == '_') right_word = TRUE; else
1143 {
1144 uint32_t cat = UCD_CATEGORY(c);
1145 right_word = (cat == ucp_L || cat == ucp_N);
1146 }
1147 }
1148 else
1149#endif
1150 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1151 }
1152 else right_word = FALSE;
1153
1154 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1155 { ADD_ACTIVE(state_offset + 1, 0); }
1156 }
1157 break;
1158
1159
1160 /*-----------------------------------------------------------------*/
1161 /* Check the next character by Unicode property. We will get here only
1162 if the support is in the binary; otherwise a compile-time error occurs.
1163 */
1164
1165#ifdef SUPPORT_UNICODE
1166 case OP_PROP:
1167 case OP_NOTPROP:
1168 if (clen > 0)
1169 {
1170 BOOL OK;
1171 const uint32_t *cp;
1172 const ucd_record * prop = GET_UCD(c);
1173 switch(code[1])
1174 {
1175 case PT_ANY:
1176 OK = TRUE;
1177 break;
1178
1179 case PT_LAMP:
1180 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1181 prop->chartype == ucp_Lt;
1182 break;
1183
1184 case PT_GC:
1185 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1186 break;
1187
1188 case PT_PC:
1189 OK = prop->chartype == code[2];
1190 break;
1191
1192 case PT_SC:
1193 OK = prop->script == code[2];
1194 break;
1195
1196 /* These are specials for combination cases. */
1197
1198 case PT_ALNUM:
1199 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1200 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1201 break;
1202
1203 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1204 which means that Perl space and POSIX space are now identical. PCRE
1205 was changed at release 8.34. */
1206
1207 case PT_SPACE: /* Perl space */
1208 case PT_PXSPACE: /* POSIX space */
1209 switch(c)
1210 {
1211 HSPACE_CASES:
1212 VSPACE_CASES:
1213 OK = TRUE;
1214 break;
1215
1216 default:
1217 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1218 break;
1219 }
1220 break;
1221
1222 case PT_WORD:
1223 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1224 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1225 c == CHAR_UNDERSCORE;
1226 break;
1227
1228 case PT_CLIST:
1229 cp = PRIV(ucd_caseless_sets) + code[2];
1230 for (;;)
1231 {
1232 if (c < *cp) { OK = FALSE; break; }
1233 if (c == *cp++) { OK = TRUE; break; }
1234 }
1235 break;
1236
1237 case PT_UCNC:
1238 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1239 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1240 c >= 0xe000;
1241 break;
1242
1243 /* Should never occur, but keep compilers from grumbling. */
1244
1245 default:
1246 OK = codevalue != OP_PROP;
1247 break;
1248 }
1249
1250 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1251 }
1252 break;
1253#endif
1254
1255
1256
1257/* ========================================================================== */
1258 /* These opcodes likewise inspect the subject character, but have an
1259 argument that is not a data character. It is one of these opcodes:
1260 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE,
1261 OP_NOT_WHITESPACE, OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1262
1263 case OP_TYPEPLUS:
1264 case OP_TYPEMINPLUS:
1265 case OP_TYPEPOSPLUS:
1266 count = current_state->count; /* Already matched */
1267 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1268 if (clen > 0)
1269 {
1270 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1271 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1272 NLBLOCK->nltype == NLTYPE_FIXED &&
1273 NLBLOCK->nllen == 2 &&
1274 c == NLBLOCK->nl[0])
1275 {
1276 could_continue = partial_newline = TRUE;
1277 }
1278 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1279 (c < 256 &&
1280 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1281 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1282 {
1283 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1284 {
1285 active_count--; /* Remove non-match possibility */
1286 next_active_state--;
1287 }
1288 count++;
1289 ADD_NEW(state_offset, count);
1290 }
1291 }
1292 break;
1293
1294 /*-----------------------------------------------------------------*/
1295 case OP_TYPEQUERY:
1296 case OP_TYPEMINQUERY:
1297 case OP_TYPEPOSQUERY:
1298 ADD_ACTIVE(state_offset + 2, 0);
1299 if (clen > 0)
1300 {
1301 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1302 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1303 NLBLOCK->nltype == NLTYPE_FIXED &&
1304 NLBLOCK->nllen == 2 &&
1305 c == NLBLOCK->nl[0])
1306 {
1307 could_continue = partial_newline = TRUE;
1308 }
1309 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1310 (c < 256 &&
1311 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1312 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1313 {
1314 if (codevalue == OP_TYPEPOSQUERY)
1315 {
1316 active_count--; /* Remove non-match possibility */
1317 next_active_state--;
1318 }
1319 ADD_NEW(state_offset + 2, 0);
1320 }
1321 }
1322 break;
1323
1324 /*-----------------------------------------------------------------*/
1325 case OP_TYPESTAR:
1326 case OP_TYPEMINSTAR:
1327 case OP_TYPEPOSSTAR:
1328 ADD_ACTIVE(state_offset + 2, 0);
1329 if (clen > 0)
1330 {
1331 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1332 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1333 NLBLOCK->nltype == NLTYPE_FIXED &&
1334 NLBLOCK->nllen == 2 &&
1335 c == NLBLOCK->nl[0])
1336 {
1337 could_continue = partial_newline = TRUE;
1338 }
1339 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1340 (c < 256 &&
1341 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1342 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1343 {
1344 if (codevalue == OP_TYPEPOSSTAR)
1345 {
1346 active_count--; /* Remove non-match possibility */
1347 next_active_state--;
1348 }
1349 ADD_NEW(state_offset, 0);
1350 }
1351 }
1352 break;
1353
1354 /*-----------------------------------------------------------------*/
1355 case OP_TYPEEXACT:
1356 count = current_state->count; /* Number already matched */
1357 if (clen > 0)
1358 {
1359 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1360 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1361 NLBLOCK->nltype == NLTYPE_FIXED &&
1362 NLBLOCK->nllen == 2 &&
1363 c == NLBLOCK->nl[0])
1364 {
1365 could_continue = partial_newline = TRUE;
1366 }
1367 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1368 (c < 256 &&
1369 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1370 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1371 {
1372 if (++count >= (int)GET2(code, 1))
1373 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1374 else
1375 { ADD_NEW(state_offset, count); }
1376 }
1377 }
1378 break;
1379
1380 /*-----------------------------------------------------------------*/
1381 case OP_TYPEUPTO:
1382 case OP_TYPEMINUPTO:
1383 case OP_TYPEPOSUPTO:
1384 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1385 count = current_state->count; /* Number already matched */
1386 if (clen > 0)
1387 {
1388 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1389 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1390 NLBLOCK->nltype == NLTYPE_FIXED &&
1391 NLBLOCK->nllen == 2 &&
1392 c == NLBLOCK->nl[0])
1393 {
1394 could_continue = partial_newline = TRUE;
1395 }
1396 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1397 (c < 256 &&
1398 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1399 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1400 {
1401 if (codevalue == OP_TYPEPOSUPTO)
1402 {
1403 active_count--; /* Remove non-match possibility */
1404 next_active_state--;
1405 }
1406 if (++count >= (int)GET2(code, 1))
1407 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1408 else
1409 { ADD_NEW(state_offset, count); }
1410 }
1411 }
1412 break;
1413
1414/* ========================================================================== */
1415 /* These are virtual opcodes that are used when something like
1416 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or one of the
1417 OP_HSPACE/OP_VSPACE types as its argument. It keeps the code above fast for
1418 the other cases. The argument is in the d variable. */
1419
1420#ifdef SUPPORT_UNICODE
1421 case OP_PROP_EXTRA + OP_TYPEPLUS:
1422 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1423 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1424 count = current_state->count; /* Already matched */
1425 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1426 if (clen > 0)
1427 {
1428 BOOL OK;
1429 const uint32_t *cp;
1430 const ucd_record * prop = GET_UCD(c);
1431 switch(code[2])
1432 {
1433 case PT_ANY:
1434 OK = TRUE;
1435 break;
1436
1437 case PT_LAMP:
1438 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1439 prop->chartype == ucp_Lt;
1440 break;
1441
1442 case PT_GC:
1443 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1444 break;
1445
1446 case PT_PC:
1447 OK = prop->chartype == code[3];
1448 break;
1449
1450 case PT_SC:
1451 OK = prop->script == code[3];
1452 break;
1453
1454 /* These are specials for combination cases. */
1455
1456 case PT_ALNUM:
1457 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1458 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1459 break;
1460
1461 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1462 which means that Perl space and POSIX space are now identical. PCRE
1463 was changed at release 8.34. */
1464
1465 case PT_SPACE: /* Perl space */
1466 case PT_PXSPACE: /* POSIX space */
1467 switch(c)
1468 {
1469 HSPACE_CASES:
1470 VSPACE_CASES:
1471 OK = TRUE;
1472 break;
1473
1474 default:
1475 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1476 break;
1477 }
1478 break;
1479
1480 case PT_WORD:
1481 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1482 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1483 c == CHAR_UNDERSCORE;
1484 break;
1485
1486 case PT_CLIST:
1487 cp = PRIV(ucd_caseless_sets) + code[3];
1488 for (;;)
1489 {
1490 if (c < *cp) { OK = FALSE; break; }
1491 if (c == *cp++) { OK = TRUE; break; }
1492 }
1493 break;
1494
1495 case PT_UCNC:
1496 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1497 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1498 c >= 0xe000;
1499 break;
1500
1501 /* Should never occur, but keep compilers from grumbling. */
1502
1503 default:
1504 OK = codevalue != OP_PROP;
1505 break;
1506 }
1507
1508 if (OK == (d == OP_PROP))
1509 {
1510 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1511 {
1512 active_count--; /* Remove non-match possibility */
1513 next_active_state--;
1514 }
1515 count++;
1516 ADD_NEW(state_offset, count);
1517 }
1518 }
1519 break;
1520
1521 /*-----------------------------------------------------------------*/
1522 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1523 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1524 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1525 count = current_state->count; /* Already matched */
1526 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1527 if (clen > 0)
1528 {
1529 int ncount = 0;
1530 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1531 {
1532 active_count--; /* Remove non-match possibility */
1533 next_active_state--;
1534 }
1535 (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1536 &ncount);
1537 count++;
1538 ADD_NEW_DATA(-state_offset, count, ncount);
1539 }
1540 break;
1541#endif
1542
1543 /*-----------------------------------------------------------------*/
1544 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1545 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1546 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1547 count = current_state->count; /* Already matched */
1548 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1549 if (clen > 0)
1550 {
1551 int ncount = 0;
1552 switch (c)
1553 {
1554 case CHAR_VT:
1555 case CHAR_FF:
1556 case CHAR_NEL:
1557#ifndef EBCDIC
1558 case 0x2028:
1559 case 0x2029:
1560#endif /* Not EBCDIC */
1561 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1562 goto ANYNL01;
1563
1564 case CHAR_CR:
1565 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1566 /* Fall through */
1567
1568 ANYNL01:
1569 case CHAR_LF:
1570 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1571 {
1572 active_count--; /* Remove non-match possibility */
1573 next_active_state--;
1574 }
1575 count++;
1576 ADD_NEW_DATA(-state_offset, count, ncount);
1577 break;
1578
1579 default:
1580 break;
1581 }
1582 }
1583 break;
1584
1585 /*-----------------------------------------------------------------*/
1586 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1587 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1588 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1589 count = current_state->count; /* Already matched */
1590 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1591 if (clen > 0)
1592 {
1593 BOOL OK;
1594 switch (c)
1595 {
1596 VSPACE_CASES:
1597 OK = TRUE;
1598 break;
1599
1600 default:
1601 OK = FALSE;
1602 break;
1603 }
1604
1605 if (OK == (d == OP_VSPACE))
1606 {
1607 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1608 {
1609 active_count--; /* Remove non-match possibility */
1610 next_active_state--;
1611 }
1612 count++;
1613 ADD_NEW_DATA(-state_offset, count, 0);
1614 }
1615 }
1616 break;
1617
1618 /*-----------------------------------------------------------------*/
1619 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1620 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1621 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1622 count = current_state->count; /* Already matched */
1623 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1624 if (clen > 0)
1625 {
1626 BOOL OK;
1627 switch (c)
1628 {
1629 HSPACE_CASES:
1630 OK = TRUE;
1631 break;
1632
1633 default:
1634 OK = FALSE;
1635 break;
1636 }
1637
1638 if (OK == (d == OP_HSPACE))
1639 {
1640 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1641 {
1642 active_count--; /* Remove non-match possibility */
1643 next_active_state--;
1644 }
1645 count++;
1646 ADD_NEW_DATA(-state_offset, count, 0);
1647 }
1648 }
1649 break;
1650
1651 /*-----------------------------------------------------------------*/
1652#ifdef SUPPORT_UNICODE
1653 case OP_PROP_EXTRA + OP_TYPEQUERY:
1654 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1655 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1656 count = 4;
1657 goto QS1;
1658
1659 case OP_PROP_EXTRA + OP_TYPESTAR:
1660 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1661 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1662 count = 0;
1663
1664 QS1:
1665
1666 ADD_ACTIVE(state_offset + 4, 0);
1667 if (clen > 0)
1668 {
1669 BOOL OK;
1670 const uint32_t *cp;
1671 const ucd_record * prop = GET_UCD(c);
1672 switch(code[2])
1673 {
1674 case PT_ANY:
1675 OK = TRUE;
1676 break;
1677
1678 case PT_LAMP:
1679 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1680 prop->chartype == ucp_Lt;
1681 break;
1682
1683 case PT_GC:
1684 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1685 break;
1686
1687 case PT_PC:
1688 OK = prop->chartype == code[3];
1689 break;
1690
1691 case PT_SC:
1692 OK = prop->script == code[3];
1693 break;
1694
1695 /* These are specials for combination cases. */
1696
1697 case PT_ALNUM:
1698 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1699 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1700 break;
1701
1702 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1703 which means that Perl space and POSIX space are now identical. PCRE
1704 was changed at release 8.34. */
1705
1706 case PT_SPACE: /* Perl space */
1707 case PT_PXSPACE: /* POSIX space */
1708 switch(c)
1709 {
1710 HSPACE_CASES:
1711 VSPACE_CASES:
1712 OK = TRUE;
1713 break;
1714
1715 default:
1716 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1717 break;
1718 }
1719 break;
1720
1721 case PT_WORD:
1722 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1723 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1724 c == CHAR_UNDERSCORE;
1725 break;
1726
1727 case PT_CLIST:
1728 cp = PRIV(ucd_caseless_sets) + code[3];
1729 for (;;)
1730 {
1731 if (c < *cp) { OK = FALSE; break; }
1732 if (c == *cp++) { OK = TRUE; break; }
1733 }
1734 break;
1735
1736 case PT_UCNC:
1737 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1738 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1739 c >= 0xe000;
1740 break;
1741
1742 /* Should never occur, but keep compilers from grumbling. */
1743
1744 default:
1745 OK = codevalue != OP_PROP;
1746 break;
1747 }
1748
1749 if (OK == (d == OP_PROP))
1750 {
1751 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1752 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1753 {
1754 active_count--; /* Remove non-match possibility */
1755 next_active_state--;
1756 }
1757 ADD_NEW(state_offset + count, 0);
1758 }
1759 }
1760 break;
1761
1762 /*-----------------------------------------------------------------*/
1763 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1764 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1765 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1766 count = 2;
1767 goto QS2;
1768
1769 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1770 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1771 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1772 count = 0;
1773
1774 QS2:
1775
1776 ADD_ACTIVE(state_offset + 2, 0);
1777 if (clen > 0)
1778 {
1779 int ncount = 0;
1780 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1781 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1782 {
1783 active_count--; /* Remove non-match possibility */
1784 next_active_state--;
1785 }
1786 (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1787 &ncount);
1788 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1789 }
1790 break;
1791#endif
1792
1793 /*-----------------------------------------------------------------*/
1794 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1795 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1796 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1797 count = 2;
1798 goto QS3;
1799
1800 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1801 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1802 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1803 count = 0;
1804
1805 QS3:
1806 ADD_ACTIVE(state_offset + 2, 0);
1807 if (clen > 0)
1808 {
1809 int ncount = 0;
1810 switch (c)
1811 {
1812 case CHAR_VT:
1813 case CHAR_FF:
1814 case CHAR_NEL:
1815#ifndef EBCDIC
1816 case 0x2028:
1817 case 0x2029:
1818#endif /* Not EBCDIC */
1819 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1820 goto ANYNL02;
1821
1822 case CHAR_CR:
1823 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1824 /* Fall through */
1825
1826 ANYNL02:
1827 case CHAR_LF:
1828 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1829 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1830 {
1831 active_count--; /* Remove non-match possibility */
1832 next_active_state--;
1833 }
1834 ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1835 break;
1836
1837 default:
1838 break;
1839 }
1840 }
1841 break;
1842
1843 /*-----------------------------------------------------------------*/
1844 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1845 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1846 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1847 count = 2;
1848 goto QS4;
1849
1850 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1851 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1852 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1853 count = 0;
1854
1855 QS4:
1856 ADD_ACTIVE(state_offset + 2, 0);
1857 if (clen > 0)
1858 {
1859 BOOL OK;
1860 switch (c)
1861 {
1862 VSPACE_CASES:
1863 OK = TRUE;
1864 break;
1865
1866 default:
1867 OK = FALSE;
1868 break;
1869 }
1870 if (OK == (d == OP_VSPACE))
1871 {
1872 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1873 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1874 {
1875 active_count--; /* Remove non-match possibility */
1876 next_active_state--;
1877 }
1878 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1879 }
1880 }
1881 break;
1882
1883 /*-----------------------------------------------------------------*/
1884 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1885 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1886 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1887 count = 2;
1888 goto QS5;
1889
1890 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1891 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1892 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1893 count = 0;
1894
1895 QS5:
1896 ADD_ACTIVE(state_offset + 2, 0);
1897 if (clen > 0)
1898 {
1899 BOOL OK;
1900 switch (c)
1901 {
1902 HSPACE_CASES:
1903 OK = TRUE;
1904 break;
1905
1906 default:
1907 OK = FALSE;
1908 break;
1909 }
1910
1911 if (OK == (d == OP_HSPACE))
1912 {
1913 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1914 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1915 {
1916 active_count--; /* Remove non-match possibility */
1917 next_active_state--;
1918 }
1919 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1920 }
1921 }
1922 break;
1923
1924 /*-----------------------------------------------------------------*/
1925#ifdef SUPPORT_UNICODE
1926 case OP_PROP_EXTRA + OP_TYPEEXACT:
1927 case OP_PROP_EXTRA + OP_TYPEUPTO:
1928 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1929 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1930 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1931 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1932 count = current_state->count; /* Number already matched */
1933 if (clen > 0)
1934 {
1935 BOOL OK;
1936 const uint32_t *cp;
1937 const ucd_record * prop = GET_UCD(c);
1938 switch(code[1 + IMM2_SIZE + 1])
1939 {
1940 case PT_ANY:
1941 OK = TRUE;
1942 break;
1943
1944 case PT_LAMP:
1945 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1946 prop->chartype == ucp_Lt;
1947 break;
1948
1949 case PT_GC:
1950 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1951 break;
1952
1953 case PT_PC:
1954 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1955 break;
1956
1957 case PT_SC:
1958 OK = prop->script == code[1 + IMM2_SIZE + 2];
1959 break;
1960
1961 /* These are specials for combination cases. */
1962
1963 case PT_ALNUM:
1964 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1965 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1966 break;
1967
1968 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1969 which means that Perl space and POSIX space are now identical. PCRE
1970 was changed at release 8.34. */
1971
1972 case PT_SPACE: /* Perl space */
1973 case PT_PXSPACE: /* POSIX space */
1974 switch(c)
1975 {
1976 HSPACE_CASES:
1977 VSPACE_CASES:
1978 OK = TRUE;
1979 break;
1980
1981 default:
1982 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1983 break;
1984 }
1985 break;
1986
1987 case PT_WORD:
1988 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1989 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1990 c == CHAR_UNDERSCORE;
1991 break;
1992
1993 case PT_CLIST:
1994 cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1995 for (;;)
1996 {
1997 if (c < *cp) { OK = FALSE; break; }
1998 if (c == *cp++) { OK = TRUE; break; }
1999 }
2000 break;
2001
2002 case PT_UCNC:
2003 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2004 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2005 c >= 0xe000;
2006 break;
2007
2008 /* Should never occur, but keep compilers from grumbling. */
2009
2010 default:
2011 OK = codevalue != OP_PROP;
2012 break;
2013 }
2014
2015 if (OK == (d == OP_PROP))
2016 {
2017 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
2018 {
2019 active_count--; /* Remove non-match possibility */
2020 next_active_state--;
2021 }
2022 if (++count >= (int)GET2(code, 1))
2023 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
2024 else
2025 { ADD_NEW(state_offset, count); }
2026 }
2027 }
2028 break;
2029
2030 /*-----------------------------------------------------------------*/
2031 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
2032 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
2033 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
2034 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
2035 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
2036 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2037 count = current_state->count; /* Number already matched */
2038 if (clen > 0)
2039 {
2040 PCRE2_SPTR nptr;
2041 int ncount = 0;
2042 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
2043 {
2044 active_count--; /* Remove non-match possibility */
2045 next_active_state--;
2046 }
2047 nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
2048 &ncount);
2049 if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2050 reset_could_continue = TRUE;
2051 if (++count >= (int)GET2(code, 1))
2052 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2053 else
2054 { ADD_NEW_DATA(-state_offset, count, ncount); }
2055 }
2056 break;
2057#endif
2058
2059 /*-----------------------------------------------------------------*/
2060 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
2061 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
2062 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
2063 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
2064 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
2065 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2066 count = current_state->count; /* Number already matched */
2067 if (clen > 0)
2068 {
2069 int ncount = 0;
2070 switch (c)
2071 {
2072 case CHAR_VT:
2073 case CHAR_FF:
2074 case CHAR_NEL:
2075#ifndef EBCDIC
2076 case 0x2028:
2077 case 0x2029:
2078#endif /* Not EBCDIC */
2079 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2080 goto ANYNL03;
2081
2082 case CHAR_CR:
2083 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
2084 /* Fall through */
2085
2086 ANYNL03:
2087 case CHAR_LF:
2088 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2089 {
2090 active_count--; /* Remove non-match possibility */
2091 next_active_state--;
2092 }
2093 if (++count >= (int)GET2(code, 1))
2094 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2095 else
2096 { ADD_NEW_DATA(-state_offset, count, ncount); }
2097 break;
2098
2099 default:
2100 break;
2101 }
2102 }
2103 break;
2104
2105 /*-----------------------------------------------------------------*/
2106 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2107 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2108 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2109 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2110 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2111 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2112 count = current_state->count; /* Number already matched */
2113 if (clen > 0)
2114 {
2115 BOOL OK;
2116 switch (c)
2117 {
2118 VSPACE_CASES:
2119 OK = TRUE;
2120 break;
2121
2122 default:
2123 OK = FALSE;
2124 }
2125
2126 if (OK == (d == OP_VSPACE))
2127 {
2128 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2129 {
2130 active_count--; /* Remove non-match possibility */
2131 next_active_state--;
2132 }
2133 if (++count >= (int)GET2(code, 1))
2134 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2135 else
2136 { ADD_NEW_DATA(-state_offset, count, 0); }
2137 }
2138 }
2139 break;
2140
2141 /*-----------------------------------------------------------------*/
2142 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2143 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2144 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2145 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2146 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2147 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2148 count = current_state->count; /* Number already matched */
2149 if (clen > 0)
2150 {
2151 BOOL OK;
2152 switch (c)
2153 {
2154 HSPACE_CASES:
2155 OK = TRUE;
2156 break;
2157
2158 default:
2159 OK = FALSE;
2160 break;
2161 }
2162
2163 if (OK == (d == OP_HSPACE))
2164 {
2165 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2166 {
2167 active_count--; /* Remove non-match possibility */
2168 next_active_state--;
2169 }
2170 if (++count >= (int)GET2(code, 1))
2171 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2172 else
2173 { ADD_NEW_DATA(-state_offset, count, 0); }
2174 }
2175 }
2176 break;
2177
2178/* ========================================================================== */
2179 /* These opcodes are followed by a character that is usually compared
2180 to the current subject character; it is loaded into d. We still get
2181 here even if there is no subject character, because in some cases zero
2182 repetitions are permitted. */
2183
2184 /*-----------------------------------------------------------------*/
2185 case OP_CHAR:
2186 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2187 break;
2188
2189 /*-----------------------------------------------------------------*/
2190 case OP_CHARI:
2191 if (clen == 0) break;
2192
2193#ifdef SUPPORT_UNICODE
2194 if (utf_or_ucp)
2195 {
2196 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2197 {
2198 unsigned int othercase;
2199 if (c < 128)
2200 othercase = fcc[c];
2201 else
2202 othercase = UCD_OTHERCASE(c);
2203 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2204 }
2205 }
2206 else
2207#endif /* SUPPORT_UNICODE */
2208 /* Not UTF or UCP mode */
2209 {
2210 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2211 { ADD_NEW(state_offset + 2, 0); }
2212 }
2213 break;
2214
2215
2216#ifdef SUPPORT_UNICODE
2217 /*-----------------------------------------------------------------*/
2218 /* This is a tricky one because it can match more than one character.
2219 Find out how many characters to skip, and then set up a negative state
2220 to wait for them to pass before continuing. */
2221
2222 case OP_EXTUNI:
2223 if (clen > 0)
2224 {
2225 int ncount = 0;
2226 PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject,
2227 end_subject, utf, &ncount);
2228 if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2229 reset_could_continue = TRUE;
2230 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2231 }
2232 break;
2233#endif
2234
2235 /*-----------------------------------------------------------------*/
2236 /* This is tricky, like EXTUNI, because it too can match more than one
2237 character (when CR is followed by LF). In this case, set up a negative
2238 state to wait for one character to pass before continuing. */
2239
2240 case OP_ANYNL:
2241 if (clen > 0) switch(c)
2242 {
2243 case CHAR_VT:
2244 case CHAR_FF:
2245 case CHAR_NEL:
2246#ifndef EBCDIC
2247 case 0x2028:
2248 case 0x2029:
2249#endif /* Not EBCDIC */
2250 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2251 /* Fall through */
2252
2253 case CHAR_LF:
2254 ADD_NEW(state_offset + 1, 0);
2255 break;
2256
2257 case CHAR_CR:
2258 if (ptr + 1 >= end_subject)
2259 {
2260 ADD_NEW(state_offset + 1, 0);
2261 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2262 reset_could_continue = TRUE;
2263 }
2264 else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2265 {
2266 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2267 }
2268 else
2269 {
2270 ADD_NEW(state_offset + 1, 0);
2271 }
2272 break;
2273 }
2274 break;
2275
2276 /*-----------------------------------------------------------------*/
2277 case OP_NOT_VSPACE:
2278 if (clen > 0) switch(c)
2279 {
2280 VSPACE_CASES:
2281 break;
2282
2283 default:
2284 ADD_NEW(state_offset + 1, 0);
2285 break;
2286 }
2287 break;
2288
2289 /*-----------------------------------------------------------------*/
2290 case OP_VSPACE:
2291 if (clen > 0) switch(c)
2292 {
2293 VSPACE_CASES:
2294 ADD_NEW(state_offset + 1, 0);
2295 break;
2296
2297 default:
2298 break;
2299 }
2300 break;
2301
2302 /*-----------------------------------------------------------------*/
2303 case OP_NOT_HSPACE:
2304 if (clen > 0) switch(c)
2305 {
2306 HSPACE_CASES:
2307 break;
2308
2309 default:
2310 ADD_NEW(state_offset + 1, 0);
2311 break;
2312 }
2313 break;
2314
2315 /*-----------------------------------------------------------------*/
2316 case OP_HSPACE:
2317 if (clen > 0) switch(c)
2318 {
2319 HSPACE_CASES:
2320 ADD_NEW(state_offset + 1, 0);
2321 break;
2322
2323 default:
2324 break;
2325 }
2326 break;
2327
2328 /*-----------------------------------------------------------------*/
2329 /* Match a negated single character casefully. */
2330
2331 case OP_NOT:
2332 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2333 break;
2334
2335 /*-----------------------------------------------------------------*/
2336 /* Match a negated single character caselessly. */
2337
2338 case OP_NOTI:
2339 if (clen > 0)
2340 {
2341 uint32_t otherd;
2342#ifdef SUPPORT_UNICODE
2343 if (utf_or_ucp && d >= 128)
2344 otherd = UCD_OTHERCASE(d);
2345 else
2346#endif /* SUPPORT_UNICODE */
2347 otherd = TABLE_GET(d, fcc, d);
2348 if (c != d && c != otherd)
2349 { ADD_NEW(state_offset + dlen + 1, 0); }
2350 }
2351 break;
2352
2353 /*-----------------------------------------------------------------*/
2354 case OP_PLUSI:
2355 case OP_MINPLUSI:
2356 case OP_POSPLUSI:
2357 case OP_NOTPLUSI:
2358 case OP_NOTMINPLUSI:
2359 case OP_NOTPOSPLUSI:
2360 caseless = TRUE;
2361 codevalue -= OP_STARI - OP_STAR;
2362
2363 /* Fall through */
2364 case OP_PLUS:
2365 case OP_MINPLUS:
2366 case OP_POSPLUS:
2367 case OP_NOTPLUS:
2368 case OP_NOTMINPLUS:
2369 case OP_NOTPOSPLUS:
2370 count = current_state->count; /* Already matched */
2371 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2372 if (clen > 0)
2373 {
2374 uint32_t otherd = NOTACHAR;
2375 if (caseless)
2376 {
2377#ifdef SUPPORT_UNICODE
2378 if (utf_or_ucp && d >= 128)
2379 otherd = UCD_OTHERCASE(d);
2380 else
2381#endif /* SUPPORT_UNICODE */
2382 otherd = TABLE_GET(d, fcc, d);
2383 }
2384 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2385 {
2386 if (count > 0 &&
2387 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2388 {
2389 active_count--; /* Remove non-match possibility */
2390 next_active_state--;
2391 }
2392 count++;
2393 ADD_NEW(state_offset, count);
2394 }
2395 }
2396 break;
2397
2398 /*-----------------------------------------------------------------*/
2399 case OP_QUERYI:
2400 case OP_MINQUERYI:
2401 case OP_POSQUERYI:
2402 case OP_NOTQUERYI:
2403 case OP_NOTMINQUERYI:
2404 case OP_NOTPOSQUERYI:
2405 caseless = TRUE;
2406 codevalue -= OP_STARI - OP_STAR;
2407 /* Fall through */
2408 case OP_QUERY:
2409 case OP_MINQUERY:
2410 case OP_POSQUERY:
2411 case OP_NOTQUERY:
2412 case OP_NOTMINQUERY:
2413 case OP_NOTPOSQUERY:
2414 ADD_ACTIVE(state_offset + dlen + 1, 0);
2415 if (clen > 0)
2416 {
2417 uint32_t otherd = NOTACHAR;
2418 if (caseless)
2419 {
2420#ifdef SUPPORT_UNICODE
2421 if (utf_or_ucp && d >= 128)
2422 otherd = UCD_OTHERCASE(d);
2423 else
2424#endif /* SUPPORT_UNICODE */
2425 otherd = TABLE_GET(d, fcc, d);
2426 }
2427 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2428 {
2429 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2430 {
2431 active_count--; /* Remove non-match possibility */
2432 next_active_state--;
2433 }
2434 ADD_NEW(state_offset + dlen + 1, 0);
2435 }
2436 }
2437 break;
2438
2439 /*-----------------------------------------------------------------*/
2440 case OP_STARI:
2441 case OP_MINSTARI:
2442 case OP_POSSTARI:
2443 case OP_NOTSTARI:
2444 case OP_NOTMINSTARI:
2445 case OP_NOTPOSSTARI:
2446 caseless = TRUE;
2447 codevalue -= OP_STARI - OP_STAR;
2448 /* Fall through */
2449 case OP_STAR:
2450 case OP_MINSTAR:
2451 case OP_POSSTAR:
2452 case OP_NOTSTAR:
2453 case OP_NOTMINSTAR:
2454 case OP_NOTPOSSTAR:
2455 ADD_ACTIVE(state_offset + dlen + 1, 0);
2456 if (clen > 0)
2457 {
2458 uint32_t otherd = NOTACHAR;
2459 if (caseless)
2460 {
2461#ifdef SUPPORT_UNICODE
2462 if (utf_or_ucp && d >= 128)
2463 otherd = UCD_OTHERCASE(d);
2464 else
2465#endif /* SUPPORT_UNICODE */
2466 otherd = TABLE_GET(d, fcc, d);
2467 }
2468 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2469 {
2470 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2471 {
2472 active_count--; /* Remove non-match possibility */
2473 next_active_state--;
2474 }
2475 ADD_NEW(state_offset, 0);
2476 }
2477 }
2478 break;
2479
2480 /*-----------------------------------------------------------------*/
2481 case OP_EXACTI:
2482 case OP_NOTEXACTI:
2483 caseless = TRUE;
2484 codevalue -= OP_STARI - OP_STAR;
2485 /* Fall through */
2486 case OP_EXACT:
2487 case OP_NOTEXACT:
2488 count = current_state->count; /* Number already matched */
2489 if (clen > 0)
2490 {
2491 uint32_t otherd = NOTACHAR;
2492 if (caseless)
2493 {
2494#ifdef SUPPORT_UNICODE
2495 if (utf_or_ucp && d >= 128)
2496 otherd = UCD_OTHERCASE(d);
2497 else
2498#endif /* SUPPORT_UNICODE */
2499 otherd = TABLE_GET(d, fcc, d);
2500 }
2501 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2502 {
2503 if (++count >= (int)GET2(code, 1))
2504 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2505 else
2506 { ADD_NEW(state_offset, count); }
2507 }
2508 }
2509 break;
2510
2511 /*-----------------------------------------------------------------*/
2512 case OP_UPTOI:
2513 case OP_MINUPTOI:
2514 case OP_POSUPTOI:
2515 case OP_NOTUPTOI:
2516 case OP_NOTMINUPTOI:
2517 case OP_NOTPOSUPTOI:
2518 caseless = TRUE;
2519 codevalue -= OP_STARI - OP_STAR;
2520 /* Fall through */
2521 case OP_UPTO:
2522 case OP_MINUPTO:
2523 case OP_POSUPTO:
2524 case OP_NOTUPTO:
2525 case OP_NOTMINUPTO:
2526 case OP_NOTPOSUPTO:
2527 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2528 count = current_state->count; /* Number already matched */
2529 if (clen > 0)
2530 {
2531 uint32_t otherd = NOTACHAR;
2532 if (caseless)
2533 {
2534#ifdef SUPPORT_UNICODE
2535 if (utf_or_ucp && d >= 128)
2536 otherd = UCD_OTHERCASE(d);
2537 else
2538#endif /* SUPPORT_UNICODE */
2539 otherd = TABLE_GET(d, fcc, d);
2540 }
2541 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2542 {
2543 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2544 {
2545 active_count--; /* Remove non-match possibility */
2546 next_active_state--;
2547 }
2548 if (++count >= (int)GET2(code, 1))
2549 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2550 else
2551 { ADD_NEW(state_offset, count); }
2552 }
2553 }
2554 break;
2555
2556
2557/* ========================================================================== */
2558 /* These are the class-handling opcodes */
2559
2560 case OP_CLASS:
2561 case OP_NCLASS:
2562 case OP_XCLASS:
2563 {
2564 BOOL isinclass = FALSE;
2565 int next_state_offset;
2566 PCRE2_SPTR ecode;
2567
2568 /* For a simple class, there is always just a 32-byte table, and we
2569 can set isinclass from it. */
2570
2571 if (codevalue != OP_XCLASS)
2572 {
2573 ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2574 if (clen > 0)
2575 {
2576 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2577 ((((uint8_t *)(code + 1))[c/8] & (1u << (c&7))) != 0);
2578 }
2579 }
2580
2581 /* An extended class may have a table or a list of single characters,
2582 ranges, or both, and it may be positive or negative. There's a
2583 function that sorts all this out. */
2584
2585 else
2586 {
2587 ecode = code + GET(code, 1);
2588 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2589 }
2590
2591 /* At this point, isinclass is set for all kinds of class, and ecode
2592 points to the byte after the end of the class. If there is a
2593 quantifier, this is where it will be. */
2594
2595 next_state_offset = (int)(ecode - start_code);
2596
2597 switch (*ecode)
2598 {
2599 case OP_CRSTAR:
2600 case OP_CRMINSTAR:
2601 case OP_CRPOSSTAR:
2602 ADD_ACTIVE(next_state_offset + 1, 0);
2603 if (isinclass)
2604 {
2605 if (*ecode == OP_CRPOSSTAR)
2606 {
2607 active_count--; /* Remove non-match possibility */
2608 next_active_state--;
2609 }
2610 ADD_NEW(state_offset, 0);
2611 }
2612 break;
2613
2614 case OP_CRPLUS:
2615 case OP_CRMINPLUS:
2616 case OP_CRPOSPLUS:
2617 count = current_state->count; /* Already matched */
2618 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2619 if (isinclass)
2620 {
2621 if (count > 0 && *ecode == OP_CRPOSPLUS)
2622 {
2623 active_count--; /* Remove non-match possibility */
2624 next_active_state--;
2625 }
2626 count++;
2627 ADD_NEW(state_offset, count);
2628 }
2629 break;
2630
2631 case OP_CRQUERY:
2632 case OP_CRMINQUERY:
2633 case OP_CRPOSQUERY:
2634 ADD_ACTIVE(next_state_offset + 1, 0);
2635 if (isinclass)
2636 {
2637 if (*ecode == OP_CRPOSQUERY)
2638 {
2639 active_count--; /* Remove non-match possibility */
2640 next_active_state--;
2641 }
2642 ADD_NEW(next_state_offset + 1, 0);
2643 }
2644 break;
2645
2646 case OP_CRRANGE:
2647 case OP_CRMINRANGE:
2648 case OP_CRPOSRANGE:
2649 count = current_state->count; /* Already matched */
2650 if (count >= (int)GET2(ecode, 1))
2651 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2652 if (isinclass)
2653 {
2654 int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2655
2656 if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1))
2657 {
2658 active_count--; /* Remove non-match possibility */
2659 next_active_state--;
2660 }
2661
2662 if (++count >= max && max != 0) /* Max 0 => no limit */
2663 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2664 else
2665 { ADD_NEW(state_offset, count); }
2666 }
2667 break;
2668
2669 default:
2670 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2671 break;
2672 }
2673 }
2674 break;
2675
2676/* ========================================================================== */
2677 /* These are the opcodes for fancy brackets of various kinds. We have
2678 to use recursion in order to handle them. The "always failing" assertion
2679 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2680 though the other "backtracking verbs" are not supported. */
2681
2682 case OP_FAIL:
2683 forced_fail++; /* Count FAILs for multiple states */
2684 break;
2685
2686 case OP_ASSERT:
2687 case OP_ASSERT_NOT:
2688 case OP_ASSERTBACK:
2689 case OP_ASSERTBACK_NOT:
2690 {
2691 int rc;
2692 int *local_workspace;
2693 PCRE2_SIZE *local_offsets;
2694 PCRE2_SPTR endasscode = code + GET(code, 1);
2695 RWS_anchor *rws = (RWS_anchor *)RWS;
2696
2697 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2698 {
2699 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2700 if (rc != 0) return rc;
2701 RWS = (int *)rws;
2702 }
2703
2704 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2705 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2706 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2707
2708 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2709
2710 rc = internal_dfa_match(
2711 mb, /* static match data */
2712 code, /* this subexpression's code */
2713 ptr, /* where we currently are */
2714 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2715 local_offsets, /* offset vector */
2716 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2717 local_workspace, /* workspace vector */
2718 RWS_RSIZE, /* size of same */
2719 rlevel, /* function recursion level */
2720 RWS); /* recursion workspace */
2721
2722 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2723
2724 if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2725 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2726 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2727 }
2728 break;
2729
2730 /*-----------------------------------------------------------------*/
2731 case OP_COND:
2732 case OP_SCOND:
2733 {
2734 int codelink = (int)GET(code, 1);
2735 PCRE2_UCHAR condcode;
2736
2737 /* Because of the way auto-callout works during compile, a callout item
2738 is inserted between OP_COND and an assertion condition. This does not
2739 happen for the other conditions. */
2740
2741 if (code[LINK_SIZE + 1] == OP_CALLOUT
2742 || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
2743 {
2744 PCRE2_SIZE callout_length;
2745 rrc = do_callout(code, offsets, current_subject, ptr, mb,
2746 1 + LINK_SIZE, &callout_length);
2747 if (rrc < 0) return rrc; /* Abandon */
2748 if (rrc > 0) break; /* Fail this thread */
2749 code += callout_length; /* Skip callout data */
2750 }
2751
2752 condcode = code[LINK_SIZE+1];
2753
2754 /* Back reference conditions and duplicate named recursion conditions
2755 are not supported */
2756
2757 if (condcode == OP_CREF || condcode == OP_DNCREF ||
2758 condcode == OP_DNRREF)
2759 return PCRE2_ERROR_DFA_UCOND;
2760
2761 /* The DEFINE condition is always false, and the assertion (?!) is
2762 converted to OP_FAIL. */
2763
2764 if (condcode == OP_FALSE || condcode == OP_FAIL)
2765 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2766
2767 /* There is also an always-true condition */
2768
2769 else if (condcode == OP_TRUE)
2770 { ADD_ACTIVE(state_offset + LINK_SIZE + 2, 0); }
2771
2772 /* The only supported version of OP_RREF is for the value RREF_ANY,
2773 which means "test if in any recursion". We can't test for specifically
2774 recursed groups. */
2775
2776 else if (condcode == OP_RREF)
2777 {
2778 unsigned int value = GET2(code, LINK_SIZE + 2);
2779 if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2780 if (mb->recursive != NULL)
2781 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2782 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2783 }
2784
2785 /* Otherwise, the condition is an assertion */
2786
2787 else
2788 {
2789 int rc;
2790 int *local_workspace;
2791 PCRE2_SIZE *local_offsets;
2792 PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2793 PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2794 RWS_anchor *rws = (RWS_anchor *)RWS;
2795
2796 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2797 {
2798 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2799 if (rc != 0) return rc;
2800 RWS = (int *)rws;
2801 }
2802
2803 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2804 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2805 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2806
2807 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2808
2809 rc = internal_dfa_match(
2810 mb, /* fixed match data */
2811 asscode, /* this subexpression's code */
2812 ptr, /* where we currently are */
2813 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2814 local_offsets, /* offset vector */
2815 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2816 local_workspace, /* workspace vector */
2817 RWS_RSIZE, /* size of same */
2818 rlevel, /* function recursion level */
2819 RWS); /* recursion workspace */
2820
2821 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2822
2823 if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2824 if ((rc >= 0) ==
2825 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2826 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2827 else
2828 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2829 }
2830 }
2831 break;
2832
2833 /*-----------------------------------------------------------------*/
2834 case OP_RECURSE:
2835 {
2836 int rc;
2837 int *local_workspace;
2838 PCRE2_SIZE *local_offsets;
2839 RWS_anchor *rws = (RWS_anchor *)RWS;
2840 dfa_recursion_info *ri;
2841 PCRE2_SPTR callpat = start_code + GET(code, 1);
2842 uint32_t recno = (callpat == mb->start_code)? 0 :
2843 GET2(callpat, 1 + LINK_SIZE);
2844
2845 if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE)
2846 {
2847 rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb);
2848 if (rc != 0) return rc;
2849 RWS = (int *)rws;
2850 }
2851
2852 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2853 local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE;
2854 rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
2855
2856 /* Check for repeating a recursion without advancing the subject
2857 pointer. This should catch convoluted mutual recursions. (Some simple
2858 cases are caught at compile time.) */
2859
2860 for (ri = mb->recursive; ri != NULL; ri = ri->prevrec)
2861 if (recno == ri->group_num && ptr == ri->subject_position)
2862 return PCRE2_ERROR_RECURSELOOP;
2863
2864 /* Remember this recursion and where we started it so as to
2865 catch infinite loops. */
2866
2867 new_recursive.group_num = recno;
2868 new_recursive.subject_position = ptr;
2869 new_recursive.prevrec = mb->recursive;
2870 mb->recursive = &new_recursive;
2871
2872 rc = internal_dfa_match(
2873 mb, /* fixed match data */
2874 callpat, /* this subexpression's code */
2875 ptr, /* where we currently are */
2876 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2877 local_offsets, /* offset vector */
2878 RWS_OVEC_RSIZE/OVEC_UNIT, /* size of same */
2879 local_workspace, /* workspace vector */
2880 RWS_RSIZE, /* size of same */
2881 rlevel, /* function recursion level */
2882 RWS); /* recursion workspace */
2883
2884 rws->free += RWS_RSIZE + RWS_OVEC_RSIZE;
2885 mb->recursive = new_recursive.prevrec; /* Done this recursion */
2886
2887 /* Ran out of internal offsets */
2888
2889 if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2890
2891 /* For each successful matched substring, set up the next state with a
2892 count of characters to skip before trying it. Note that the count is in
2893 characters, not bytes. */
2894
2895 if (rc > 0)
2896 {
2897 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2898 {
2899 PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];
2900#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2901 if (utf)
2902 {
2903 PCRE2_SPTR p = start_subject + local_offsets[rc];
2904 PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
2905 while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
2906 }
2907#endif
2908 if (charcount > 0)
2909 {
2910 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,
2911 (int)(charcount - 1));
2912 }
2913 else
2914 {
2915 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2916 }
2917 }
2918 }
2919 else if (rc != PCRE2_ERROR_NOMATCH) return rc;
2920 }
2921 break;
2922
2923 /*-----------------------------------------------------------------*/
2924 case OP_BRAPOS:
2925 case OP_SBRAPOS:
2926 case OP_CBRAPOS:
2927 case OP_SCBRAPOS:
2928 case OP_BRAPOSZERO:
2929 {
2930 int rc;
2931 int *local_workspace;
2932 PCRE2_SIZE *local_offsets;
2933 PCRE2_SIZE charcount, matched_count;
2934 PCRE2_SPTR local_ptr = ptr;
2935 RWS_anchor *rws = (RWS_anchor *)RWS;
2936 BOOL allow_zero;
2937
2938 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2939 {
2940 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2941 if (rc != 0) return rc;
2942 RWS = (int *)rws;
2943 }
2944
2945 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2946 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2947 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2948
2949 if (codevalue == OP_BRAPOSZERO)
2950 {
2951 allow_zero = TRUE;
2952 codevalue = *(++code); /* Codevalue will be one of above BRAs */
2953 }
2954 else allow_zero = FALSE;
2955
2956 /* Loop to match the subpattern as many times as possible as if it were
2957 a complete pattern. */
2958
2959 for (matched_count = 0;; matched_count++)
2960 {
2961 rc = internal_dfa_match(
2962 mb, /* fixed match data */
2963 code, /* this subexpression's code */
2964 local_ptr, /* where we currently are */
2965 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2966 local_offsets, /* offset vector */
2967 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2968 local_workspace, /* workspace vector */
2969 RWS_RSIZE, /* size of same */
2970 rlevel, /* function recursion level */
2971 RWS); /* recursion workspace */
2972
2973 /* Failed to match */
2974
2975 if (rc < 0)
2976 {
2977 if (rc != PCRE2_ERROR_NOMATCH) return rc;
2978 break;
2979 }
2980
2981 /* Matched: break the loop if zero characters matched. */
2982
2983 charcount = local_offsets[1] - local_offsets[0];
2984 if (charcount == 0) break;
2985 local_ptr += charcount; /* Advance temporary position ptr */
2986 }
2987
2988 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2989
2990 /* At this point we have matched the subpattern matched_count
2991 times, and local_ptr is pointing to the character after the end of the
2992 last match. */
2993
2994 if (matched_count > 0 || allow_zero)
2995 {
2996 PCRE2_SPTR end_subpattern = code;
2997 int next_state_offset;
2998
2999 do { end_subpattern += GET(end_subpattern, 1); }
3000 while (*end_subpattern == OP_ALT);
3001 next_state_offset =
3002 (int)(end_subpattern - start_code + LINK_SIZE + 1);
3003
3004 /* Optimization: if there are no more active states, and there
3005 are no new states yet set up, then skip over the subject string
3006 right here, to save looping. Otherwise, set up the new state to swing
3007 into action when the end of the matched substring is reached. */
3008
3009 if (i + 1 >= active_count && new_count == 0)
3010 {
3011 ptr = local_ptr;
3012 clen = 0;
3013 ADD_NEW(next_state_offset, 0);
3014 }
3015 else
3016 {
3017 PCRE2_SPTR p = ptr;
3018 PCRE2_SPTR pp = local_ptr;
3019 charcount = (PCRE2_SIZE)(pp - p);
3020#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3021 if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3022#endif
3023 ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3024 }
3025 }
3026 }
3027 break;
3028
3029 /*-----------------------------------------------------------------*/
3030 case OP_ONCE:
3031 {
3032 int rc;
3033 int *local_workspace;
3034 PCRE2_SIZE *local_offsets;
3035 RWS_anchor *rws = (RWS_anchor *)RWS;
3036
3037 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3038 {
3039 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3040 if (rc != 0) return rc;
3041 RWS = (int *)rws;
3042 }
3043
3044 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3045 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3046 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3047
3048 rc = internal_dfa_match(
3049 mb, /* fixed match data */
3050 code, /* this subexpression's code */
3051 ptr, /* where we currently are */
3052 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
3053 local_offsets, /* offset vector */
3054 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
3055 local_workspace, /* workspace vector */
3056 RWS_RSIZE, /* size of same */
3057 rlevel, /* function recursion level */
3058 RWS); /* recursion workspace */
3059
3060 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3061
3062 if (rc >= 0)
3063 {
3064 PCRE2_SPTR end_subpattern = code;
3065 PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];
3066 int next_state_offset, repeat_state_offset;
3067
3068 do { end_subpattern += GET(end_subpattern, 1); }
3069 while (*end_subpattern == OP_ALT);
3070 next_state_offset =
3071 (int)(end_subpattern - start_code + LINK_SIZE + 1);
3072
3073 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
3074 arrange for the repeat state also to be added to the relevant list.
3075 Calculate the offset, or set -1 for no repeat. */
3076
3077 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
3078 *end_subpattern == OP_KETRMIN)?
3079 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
3080
3081 /* If we have matched an empty string, add the next state at the
3082 current character pointer. This is important so that the duplicate
3083 checking kicks in, which is what breaks infinite loops that match an
3084 empty string. */
3085
3086 if (charcount == 0)
3087 {
3088 ADD_ACTIVE(next_state_offset, 0);
3089 }
3090
3091 /* Optimization: if there are no more active states, and there
3092 are no new states yet set up, then skip over the subject string
3093 right here, to save looping. Otherwise, set up the new state to swing
3094 into action when the end of the matched substring is reached. */
3095
3096 else if (i + 1 >= active_count && new_count == 0)
3097 {
3098 ptr += charcount;
3099 clen = 0;
3100 ADD_NEW(next_state_offset, 0);
3101
3102 /* If we are adding a repeat state at the new character position,
3103 we must fudge things so that it is the only current state.
3104 Otherwise, it might be a duplicate of one we processed before, and
3105 that would cause it to be skipped. */
3106
3107 if (repeat_state_offset >= 0)
3108 {
3109 next_active_state = active_states;
3110 active_count = 0;
3111 i = -1;
3112 ADD_ACTIVE(repeat_state_offset, 0);
3113 }
3114 }
3115 else
3116 {
3117#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3118 if (utf)
3119 {
3120 PCRE2_SPTR p = start_subject + local_offsets[0];
3121 PCRE2_SPTR pp = start_subject + local_offsets[1];
3122 while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3123 }
3124#endif
3125 ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3126 if (repeat_state_offset >= 0)
3127 { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }
3128 }
3129 }
3130 else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3131 }
3132 break;
3133
3134
3135/* ========================================================================== */
3136 /* Handle callouts */
3137
3138 case OP_CALLOUT:
3139 case OP_CALLOUT_STR:
3140 {
3141 PCRE2_SIZE callout_length;
3142 rrc = do_callout(code, offsets, current_subject, ptr, mb, 0,
3143 &callout_length);
3144 if (rrc < 0) return rrc; /* Abandon */
3145 if (rrc == 0)
3146 { ADD_ACTIVE(state_offset + (int)callout_length, 0); }
3147 }
3148 break;
3149
3150
3151/* ========================================================================== */
3152 default: /* Unsupported opcode */
3153 return PCRE2_ERROR_DFA_UITEM;
3154 }
3155
3156 NEXT_ACTIVE_STATE: continue;
3157
3158 } /* End of loop scanning active states */
3159
3160 /* We have finished the processing at the current subject character. If no
3161 new states have been set for the next character, we have found all the
3162 matches that we are going to find. If partial matching has been requested,
3163 check for appropriate conditions.
3164
  The "forced_fail" variable counts the number of (*F) encountered for the
3166 character. If it is equal to the original active_count (saved in
3167 workspace[1]) it means that (*F) was found on every active state. In this
3168 case we don't want to give a partial match.
3169
3170 The "could_continue" variable is true if a state could have continued but
3171 for the fact that the end of the subject was reached. */
3172
3173 if (new_count <= 0)
3174 {
3175 if (could_continue && /* Some could go on, and */
3176 forced_fail != workspace[1] && /* Not all forced fail & */
3177 ( /* either... */
3178 (mb->moptions & PCRE2_PARTIAL_HARD) != 0 /* Hard partial */
3179 || /* or... */
3180 ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 && /* Soft partial and */
3181 match_count < 0) /* no matches */
3182 ) && /* And... */
3183 (
3184 partial_newline || /* Either partial NL */
3185 ( /* or ... */
3186 ptr >= end_subject && /* End of subject and */
3187 ( /* either */
3188 ptr > mb->start_used_ptr || /* Inspected non-empty string */
3189 mb->allowemptypartial /* or pattern has lookbehind */
3190 ) /* or could match empty */
3191 )
3192 ))
3193 match_count = PCRE2_ERROR_PARTIAL;
3194 break; /* Exit from loop along the subject string */
3195 }
3196
3197 /* One or more states are active for the next character. */
3198
3199 ptr += clen; /* Advance to next subject character */
3200 } /* Loop to move along the subject string */
3201
3202/* Control gets here from "break" a few lines above. If we have a match and
3203PCRE2_ENDANCHORED is set, the match fails. */
3204
3205if (match_count >= 0 &&
3206 ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 &&
3207 ptr < end_subject)
3208 match_count = PCRE2_ERROR_NOMATCH;
3209
3210return match_count;
3211}
3212
3213
3214
3215/*************************************************
3216* Match a pattern using the DFA algorithm *
3217*************************************************/
3218
3219/* This function matches a compiled pattern to a subject string, using the
3220alternate matching algorithm that finds all matches at once.
3221
3222Arguments:
3223 code points to the compiled pattern
3224 subject subject string
3225 length length of subject string
3226 startoffset where to start matching in the subject
3227 options option bits
3228 match_data points to a match data structure
3229 gcontext points to a match context
3230 workspace pointer to workspace
3231 wscount size of workspace
3232
3233Returns: > 0 => number of match offset pairs placed in offsets
3234 = 0 => offsets overflowed; longest matches are present
3235 -1 => failed to match
3236 < -1 => some kind of unexpected problem
3237*/
3238
3239PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
3240pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3241 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3242 pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount)
3243{
3244int rc;
3245int was_zero_terminated = 0;
3246
3247const pcre2_real_code *re = (const pcre2_real_code *)code;
3248
3249PCRE2_SPTR start_match;
3250PCRE2_SPTR end_subject;
3251PCRE2_SPTR bumpalong_limit;
3252PCRE2_SPTR req_cu_ptr;
3253
3254BOOL utf, anchored, startline, firstline;
3255BOOL has_first_cu = FALSE;
3256BOOL has_req_cu = FALSE;
3257
3258#if PCRE2_CODE_UNIT_WIDTH == 8
3259BOOL memchr_not_found_first_cu = FALSE;
3260BOOL memchr_not_found_first_cu2 = FALSE;
3261#endif
3262
3263PCRE2_UCHAR first_cu = 0;
3264PCRE2_UCHAR first_cu2 = 0;
3265PCRE2_UCHAR req_cu = 0;
3266PCRE2_UCHAR req_cu2 = 0;
3267
3268const uint8_t *start_bits = NULL;
3269
3270/* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3271is used below, and it expects NLBLOCK to be defined as a pointer. */
3272
3273pcre2_callout_block cb;
3274dfa_match_block actual_match_block;
3275dfa_match_block *mb = &actual_match_block;
3276
3277/* Set up a starting block of memory for use during recursive calls to
3278internal_dfa_match(). By putting this on the stack, it minimizes resource use
3279in the case when it is not needed. If this is too small, more memory is
3280obtained from the heap. At the start of each block is an anchor structure.*/
3281
3282int base_recursion_workspace[RWS_BASE_SIZE];
3283RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace;
3284rws->next = NULL;
3285rws->size = RWS_BASE_SIZE;
3286rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;
3287
3288/* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
3289subject string. */
3290
3291if (length == PCRE2_ZERO_TERMINATED)
3292 {
3293 length = PRIV(strlen)(subject);
3294 was_zero_terminated = 1;
3295 }
3296
3297/* Plausibility checks */
3298
3299if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
3300if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3301 return PCRE2_ERROR_NULL;
3302if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3303if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3304
3305/* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
3306time. */
3307
3308if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
3309 ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
3310 return PCRE2_ERROR_BADOPTION;
3311
3312/* Invalid UTF support is not available for DFA matching. */
3313
3314if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0)
3315 return PCRE2_ERROR_DFA_UINVALID_UTF;
3316
3317/* Check that the first field in the block is the magic number. If it is not,
3318return with PCRE2_ERROR_BADMAGIC. */
3319
3320if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3321
3322/* Check the code unit width. */
3323
3324if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3325 return PCRE2_ERROR_BADMODE;
3326
3327/* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3328options variable for this function. Users of PCRE2 who are not calling the
3329function directly would like to have a way of setting these flags, in the same
3330way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
3331constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3332(*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be
3333transferred to the options for this function. The bits are guaranteed to be
3334adjacent, but do not have the same values. This bit of Boolean trickery assumes
3335that the match-time bits are not more significant than the flag bits. If by
3336accident this is not the case, a compile-time division by zero error will
3337occur. */
3338
3339#define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3340#define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3341options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
3342#undef FF
3343#undef OO
3344
3345/* If restarting after a partial match, do some sanity checks on the contents
3346of the workspace. */
3347
3348if ((options & PCRE2_DFA_RESTART) != 0)
3349 {
3350 if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3351 workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3352 return PCRE2_ERROR_DFA_BADRESTART;
3353 }
3354
3355/* Set some local values */
3356
3357utf = (re->overall_options & PCRE2_UTF) != 0;
3358start_match = subject + start_offset;
3359end_subject = subject + length;
3360req_cu_ptr = start_match - 1;
3361anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3362 (re->overall_options & PCRE2_ANCHORED) != 0;
3363
3364/* The "must be at the start of a line" flags are used in a loop when finding
3365where to start. */
3366
3367startline = (re->flags & PCRE2_STARTLINE) != 0;
3368firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
3369bumpalong_limit = end_subject;
3370
3371/* Initialize and set up the fixed fields in the callout block, with a pointer
3372in the match block. */
3373
3374mb->cb = &cb;
3375cb.version = 2;
3376cb.subject = subject;
3377cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
3378cb.callout_flags = 0;
3379cb.capture_top = 1; /* No capture support */
3380cb.capture_last = 0;
3381cb.mark = NULL; /* No (*MARK) support */
3382
3383/* Get data from the match context, if present, and fill in the remaining
3384fields in the match block. It is an error to set an offset limit without
3385setting the flag at compile time. */
3386
3387if (mcontext == NULL)
3388 {
3389 mb->callout = NULL;
3390 mb->memctl = re->memctl;
3391 mb->match_limit = PRIV(default_match_context).match_limit;
3392 mb->match_limit_depth = PRIV(default_match_context).depth_limit;
3393 mb->heap_limit = PRIV(default_match_context).heap_limit;
3394 }
3395else
3396 {
3397 if (mcontext->offset_limit != PCRE2_UNSET)
3398 {
3399 if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
3400 return PCRE2_ERROR_BADOFFSETLIMIT;
3401 bumpalong_limit = subject + mcontext->offset_limit;
3402 }
3403 mb->callout = mcontext->callout;
3404 mb->callout_data = mcontext->callout_data;
3405 mb->memctl = mcontext->memctl;
3406 mb->match_limit = mcontext->match_limit;
3407 mb->match_limit_depth = mcontext->depth_limit;
3408 mb->heap_limit = mcontext->heap_limit;
3409 }
3410
3411if (mb->match_limit > re->limit_match)
3412 mb->match_limit = re->limit_match;
3413
3414if (mb->match_limit_depth > re->limit_depth)
3415 mb->match_limit_depth = re->limit_depth;
3416
3417if (mb->heap_limit > re->limit_heap)
3418 mb->heap_limit = re->limit_heap;
3419
3420mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
3421 re->name_count * re->name_entry_size;
3422mb->tables = re->tables;
3423mb->start_subject = subject;
3424mb->end_subject = end_subject;
3425mb->start_offset = start_offset;
3426mb->allowemptypartial = (re->max_lookbehind > 0) ||
3427 (re->flags & PCRE2_MATCH_EMPTY) != 0;
3428mb->moptions = options;
3429mb->poptions = re->overall_options;
3430mb->match_call_count = 0;
3431mb->heap_used = 0;
3432
3433/* Process the \R and newline settings. */
3434
3435mb->bsr_convention = re->bsr_convention;
3436mb->nltype = NLTYPE_FIXED;
3437switch(re->newline_convention)
3438 {
3439 case PCRE2_NEWLINE_CR:
3440 mb->nllen = 1;
3441 mb->nl[0] = CHAR_CR;
3442 break;
3443
3444 case PCRE2_NEWLINE_LF:
3445 mb->nllen = 1;
3446 mb->nl[0] = CHAR_NL;
3447 break;
3448
3449 case PCRE2_NEWLINE_NUL:
3450 mb->nllen = 1;
3451 mb->nl[0] = CHAR_NUL;
3452 break;
3453
3454 case PCRE2_NEWLINE_CRLF:
3455 mb->nllen = 2;
3456 mb->nl[0] = CHAR_CR;
3457 mb->nl[1] = CHAR_NL;
3458 break;
3459
3460 case PCRE2_NEWLINE_ANY:
3461 mb->nltype = NLTYPE_ANY;
3462 break;
3463
3464 case PCRE2_NEWLINE_ANYCRLF:
3465 mb->nltype = NLTYPE_ANYCRLF;
3466 break;
3467
3468 default: return PCRE2_ERROR_INTERNAL;
3469 }
3470
3471/* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3472we must also check that a starting offset does not point into the middle of a
3473multiunit character. We check only the portion of the subject that is going to
3474be inspected during matching - from the offset minus the maximum back reference
3475to the given length. This saves time when a small part of a large subject is
3476being matched by the use of a starting offset. Note that the maximum lookbehind
3477is a number of characters, not code units. */
3478
3479#ifdef SUPPORT_UNICODE
3480if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3481 {
3482 PCRE2_SPTR check_subject = start_match; /* start_match includes offset */
3483
3484 if (start_offset > 0)
3485 {
3486#if PCRE2_CODE_UNIT_WIDTH != 32
3487 unsigned int i;
3488 if (start_match < end_subject && NOT_FIRSTCU(*start_match))
3489 return PCRE2_ERROR_BADUTFOFFSET;
3490 for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
3491 {
3492 check_subject--;
3493 while (check_subject > subject &&
3494#if PCRE2_CODE_UNIT_WIDTH == 8
3495 (*check_subject & 0xc0) == 0x80)
3496#else /* 16-bit */
3497 (*check_subject & 0xfc00) == 0xdc00)
3498#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3499 check_subject--;
3500 }
3501#else /* In the 32-bit library, one code unit equals one character. */
3502 check_subject -= re->max_lookbehind;
3503 if (check_subject < subject) check_subject = subject;
3504#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
3505 }
3506
3507 /* Validate the relevant portion of the subject. After an error, adjust the
3508 offset to be an absolute offset in the whole string. */
3509
3510 match_data->rc = PRIV(valid_utf)(check_subject,
3511 length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
3512 if (match_data->rc != 0)
3513 {
3514 match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
3515 return match_data->rc;
3516 }
3517 }
3518#endif /* SUPPORT_UNICODE */
3519
3520/* Set up the first code unit to match, if available. If there's no first code
3521unit there may be a bitmap of possible first characters. */
3522
3523if ((re->flags & PCRE2_FIRSTSET) != 0)
3524 {
3525 has_first_cu = TRUE;
3526 first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3527 if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3528 {
3529 first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3530#ifdef SUPPORT_UNICODE
3531#if PCRE2_CODE_UNIT_WIDTH == 8
3532 if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
3533 first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3534#else
3535 if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
3536 first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3537#endif
3538#endif /* SUPPORT_UNICODE */
3539 }
3540 }
3541else
3542 if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3543 start_bits = re->start_bitmap;
3544
3545/* There may be a "last known required code unit" set. */
3546
3547if ((re->flags & PCRE2_LASTSET) != 0)
3548 {
3549 has_req_cu = TRUE;
3550 req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3551 if ((re->flags & PCRE2_LASTCASELESS) != 0)
3552 {
3553 req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3554#ifdef SUPPORT_UNICODE
3555#if PCRE2_CODE_UNIT_WIDTH == 8
3556 if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
3557 req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3558#else
3559 if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
3560 req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3561#endif
3562#endif /* SUPPORT_UNICODE */
3563 }
3564 }
3565
3566/* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,
3567free the memory that was obtained. */
3568
3569if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
3570 {
3571 match_data->memctl.free((void *)match_data->subject,
3572 match_data->memctl.memory_data);
3573 match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
3574 }
3575
3576/* Fill in fields that are always returned in the match data. */
3577
3578match_data->code = re;
3579match_data->subject = NULL; /* Default for no match */
3580match_data->mark = NULL;
3581match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3582
3583/* Call the main matching function, looping for a non-anchored regex after a
3584failed match. If not restarting, perform certain optimizations at the start of
3585a match. */
3586
3587for (;;)
3588 {
3589 /* ----------------- Start of match optimizations ---------------- */
3590
3591 /* There are some optimizations that avoid running the match if a known
3592 starting point is not found, or if a known later code unit is not present.
3593 However, there is an option (settable at compile time) that disables
3594 these, for testing and for ensuring that all callouts do actually occur.
3595 The optimizations must also be avoided when restarting a DFA match. */
3596
3597 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
3598 (options & PCRE2_DFA_RESTART) == 0)
3599 {
3600 /* If firstline is TRUE, the start of the match is constrained to the first
3601 line of a multiline string. That is, the match must be before or at the
3602 first newline following the start of matching. Temporarily adjust
3603 end_subject so that we stop the optimization scans for a first code unit
3604 immediately after the first character of a newline (the first code unit can
3605 legitimately be a newline). If the match fails at the newline, later code
3606 breaks this loop. */
3607
3608 if (firstline)
3609 {
3610 PCRE2_SPTR t = start_match;
3611#ifdef SUPPORT_UNICODE
3612 if (utf)
3613 {
3614 while (t < end_subject && !IS_NEWLINE(t))
3615 {
3616 t++;
3617 ACROSSCHAR(t < end_subject, t, t++);
3618 }
3619 }
3620 else
3621#endif
3622 while (t < end_subject && !IS_NEWLINE(t)) t++;
3623 end_subject = t;
3624 }
3625
3626 /* Anchored: check the first code unit if one is recorded. This may seem
3627 pointless but it can help in detecting a no match case without scanning for
3628 the required code unit. */
3629
3630 if (anchored)
3631 {
3632 if (has_first_cu || start_bits != NULL)
3633 {
3634 BOOL ok = start_match < end_subject;
3635 if (ok)
3636 {
3637 PCRE2_UCHAR c = UCHAR21TEST(start_match);
3638 ok = has_first_cu && (c == first_cu || c == first_cu2);
3639 if (!ok && start_bits != NULL)
3640 {
3641#if PCRE2_CODE_UNIT_WIDTH != 8
3642 if (c > 255) c = 255;
3643#endif
3644 ok = (start_bits[c/8] & (1u << (c&7))) != 0;
3645 }
3646 }
3647 if (!ok) break;
3648 }
3649 }
3650
3651 /* Not anchored. Advance to a unique first code unit if there is one. In
3652 8-bit mode, the use of memchr() gives a big speed up, even though we have
3653 to call it twice in caseless mode, in order to find the earliest occurrence
3654 of the character in either of its cases. If a call to memchr() that
3655 searches the rest of the subject fails to find one case, remember that in
3656 order not to keep on repeating the search. This can make a huge difference
3657 when the strings are very long and only one case is present. */
3658
3659 else
3660 {
3661 if (has_first_cu)
3662 {
3663 if (first_cu != first_cu2) /* Caseless */
3664 {
3665#if PCRE2_CODE_UNIT_WIDTH != 8
3666 PCRE2_UCHAR smc;
3667 while (start_match < end_subject &&
3668 (smc = UCHAR21TEST(start_match)) != first_cu &&
3669 smc != first_cu2)
3670 start_match++;
3671
3672#else /* 8-bit code units */
3673 PCRE2_SPTR pp1 = NULL;
3674 PCRE2_SPTR pp2 = NULL;
3675 PCRE2_SIZE cu2size = end_subject - start_match;
3676
3677 if (!memchr_not_found_first_cu)
3678 {
3679 pp1 = memchr(start_match, first_cu, end_subject - start_match);
3680 if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
3681 else cu2size = pp1 - start_match;
3682 }
3683
3684 /* If pp1 is not NULL, we have arranged to search only as far as pp1,
3685 to see if the other case is earlier, so we can set "not found" only
3686 when both searches have returned NULL. */
3687
3688 if (!memchr_not_found_first_cu2)
3689 {
3690 pp2 = memchr(start_match, first_cu2, cu2size);
3691 memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
3692 }
3693
3694 if (pp1 == NULL)
3695 start_match = (pp2 == NULL)? end_subject : pp2;
3696 else
3697 start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
3698#endif
3699 }
3700
3701 /* The caseful case */
3702
3703 else
3704 {
3705#if PCRE2_CODE_UNIT_WIDTH != 8
3706 while (start_match < end_subject && UCHAR21TEST(start_match) !=
3707 first_cu)
3708 start_match++;
3709#else /* 8-bit code units */
3710 start_match = memchr(start_match, first_cu, end_subject - start_match);
3711 if (start_match == NULL) start_match = end_subject;
3712#endif
3713 }
3714
3715 /* If we can't find the required code unit, having reached the true end
3716 of the subject, break the bumpalong loop, to force a match failure,
3717 except when doing partial matching, when we let the next cycle run at
3718 the end of the subject. To see why, consider the pattern /(?<=abc)def/,
3719 which partially matches "abc", even though the string does not contain
3720 the starting character "d". If we have not reached the true end of the
3721 subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
3722 we also let the cycle run, because the matching string is legitimately
3723 allowed to start with the first code unit of a newline. */
3724
3725 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3726 start_match >= mb->end_subject)
3727 break;
3728 }
3729
3730 /* If there's no first code unit, advance to just after a linebreak for a
3731 multiline match if required. */
3732
3733 else if (startline)
3734 {
3735 if (start_match > mb->start_subject + start_offset)
3736 {
3737#ifdef SUPPORT_UNICODE
3738 if (utf)
3739 {
3740 while (start_match < end_subject && !WAS_NEWLINE(start_match))
3741 {
3742 start_match++;
3743 ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3744 }
3745 }
3746 else
3747#endif
3748 while (start_match < end_subject && !WAS_NEWLINE(start_match))
3749 start_match++;
3750
3751 /* If we have just passed a CR and the newline option is ANY or
3752 ANYCRLF, and we are now at a LF, advance the match position by one
3753 more code unit. */
3754
3755 if (start_match[-1] == CHAR_CR &&
3756 (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3757 start_match < end_subject &&
3758 UCHAR21TEST(start_match) == CHAR_NL)
3759 start_match++;
3760 }
3761 }
3762
3763 /* If there's no first code unit or a requirement for a multiline line
3764 start, advance to a non-unique first code unit if any have been
3765 identified. The bitmap contains only 256 bits. When code units are 16 or
3766 32 bits wide, all code units greater than 254 set the 255 bit. */
3767
3768 else if (start_bits != NULL)
3769 {
3770 while (start_match < end_subject)
3771 {
3772 uint32_t c = UCHAR21TEST(start_match);
3773#if PCRE2_CODE_UNIT_WIDTH != 8
3774 if (c > 255) c = 255;
3775#endif
3776 if ((start_bits[c/8] & (1u << (c&7))) != 0) break;
3777 start_match++;
3778 }
3779
3780 /* See comment above in first_cu checking about the next line. */
3781
3782 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3783 start_match >= mb->end_subject)
3784 break;
3785 }
3786 } /* End of first code unit handling */
3787
3788 /* Restore fudged end_subject */
3789
3790 end_subject = mb->end_subject;
3791
3792 /* The following two optimizations are disabled for partial matching. */
3793
3794 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
3795 {
3796 PCRE2_SPTR p;
3797
3798 /* The minimum matching length is a lower bound; no actual string of that
3799 length may actually match the pattern. Although the value is, strictly,
3800 in characters, we treat it as code units to avoid spending too much time
3801 in this optimization. */
3802
3803 if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT;
3804
3805 /* If req_cu is set, we know that that code unit must appear in the
3806 subject for the match to succeed. If the first code unit is set, req_cu
3807 must be later in the subject; otherwise the test starts at the match
3808 point. This optimization can save a huge amount of backtracking in
3809 patterns with nested unlimited repeats that aren't going to match.
3810 Writing separate code for cased/caseless versions makes it go faster, as
3811 does using an autoincrement and backing off on a match. As in the case of
3812 the first code unit, using memchr() in the 8-bit library gives a big
3813 speed up. Unlike the first_cu check above, we do not need to call
3814 memchr() twice in the caseless case because we only need to check for the
3815 presence of the character in either case, not find the first occurrence.
3816
3817 The search can be skipped if the code unit was found later than the
3818 current starting point in a previous iteration of the bumpalong loop.
3819
3820 HOWEVER: when the subject string is very, very long, searching to its end
3821 can take a long time, and give bad performance on quite ordinary
3822 patterns. This showed up when somebody was matching something like
3823 /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3824 sufficiently long, but it's worth searching a lot more for unanchored
3825 patterns. */
3826
3827 p = start_match + (has_first_cu? 1:0);
3828 if (has_req_cu && p > req_cu_ptr)
3829 {
3830 PCRE2_SIZE check_length = end_subject - start_match;
3831
3832 if (check_length < REQ_CU_MAX ||
3833 (!anchored && check_length < REQ_CU_MAX * 1000))
3834 {
3835 if (req_cu != req_cu2) /* Caseless */
3836 {
3837#if PCRE2_CODE_UNIT_WIDTH != 8
3838 while (p < end_subject)
3839 {
3840 uint32_t pp = UCHAR21INCTEST(p);
3841 if (pp == req_cu || pp == req_cu2) { p--; break; }
3842 }
3843#else /* 8-bit code units */
3844 PCRE2_SPTR pp = p;
3845 p = memchr(pp, req_cu, end_subject - pp);
3846 if (p == NULL)
3847 {
3848 p = memchr(pp, req_cu2, end_subject - pp);
3849 if (p == NULL) p = end_subject;
3850 }
3851#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
3852 }
3853
3854 /* The caseful case */
3855
3856 else
3857 {
3858#if PCRE2_CODE_UNIT_WIDTH != 8
3859 while (p < end_subject)
3860 {
3861 if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3862 }
3863
3864#else /* 8-bit code units */
3865 p = memchr(p, req_cu, end_subject - p);
3866 if (p == NULL) p = end_subject;
3867#endif
3868 }
3869
3870 /* If we can't find the required code unit, break the matching loop,
3871 forcing a match failure. */
3872
3873 if (p >= end_subject) break;
3874
3875 /* If we have found the required code unit, save the point where we
3876 found it, so that we don't search again next time round the loop if
3877 the start hasn't passed this code unit yet. */
3878
3879 req_cu_ptr = p;
3880 }
3881 }
3882 }
3883 }
3884
3885 /* ------------ End of start of match optimizations ------------ */
3886
3887 /* Give no match if we have passed the bumpalong limit. */
3888
3889 if (start_match > bumpalong_limit) break;
3890
3891 /* OK, now we can do the business */
3892
3893 mb->start_used_ptr = start_match;
3894 mb->last_used_ptr = start_match;
3895 mb->recursive = NULL;
3896
3897 rc = internal_dfa_match(
3898 mb, /* fixed match data */
3899 mb->start_code, /* this subexpression's code */
3900 start_match, /* where we currently are */
3901 start_offset, /* start offset in subject */
3902 match_data->ovector, /* offset vector */
3903 (uint32_t)match_data->oveccount * 2, /* actual size of same */
3904 workspace, /* workspace vector */
3905 (int)wscount, /* size of same */
3906 0, /* function recurse level */
3907 base_recursion_workspace); /* initial workspace for recursion */
3908
3909 /* Anything other than "no match" means we are done, always; otherwise, carry
3910 on only if not anchored. */
3911
3912 if (rc != PCRE2_ERROR_NOMATCH || anchored)
3913 {
3914 if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
3915 {
3916 match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
3917 match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
3918 }
3919 match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
3920 match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject);
3921 match_data->startchar = (PCRE2_SIZE)(start_match - subject);
3922 match_data->rc = rc;
3923
3924 if (rc >= 0 &&(options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
3925 {
3926 length = CU2BYTES(length + was_zero_terminated);
3927 match_data->subject = match_data->memctl.malloc(length,
3928 match_data->memctl.memory_data);
3929 if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
3930 memcpy((void *)match_data->subject, subject, length);
3931 match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
3932 }
3933 else
3934 {
3935 if (rc >= 0 || rc == PCRE2_ERROR_PARTIAL) match_data->subject = subject;
3936 }
3937 goto EXIT;
3938 }
3939
3940 /* Advance to the next subject character unless we are at the end of a line
3941 and firstline is set. */
3942
3943 if (firstline && IS_NEWLINE(start_match)) break;
3944 start_match++;
3945#ifdef SUPPORT_UNICODE
3946 if (utf)
3947 {
3948 ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3949 }
3950#endif
3951 if (start_match > end_subject) break;
3952
3953 /* If we have just passed a CR and we are now at a LF, and the pattern does
3954 not contain any explicit matches for \r or \n, and the newline option is CRLF
3955 or ANY or ANYCRLF, advance the match position by one more character. */
3956
3957 if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
3958 start_match < end_subject &&
3959 UCHAR21TEST(start_match) == CHAR_NL &&
3960 (re->flags & PCRE2_HASCRORLF) == 0 &&
3961 (mb->nltype == NLTYPE_ANY ||
3962 mb->nltype == NLTYPE_ANYCRLF ||
3963 mb->nllen == 2))
3964 start_match++;
3965
3966 } /* "Bumpalong" loop */
3967
3968NOMATCH_EXIT:
3969rc = PCRE2_ERROR_NOMATCH;
3970
3971EXIT:
3972while (rws->next != NULL)
3973 {
3974 RWS_anchor *next = rws->next;
3975 rws->next = next->next;
3976 mb->memctl.free(next, mb->memctl.memory_data);
3977 }
3978
3979return rc;
3980}
3981
3982/* End of pcre2_dfa_match.c */
3983