pcre2_dfa_match.c source code [Godot/thirdparty/pcre2/src/pcre2_dfa_match.c]

1	/*************************************************
2	* Perl-Compatible Regular Expressions *
3	*************************************************/
4
/* PCRE is a library of functions to support regular expressions whose syntax
6	and semantics are as close as possible to those of the Perl 5 language.
7
8	Written by Philip Hazel
9	Original API code Copyright (c) 1997-2012 University of Cambridge
10	New API code Copyright (c) 2016-2022 University of Cambridge
11
12	-----------------------------------------------------------------------------
13	Redistribution and use in source and binary forms, with or without
14	modification, are permitted provided that the following conditions are met:
15
16	* Redistributions of source code must retain the above copyright notice,
17	this list of conditions and the following disclaimer.
18
19	* Redistributions in binary form must reproduce the above copyright
20	notice, this list of conditions and the following disclaimer in the
21	documentation and/or other materials provided with the distribution.
22
23	* Neither the name of the University of Cambridge nor the names of its
24	contributors may be used to endorse or promote products derived from
25	this software without specific prior written permission.
26
27	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37	POSSIBILITY OF SUCH DAMAGE.
38	-----------------------------------------------------------------------------
39	*/
40
41
/* This module contains the external function pcre2_dfa_match(), which is an
alternative matching function that uses a sort of DFA algorithm (not a true
FSM). This is NOT Perl-compatible, but it has advantages in certain
applications. */
46
47
/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
the performance of his patterns greatly. I could not use it as it stood, as it
was not thread safe, and made assumptions about pattern sizes. Also, it caused
test 7 to loop, and test 9 to crash with a segfault.

The issue is the check for duplicate states, which is done by a simple linear
search up the state list. (Grep for "duplicate" below to find the code.) For
many patterns, there will never be many states active at one time, so a simple
linear search is fine. In patterns that have many active states, it might be a
bottleneck. The suggested code used an indexing scheme to remember which states
had previously been used for each character, and avoided the linear search when
it knew there was no chance of a duplicate. This was implemented when adding
states to the state lists.

I wrote some thread-safe, not-limited code to try something similar at the time
of checking for duplicates (instead of when adding states), using index vectors
on the stack. It did give a 13% improvement with one specially constructed
pattern for certain subject strings, but on other strings and on many of the
simpler patterns in the test suite it did worse. The major problem, I think,
was the extra time to initialize the index. This had to be done for each call
of internal_dfa_match(). (The supplied patch used a static vector, initialized
only once - I suspect this was the cause of the problems with the tests.)

Overall, I concluded that the gains in some cases did not outweigh the losses
in others, so I abandoned this code. */
73
74
75	#ifdef HAVE_CONFIG_H
76	#include "config.h"
77	#endif
78
79	#define NLBLOCK mb /* Block containing newline information */
80	#define PSSTART start_subject /* Field containing processed string start */
81	#define PSEND end_subject /* Field containing processed string end */
82
83	#include "pcre2_internal.h"
84
/* Bit mask built from the match-time option bits listed below. */

#define PUBLIC_DFA_MATCH_OPTIONS \
  (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
   PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
   PCRE2_PARTIAL_SOFT|PCRE2_DFA_SHORTEST|PCRE2_DFA_RESTART| \
   PCRE2_COPY_MATCHED_SUBJECT)
90
91
/*************************************************
*      Code parameters and static tables         *
*************************************************/

/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
into others, under special conditions. A gap of 20 between the blocks should be
enough. The resulting opcodes don't have to be less than 256 because they are
never stored, so we push them well clear of the normal opcodes. */

#define OP_PROP_EXTRA       300
#define OP_EXTUNI_EXTRA     320
#define OP_ANYNL_EXTRA      340
#define OP_HSPACE_EXTRA     360
#define OP_VSPACE_EXTRA     380
106
107
108	/ This table identifies those opcodes that are followed immediately by a*
109	character that is to be tested in some way. This makes it possible to
110	centralize the loading of these characters. In the case of Type etc, the*
111	"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112	small value. Non-zero values in the table are the offsets from the opcode where
113	the character is to be found. NOTE* If the start of this table is*
114	modified, the three tables that follow must also be modified. /*
115
116	static const uint8_t coptable[] = {
117	`0`, / End /
118	`0`, `0`, `0`, `0`, `0`, / \A, \G, \K, \B, \b /
119	`0`, `0`, `0`, `0`, `0`, `0`, / \D, \d, \S, \s, \W, \w /
120	`0`, `0`, `0`, / Any, AllAny, Anybyte /
121	`0`, `0`, / \P, \p /
122	`0`, `0`, `0`, `0`, `0`, / \R, \H, \h, \V, \v /
123	`0`, / \X /
124	`0`, `0`, `0`, `0`, `0`, `0`, / \Z, \z, $, $M, ^, ^M /
125	`1`, / Char /
126	`1`, / Chari /
127	`1`, / not /
128	`1`, / noti /
129	/ Positive single-char repeats /
130	`1`, `1`, `1`, `1`, `1`, `1`, / , ?, +, +?, ?, ?? /
131	`1`+IMM2_SIZE, `1`+IMM2_SIZE, / upto, minupto /
132	`1`+IMM2_SIZE, / exact /
133	`1`, `1`, `1`, `1`+IMM2_SIZE, / +, ++, ?+, upto+ /*
134	`1`, `1`, `1`, `1`, `1`, `1`, / I, ?I, +I, +?I, ?I, ??I /
135	`1`+IMM2_SIZE, `1`+IMM2_SIZE, / upto I, minupto I /
136	`1`+IMM2_SIZE, / exact I /
137	`1`, `1`, `1`, `1`+IMM2_SIZE, / +I, ++I, ?+I, upto+I /*
138	/ Negative single-char repeats - only for chars < 256 /
139	`1`, `1`, `1`, `1`, `1`, `1`, / NOT , ?, +, +?, ?, ?? /
140	`1`+IMM2_SIZE, `1`+IMM2_SIZE, / NOT upto, minupto /
141	`1`+IMM2_SIZE, / NOT exact /
142	`1`, `1`, `1`, `1`+IMM2_SIZE, / NOT +, ++, ?+, upto+ /*
143	`1`, `1`, `1`, `1`, `1`, `1`, / NOT I, ?I, +I, +?I, ?I, ??I /
144	`1`+IMM2_SIZE, `1`+IMM2_SIZE, / NOT upto I, minupto I /
145	`1`+IMM2_SIZE, / NOT exact I /
146	`1`, `1`, `1`, `1`+IMM2_SIZE, / NOT +I, ++I, ?+I, upto+I /*
147	/ Positive type repeats /
148	`1`, `1`, `1`, `1`, `1`, `1`, / Type , ?, +, +?, ?, ?? /
149	`1`+IMM2_SIZE, `1`+IMM2_SIZE, / Type upto, minupto /
150	`1`+IMM2_SIZE, / Type exact /
151	`1`, `1`, `1`, `1`+IMM2_SIZE, / Type +, ++, ?+, upto+ /*
152	/ Character class & ref repeats /
153	`0`, `0`, `0`, `0`, `0`, `0`, / , ?, +, +?, ?, ?? /
154	`0`, `0`, / CRRANGE, CRMINRANGE /
155	`0`, `0`, `0`, `0`, / Possessive +, ++, ?+, CRPOSRANGE /*
156	`0`, / CLASS /
157	`0`, / NCLASS /
158	`0`, / XCLASS - variable length /
159	`0`, / REF /
160	`0`, / REFI /
161	`0`, / DNREF /
162	`0`, / DNREFI /
163	`0`, / RECURSE /
164	`0`, / CALLOUT /
165	`0`, / CALLOUT_STR /
166	`0`, / Alt /
167	`0`, / Ket /
168	`0`, / KetRmax /
169	`0`, / KetRmin /
170	`0`, / KetRpos /
171	`0`, / Reverse /
172	`0`, / Assert /
173	`0`, / Assert not /
174	`0`, / Assert behind /
175	`0`, / Assert behind not /
176	`0`, / NA assert /
177	`0`, / NA assert behind /
178	`0`, / ONCE /
179	`0`, / SCRIPT_RUN /
180	`0`, `0`, `0`, `0`, `0`, / BRA, BRAPOS, CBRA, CBRAPOS, COND /
181	`0`, `0`, `0`, `0`, `0`, / SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND /
182	`0`, `0`, / CREF, DNCREF /
183	`0`, `0`, / RREF, DNRREF /
184	`0`, `0`, / FALSE, TRUE /
185	`0`, `0`, `0`, / BRAZERO, BRAMINZERO, BRAPOSZERO /
186	`0`, `0`, `0`, / MARK, PRUNE, PRUNE_ARG /
187	`0`, `0`, `0`, `0`, / SKIP, SKIP_ARG, THEN, THEN_ARG /
188	`0`, `0`, / COMMIT, COMMIT_ARG /
189	`0`, `0`, `0`, / FAIL, ACCEPT, ASSERT_ACCEPT /
190	`0`, `0`, `0` / CLOSE, SKIPZERO, DEFINE /
191	};
192
/* This table identifies those opcodes that inspect a character. It is used to
remember the fact that a character could have been inspected when the end of
the subject is reached. ***NOTE*** If the start of this table is modified, the
two tables that follow must also be modified. */

static const uint8_t poptable[] = {
  0,                             /* End                                    */
  0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
  1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
  1, 1, 1,                       /* Any, AllAny, Anybyte                   */
  1, 1,                          /* \P, \p                                 */
  1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
  1,                             /* \X                                     */
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
  1,                             /* Char                                   */
  1,                             /* Chari                                  */
  1,                             /* not                                    */
  1,                             /* noti                                   */
  /* Positive single-char repeats                                          */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1, 1,                       /* upto, minupto, exact                   */
  1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
  1, 1, 1,                       /* upto I, minupto I, exact I             */
  1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
  /* Negative single-char repeats - only for chars < 256                   */
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  1, 1, 1,                       /* NOT upto, minupto, exact               */
  1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
  1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
  1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
  /* Positive type repeats                                                 */
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  1, 1, 1,                       /* Type upto, minupto, exact              */
  1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
  /* Character class & ref repeats                                         */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1,                          /* CRRANGE, CRMINRANGE                    */
  1, 1, 1, 1,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
  1,                             /* CLASS                                  */
  1,                             /* NCLASS                                 */
  1,                             /* XCLASS - variable length               */
  0,                             /* REF                                    */
  0,                             /* REFI                                   */
  0,                             /* DNREF                                  */
  0,                             /* DNREFI                                 */
  0,                             /* RECURSE                                */
  0,                             /* CALLOUT                                */
  0,                             /* CALLOUT_STR                            */
  0,                             /* Alt                                    */
  0,                             /* Ket                                    */
  0,                             /* KetRmax                                */
  0,                             /* KetRmin                                */
  0,                             /* KetRpos                                */
  0,                             /* Reverse                                */
  0,                             /* Assert                                 */
  0,                             /* Assert not                             */
  0,                             /* Assert behind                          */
  0,                             /* Assert behind not                      */
  0,                             /* NA assert                              */
  0,                             /* NA assert behind                       */
  0,                             /* ONCE                                   */
  0,                             /* SCRIPT_RUN                             */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  0, 0,                          /* CREF, DNCREF                           */
  0, 0,                          /* RREF, DNRREF                           */
  0, 0,                          /* FALSE, TRUE                            */
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
  0, 0,                          /* COMMIT, COMMIT_ARG                     */
  0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT            */
  0, 0, 0                        /* CLOSE, SKIPZERO, DEFINE                */
};
269
270	/ These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,*
271	and \w /*
272
273	static const uint8_t toptable1[] = {
274	`0`, `0`, `0`, `0`, `0`, `0`,
275	ctype_digit, ctype_digit,
276	ctype_space, ctype_space,
277	ctype_word, ctype_word,
278	`0`, `0` / OP_ANY, OP_ALLANY /
279	};
280
281	static const uint8_t toptable2[] = {
282	`0`, `0`, `0`, `0`, `0`, `0`,
283	ctype_digit, `0`,
284	ctype_space, `0`,
285	ctype_word, `0`,
286	`1`, `1` / OP_ANY, OP_ALLANY /
287	};
288
289
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode (-ve has meaning) */
  int count;                      /* Count for repeats */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints occupied by one stateblock in the workspace vector. */

#define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
302
303
/* Before version 10.32 the recursive calls of internal_dfa_match() were passed
local working space and output vectors that were created on the stack. This has
caused issues for some patterns, especially in small-stack environments such as
Windows. A new scheme is now in use which sets up a vector on the stack, but if
this is too small, heap memory is used, up to the heap_limit. The main
parameters are all numbers of ints because the workspace is a vector of ints.

The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is
defined in pcre2_internal.h so as to be available to pcre2test when it is
finding the minimum heap requirement for a match. */

#define OVEC_UNIT  (sizeof(PCRE2_SIZE)/sizeof(int))

#define RWS_BASE_SIZE   (DFA_START_RWS_SIZE/sizeof(int))  /* Stack vector */
#define RWS_RSIZE       1000                    /* Work size for recursion */
#define RWS_OVEC_RSIZE  (1000*OVEC_UNIT)        /* Ovector for recursion */
#define RWS_OVEC_OSIZE  (2*OVEC_UNIT)           /* Ovector in other cases */

/* This structure is at the start of each workspace block. */

typedef struct RWS_anchor {
  struct RWS_anchor *next;
  uint32_t size;  /* Number of ints */
  uint32_t free;  /* Number of ints */
} RWS_anchor;

/* Size of the anchor itself, in ints. */

#define RWS_ANCHOR_SIZE (sizeof(RWS_anchor)/sizeof(int))
331
332
333
334	/*************************************************
335	* Process a callout *
336	*************************************************/
337
338	/ This function is called to perform a callout.*
339
340	Arguments:
341	code current code pointer
342	offsets points to current capture offsets
343	current_subject start of current subject match
344	ptr current position in subject
345	mb the match block
346	extracode extra code offset when called from condition
347	lengthptr where to return the callout length
348
349	Returns: the return from the callout
350	*/
351
352	static int
353	do_callout_dfa(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
354	PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
355	PCRE2_SIZE *lengthptr)
356	{
357	pcre2_callout_block *cb = mb->cb;
358
359	*lengthptr = (code[extracode] == OP_CALLOUT)?
360	(PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
361	(PCRE2_SIZE)GET(code, `1` + `2`*LINK_SIZE + extracode);
362
363	if (mb->callout == NULL) return `0`; / No callout provided /
364
365	/ Fixed fields in the callout block are set once and for all at the start of*
366	matching. /*
367
368	cb->offset_vector = offsets;
369	cb->start_match = (PCRE2_SIZE)(current_subject - mb->start_subject);
370	cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject);
371	cb->pattern_position = GET(code, `1` + extracode);
372	cb->next_item_length = GET(code, `1` + LINK_SIZE + extracode);
373
374	if (code[extracode] == OP_CALLOUT)
375	{
376	cb->callout_number = code[`1` + `2`*LINK_SIZE + extracode];
377	cb->callout_string_offset = `0`;
378	cb->callout_string = NULL;
379	cb->callout_string_length = `0`;
380	}
381	else
382	{
383	cb->callout_number = `0`;
384	cb->callout_string_offset = GET(code, `1` + `3`*LINK_SIZE + extracode);
385	cb->callout_string = code + (`1` + `4`*LINK_SIZE + extracode) + `1`;
386	cb->callout_string_length = lengthptr - (`1` + `4`LINK_SIZE) - `2`;
387	}
388
389	return (mb->callout)(cb, mb->callout_data);
390	}
391
392
393
394	/*************************************************
395	* Expand local workspace memory *
396	*************************************************/
397
398	/ This function is called when internal_dfa_match() is about to be called*
399	recursively and there is insufficient working space left in the current
400	workspace block. If there's an existing next block, use it; otherwise get a new
401	block unless the heap limit is reached.
402
403	Arguments:
404	rwsptr pointer to block pointer (updated)
405	ovecsize space needed for an ovector
406	mb the match block
407
408	Returns: 0 rwsptr has been updated
409	!0 an error code
410	*/
411
412	static int
413	more_workspace(RWS_anchor *rwsptr, unsigned* int ovecsize, dfa_match_block *mb)
414	{
415	RWS_anchor rws = rwsptr;
416	RWS_anchor *new;
417
418	if (rws->next != NULL)
419	{
420	new = rws->next;
421	}
422
423	/ Sizes in the RWS_anchor blocks are in units of sizeof(int), but*
424	mb->heap_limit and mb->heap_used are in kibibytes. Play carefully, to avoid
425	overflow. /*
426
427	else
428	{
429	uint32_t newsize = (rws->size >= UINT32_MAX/`2`)? UINT32_MAX/`2` : rws->size * `2`;
430	uint32_t newsizeK = newsize/(`1024`/sizeof(int));
431
432	if (newsizeK + mb->heap_used > mb->heap_limit)
433	newsizeK = (uint32_t)(mb->heap_limit - mb->heap_used);
434	newsize = newsizeK(`1024`/sizeof(int*));
435
436	if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE)
437	return PCRE2_ERROR_HEAPLIMIT;
438	new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data);
439	if (new == NULL) return PCRE2_ERROR_NOMEMORY;
440	mb->heap_used += newsizeK;
441	new->next = NULL;
442	new->size = newsize;
443	rws->next = new;
444	}
445
446	new->free = new->size - RWS_ANCHOR_SIZE;
447	*rwsptr = new;
448	return `0`;
449	}
450
451
452
/*************************************************
*     Match a Regular Expression - DFA engine    *
*************************************************/

/* This internal function applies a compiled pattern to a subject string,
starting at a given point, using a DFA engine. This function is called from the
external one, possibly multiple times if the pattern is not anchored. The
function calls itself recursively for some kinds of subpattern.

Arguments:
  mb                the match_data block with fixed information
  this_start_code   the opening bracket of this subexpression's code
  current_subject   where we currently are in the subject string
  start_offset      start offset in the subject string
  offsets           vector to contain the matching string offsets
  offsetcount       size of same
  workspace         vector of workspace
  wscount           size of same
  rlevel            function call recursion level

Returns:            > 0 => number of match offset pairs placed in offsets
                    = 0 => offsets overflowed; longest matches are present
                     -1 => failed to match
                   < -1 => some kind of unexpected problem

The following macros are used for adding states to the two state vectors (one
for the current character, one for the following character). Each macro
expands to an if/else statement that makes the enclosing function return
PCRE2_ERROR_DFA_WSSIZE when the relevant count has reached wscount. The
expansions refer to the enclosing function's active_count, new_count, wscount,
next_active_state and next_new_state variables. */

#define ADD_ACTIVE(x,y) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state++; \
    } \
  else return PCRE2_ERROR_DFA_WSSIZE

#define ADD_ACTIVE_DATA(x,y,z) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->data   = (z); \
    next_active_state++; \
    } \
  else return PCRE2_ERROR_DFA_WSSIZE

#define ADD_NEW(x,y) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state++; \
    } \
  else return PCRE2_ERROR_DFA_WSSIZE

#define ADD_NEW_DATA(x,y,z) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->data   = (z); \
    next_new_state++; \
    } \
  else return PCRE2_ERROR_DFA_WSSIZE

/* And now, here is the code */
520
521	static int
522	internal_dfa_match(
523	dfa_match_block *mb,
524	PCRE2_SPTR this_start_code,
525	PCRE2_SPTR current_subject,
526	PCRE2_SIZE start_offset,
527	PCRE2_SIZE *offsets,
528	uint32_t offsetcount,
529	int *workspace,
530	int wscount,
531	uint32_t rlevel,
532	int *RWS)
533	{
534	stateblock active_states, new_states, *temp_states;
535	stateblock next_active_state, next_new_state;
536	const uint8_t ctypes, lcc, *fcc;
537	PCRE2_SPTR ptr;
538	PCRE2_SPTR end_code;
539	dfa_recursion_info new_recursive;
540	int active_count, new_count, match_count;
541
542	/ Some fields in the mb block are frequently referenced, so we load them into*
543	independent variables in the hope that this will perform better. /*
544
545	PCRE2_SPTR start_subject = mb->start_subject;
546	PCRE2_SPTR end_subject = mb->end_subject;
547	PCRE2_SPTR start_code = mb->start_code;
548
549	#ifdef SUPPORT_UNICODE
550	BOOL utf = (mb->poptions & PCRE2_UTF) != `0`;
551	BOOL utf_or_ucp = utf \|\| (mb->poptions & PCRE2_UCP) != `0`;
552	#else
553	BOOL utf = FALSE;
554	#endif
555
556	BOOL reset_could_continue = FALSE;
557
558	if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
559	if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
560	offsetcount &= (uint32_t)(-`2`); / Round down /
561
562	wscount -= `2`;
563	wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * `2`))) /
564	(`2` * INTS_PER_STATEBLOCK);
565
566	ctypes = mb->tables + ctypes_offset;
567	lcc = mb->tables + lcc_offset;
568	fcc = mb->tables + fcc_offset;
569
570	match_count = PCRE2_ERROR_NOMATCH; / A negative number /
571
572	active_states = (stateblock *)(workspace + `2`);
573	next_new_state = new_states = active_states + wscount;
574	new_count = `0`;
575
576	/ The first thing in any (sub) pattern is a bracket of some sort. Push all*
577	the alternative states onto the list, and find out where the end is. This
578	makes is possible to use this function recursively, when we want to stop at a
579	matching internal ket rather than at the end.
580
581	If we are dealing with a backward assertion we have to find out the maximum
582	amount to move back, and set up each alternative appropriately. /*
583
584	if (this_start_code == OP_ASSERTBACK \|\| this_start_code == OP_ASSERTBACK_NOT)
585	{
586	size_t max_back = `0`;
587	size_t gone_back;
588
589	end_code = this_start_code;
590	do
591	{
592	size_t back = (size_t)GET(end_code, `2`+LINK_SIZE);
593	if (back > max_back) max_back = back;
594	end_code += GET(end_code, `1`);
595	}
596	while (*end_code == OP_ALT);
597
598	/ If we can't go back the amount required for the longest lookbehind*
599	pattern, go back as far as we can; some alternatives may still be viable. /*
600
601	#ifdef SUPPORT_UNICODE
602	/ In character mode we have to step back character by character /
603
604	if (utf)
605	{
606	for (gone_back = `0`; gone_back < max_back; gone_back++)
607	{
608	if (current_subject <= start_subject) break;
609	current_subject--;
610	ACROSSCHAR(current_subject > start_subject, current_subject,
611	current_subject--);
612	}
613	}
614	else
615	#endif
616
617	/ In byte-mode we can do this quickly. /
618
619	{
620	size_t current_offset = (size_t)(current_subject - start_subject);
621	gone_back = (current_offset < max_back)? current_offset : max_back;
622	current_subject -= gone_back;
623	}
624
625	/ Save the earliest consulted character /
626
627	if (current_subject < mb->start_used_ptr)
628	mb->start_used_ptr = current_subject;
629
630	/ Now we can process the individual branches. There will be an OP_REVERSE at*
631	the start of each branch, except when the length of the branch is zero. /*
632
633	end_code = this_start_code;
634	do
635	{
636	uint32_t revlen = (end_code[`1`+LINK_SIZE] == OP_REVERSE)? `1` + LINK_SIZE : `0`;
637	size_t back = (revlen == `0`)? `0` : (size_t)GET(end_code, `2`+LINK_SIZE);
638	if (back <= gone_back)
639	{
640	int bstate = (int)(end_code - start_code + `1` + LINK_SIZE + revlen);
641	ADD_NEW_DATA(-bstate, `0`, (int)(gone_back - back));
642	}
643	end_code += GET(end_code, `1`);
644	}
645	while (*end_code == OP_ALT);
646	}
647
648	/ This is the code for a "normal" subpattern (not a backward assertion). The*
649	start of a whole pattern is always one of these. If we are at the top level,
650	we may be asked to restart matching from the same point that we reached for a
651	previous partial match. We still have to scan through the top-level branches to
652	find the end state. /*
653
654	else
655	{
656	end_code = this_start_code;
657
658	/ Restarting /
659
660	if (rlevel == `1` && (mb->moptions & PCRE2_DFA_RESTART) != `0`)
661	{
662	do { end_code += GET(end_code, `1`); } while (*end_code == OP_ALT);
663	new_count = workspace[`1`];
664	if (!workspace[`0`])
665	memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
666	}
667
668	/ Not restarting /
669
670	else
671	{
672	int length = `1` + LINK_SIZE +
673	((this_start_code == OP_CBRA \|\| this_start_code == OP_SCBRA \|\|
674	this_start_code == OP_CBRAPOS \|\| this_start_code == OP_SCBRAPOS)
675	? IMM2_SIZE:`0`);
676	do
677	{
678	ADD_NEW((int)(end_code - start_code + length), `0`);
679	end_code += GET(end_code, `1`);
680	length = `1` + LINK_SIZE;
681	}
682	while (*end_code == OP_ALT);
683	}
684	}
685
686	workspace[`0`] = `0`; / Bit indicating which vector is current /
687
688	/ Loop for scanning the subject /
689
690	ptr = current_subject;
691	for (;;)
692	{
693	int i, j;
694	int clen, dlen;
695	uint32_t c, d;
696	int forced_fail = `0`;
697	BOOL partial_newline = FALSE;
698	BOOL could_continue = reset_could_continue;
699	reset_could_continue = FALSE;
700
701	if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
702
703	/ Make the new state list into the active state list and empty the*
704	new state list. /*
705
706	temp_states = active_states;
707	active_states = new_states;
708	new_states = temp_states;
709	active_count = new_count;
710	new_count = `0`;
711
712	workspace[`0`] ^= `1`; / Remember for the restarting feature /
713	workspace[`1`] = active_count;
714
715	/ Set the pointers for adding new states /
716
717	next_active_state = active_states + active_count;
718	next_new_state = new_states;
719
720	/ Load the current character from the subject outside the loop, as many*
721	different states may want to look at it, and we assume that at least one
722	will. /*
723
724	if (ptr < end_subject)
725	{
726	clen = `1`; / Number of data items in the character /
727	#ifdef SUPPORT_UNICODE
728	GETCHARLENTEST(c, ptr, clen);
729	#else
730	c = *ptr;
731	#endif /* SUPPORT_UNICODE */
732	}
733	else
734	{
735	clen = `0`; / This indicates the end of the subject /
736	c = NOTACHAR; / This value should never actually be used /
737	}
738
739	/ Scan up the active states and act on each one. The result of an action*
740	may be to add more states to the currently active list (e.g. on hitting a
741	parenthesis) or it may be to put states on the new list, for considering
742	when we move the character pointer on. /*
743
744	for (i = `0`; i < active_count; i++)
745	{
746	stateblock *current_state = active_states + i;
747	BOOL caseless = FALSE;
748	PCRE2_SPTR code;
749	uint32_t codevalue;
750	int state_offset = current_state->offset;
751	int rrc;
752	int count;
753
754	/ A negative offset is a special case meaning "hold off going to this*
755	(negated) state until the number of characters in the data field have
756	been skipped". If the could_continue flag was passed over from a previous
757	state, arrange for it to passed on. /*
758
759	if (state_offset < `0`)
760	{
761	if (current_state->data > `0`)
762	{
763	ADD_NEW_DATA(state_offset, current_state->count,
764	current_state->data - `1`);
765	if (could_continue) reset_could_continue = TRUE;
766	continue;
767	}
768	else
769	{
770	current_state->offset = state_offset = -state_offset;
771	}
772	}
773
774	/ Check for a duplicate state with the same count, and skip if found.*
775	See the note at the head of this module about the possibility of improving
776	performance here. /*
777
778	for (j = `0`; j < i; j++)
779	{
780	if (active_states[j].offset == state_offset &&
781	active_states[j].count == current_state->count)
782	goto NEXT_ACTIVE_STATE;
783	}
784
785	/ The state offset is the offset to the opcode /
786
787	code = start_code + state_offset;
788	codevalue = *code;
789
790	/ If this opcode inspects a character, but we are at the end of the*
791	subject, remember the fact for use when testing for a partial match. /*
792
793	if (clen == `0` && poptable[codevalue] != `0`)
794	could_continue = TRUE;
795
796	/ If this opcode is followed by an inline character, load it. It is*
797	tempting to test for the presence of a subject character here, but that
798	is wrong, because sometimes zero repetitions of the subject are
799	permitted.
800
801	We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
802	argument that is not a data character - but is always one byte long because
803	the values are small. We have to take special action to deal with \P, \p,
804	\H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
805	these ones to new opcodes. /*
806
807	if (coptable[codevalue] > `0`)
808	{
809	dlen = `1`;
810	#ifdef SUPPORT_UNICODE
811	if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
812	#endif /* SUPPORT_UNICODE */
813	d = code[coptable[codevalue]];
814	if (codevalue >= OP_TYPESTAR)
815	{
816	switch(d)
817	{
818	case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
819	case OP_NOTPROP:
820	case OP_PROP: codevalue += OP_PROP_EXTRA; break;
821	case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
822	case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
823	case OP_NOT_HSPACE:
824	case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
825	case OP_NOT_VSPACE:
826	case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
827	default: break;
828	}
829	}
830	}
831	else
832	{
833	dlen = `0`; / Not strictly necessary, but compilers moan /
834	d = NOTACHAR; / if these variables are not set. /
835	}
836
837
838	/ Now process the individual opcodes /
839
840	switch (codevalue)
841	{
842	/ ========================================================================== /
843	/ These cases are never obeyed. This is a fudge that causes a compile-*
844	time error if the vectors coptable or poptable, which are indexed by
845	opcode, are not the correct length. It seems to be the only way to do
846	such a check at compile time, as the sizeof() operator does not work
847	in the C preprocessor. /*
848
849	case OP_TABLE_LENGTH:
850	case OP_TABLE_LENGTH +
851	((sizeof(coptable) == OP_TABLE_LENGTH) &&
852	(sizeof(poptable) == OP_TABLE_LENGTH)):
853	return `0`;
854
855	/ ========================================================================== /
856	/ Reached a closing bracket. If not at the end of the pattern, carry*
857	on with the next opcode. For repeating opcodes, also add the repeat
858	state. Note that KETRPOS will always be encountered at the end of the
859	subpattern, because the possessive subpattern repeats are always handled
860	using recursive calls. Thus, it never adds any new states.
861
862	At the end of the (sub)pattern, unless we have an empty string and
863	PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
864	start of the subject, save the match data, shifting up all previous
865	matches so we always have the longest first. /*
866
867	case OP_KET:
868	case OP_KETRMIN:
869	case OP_KETRMAX:
870	case OP_KETRPOS:
871	if (code != end_code)
872	{
873	ADD_ACTIVE(state_offset + `1` + LINK_SIZE, `0`);
874	if (codevalue != OP_KET)
875	{
876	ADD_ACTIVE(state_offset - (int)GET(code, `1`), `0`);
877	}
878	}
879	else
880	{
881	if (ptr > current_subject \|\|
882	((mb->moptions & PCRE2_NOTEMPTY) == `0` &&
883	((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == `0` \|\|
884	current_subject > start_subject + mb->start_offset)))
885	{
886	if (match_count < `0`) match_count = (offsetcount >= `2`)? `1` : `0`;
887	else if (match_count > `0` && ++match_count * `2` > (int)offsetcount)
888	match_count = `0`;
889	count = ((match_count == `0`)? (int)offsetcount : match_count * `2`) - `2`;
890	if (count > `0`) (void)memmove(offsets + `2`, offsets,
891	(size_t)count * sizeof(PCRE2_SIZE));
892	if (offsetcount >= `2`)
893	{
894	offsets[`0`] = (PCRE2_SIZE)(current_subject - start_subject);
895	offsets[`1`] = (PCRE2_SIZE)(ptr - start_subject);
896	}
897	if ((mb->moptions & PCRE2_DFA_SHORTEST) != `0`) return match_count;
898	}
899	}
900	break;
901
902	/ ========================================================================== /
903	/* These opcodes add to the current list of states without looking
904	at the current character. */
905
906	/-----------------------------------------------------------------/
907	case OP_ALT:
908	do { code += GET(code, `1`); } while (*code == OP_ALT);
909	ADD_ACTIVE((int)(code - start_code), `0`);
910	break;
911
912	/-----------------------------------------------------------------/
913	case OP_BRA:
914	case OP_SBRA:
915	do
916	{
917	ADD_ACTIVE((int)(code - start_code + `1` + LINK_SIZE), `0`);
918	code += GET(code, `1`);
919	}
920	while (*code == OP_ALT);
921	break;
922
923	/-----------------------------------------------------------------/
924	case OP_CBRA:
925	case OP_SCBRA:
926	ADD_ACTIVE((int)(code - start_code + `1` + LINK_SIZE + IMM2_SIZE), `0`);
927	code += GET(code, `1`);
928	while (*code == OP_ALT)
929	{
930	ADD_ACTIVE((int)(code - start_code + `1` + LINK_SIZE), `0`);
931	code += GET(code, `1`);
932	}
933	break;
934
935	/-----------------------------------------------------------------/
936	case OP_BRAZERO:
937	case OP_BRAMINZERO:
938	ADD_ACTIVE(state_offset + `1`, `0`);
939	code += `1` + GET(code, `2`);
940	while (*code == OP_ALT) code += GET(code, `1`);
941	ADD_ACTIVE((int)(code - start_code + `1` + LINK_SIZE), `0`);
942	break;
943
944	/-----------------------------------------------------------------/
945	case OP_SKIPZERO:
946	code += `1` + GET(code, `2`);
947	while (*code == OP_ALT) code += GET(code, `1`);
948	ADD_ACTIVE((int)(code - start_code + `1` + LINK_SIZE), `0`);
949	break;
950
951	/-----------------------------------------------------------------/
952	case OP_CIRC:
953	if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == `0`)
954	{ ADD_ACTIVE(state_offset + `1`, `0`); }
955	break;
956
957	/-----------------------------------------------------------------/
958	case OP_CIRCM:
959	if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == `0`) \|\|
960	((ptr != end_subject \|\| (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != `0` )
961	&& WAS_NEWLINE(ptr)))
962	{ ADD_ACTIVE(state_offset + `1`, `0`); }
963	break;
964
965	/-----------------------------------------------------------------/
966	case OP_EOD:
967	if (ptr >= end_subject)
968	{
969	if ((mb->moptions & PCRE2_PARTIAL_HARD) != `0`)
970	return PCRE2_ERROR_PARTIAL;
971	else { ADD_ACTIVE(state_offset + `1`, `0`); }
972	}
973	break;
974
975	/-----------------------------------------------------------------/
976	case OP_SOD:
977	if (ptr == start_subject) { ADD_ACTIVE(state_offset + `1`, `0`); }
978	break;
979
980	/-----------------------------------------------------------------/
981	case OP_SOM:
982	if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + `1`, `0`); }
983	break;
984
985
986	/ ========================================================================== /
987	/* These opcodes inspect the next subject character, and sometimes
988	the previous one as well, but do not have an argument. The variable
989	clen contains the length of the current character and is zero if we are
990	at the end of the subject. */
991
992	/-----------------------------------------------------------------/
993	case OP_ANY:
994	if (clen > `0` && !IS_NEWLINE(ptr))
995	{
996	if (ptr + `1` >= mb->end_subject &&
997	(mb->moptions & (PCRE2_PARTIAL_HARD)) != `0` &&
998	NLBLOCK->nltype == NLTYPE_FIXED &&
999	NLBLOCK->nllen == `2` &&
1000	c == NLBLOCK->nl[`0`])
1001	{
1002	could_continue = partial_newline = TRUE;
1003	}
1004	else
1005	{
1006	ADD_NEW(state_offset + `1`, `0`);
1007	}
1008	}
1009	break;
1010
1011	/-----------------------------------------------------------------/
1012	case OP_ALLANY:
1013	if (clen > `0`)
1014	{ ADD_NEW(state_offset + `1`, `0`); }
1015	break;
1016
1017	/-----------------------------------------------------------------/
1018	case OP_EODN:
1019	if (clen == `0` \|\| (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
1020	{
1021	if ((mb->moptions & PCRE2_PARTIAL_HARD) != `0`)
1022	return PCRE2_ERROR_PARTIAL;
1023	ADD_ACTIVE(state_offset + `1`, `0`);
1024	}
1025	break;
1026
1027	/-----------------------------------------------------------------/
1028	case OP_DOLL:
1029	if ((mb->moptions & PCRE2_NOTEOL) == `0`)
1030	{
1031	if (clen == `0` && (mb->moptions & PCRE2_PARTIAL_HARD) != `0`)
1032	could_continue = TRUE;
1033	else if (clen == `0` \|\|
1034	((mb->poptions & PCRE2_DOLLAR_ENDONLY) == `0` && IS_NEWLINE(ptr) &&
1035	(ptr == end_subject - mb->nllen)
1036	))
1037	{ ADD_ACTIVE(state_offset + `1`, `0`); }
1038	else if (ptr + `1` >= mb->end_subject &&
1039	(mb->moptions & (PCRE2_PARTIAL_HARD\|PCRE2_PARTIAL_SOFT)) != `0` &&
1040	NLBLOCK->nltype == NLTYPE_FIXED &&
1041	NLBLOCK->nllen == `2` &&
1042	c == NLBLOCK->nl[`0`])
1043	{
1044	if ((mb->moptions & PCRE2_PARTIAL_HARD) != `0`)
1045	{
1046	reset_could_continue = TRUE;
1047	ADD_NEW_DATA(-(state_offset + `1`), `0`, `1`);
1048	}
1049	else could_continue = partial_newline = TRUE;
1050	}
1051	}
1052	break;
1053
1054	/-----------------------------------------------------------------/
1055	case OP_DOLLM:
1056	if ((mb->moptions & PCRE2_NOTEOL) == `0`)
1057	{
1058	if (clen == `0` && (mb->moptions & PCRE2_PARTIAL_HARD) != `0`)
1059	could_continue = TRUE;
1060	else if (clen == `0` \|\|
1061	((mb->poptions & PCRE2_DOLLAR_ENDONLY) == `0` && IS_NEWLINE(ptr)))
1062	{ ADD_ACTIVE(state_offset + `1`, `0`); }
1063	else if (ptr + `1` >= mb->end_subject &&
1064	(mb->moptions & (PCRE2_PARTIAL_HARD\|PCRE2_PARTIAL_SOFT)) != `0` &&
1065	NLBLOCK->nltype == NLTYPE_FIXED &&
1066	NLBLOCK->nllen == `2` &&
1067	c == NLBLOCK->nl[`0`])
1068	{
1069	if ((mb->moptions & PCRE2_PARTIAL_HARD) != `0`)
1070	{
1071	reset_could_continue = TRUE;
1072	ADD_NEW_DATA(-(state_offset + `1`), `0`, `1`);
1073	}
1074	else could_continue = partial_newline = TRUE;
1075	}
1076	}
1077	else if (IS_NEWLINE(ptr))
1078	{ ADD_ACTIVE(state_offset + `1`, `0`); }
1079	break;
1080
1081	/-----------------------------------------------------------------/
1082
1083	case OP_DIGIT:
1084	case OP_WHITESPACE:
1085	case OP_WORDCHAR:
1086	if (clen > `0` && c < `256` &&
1087	((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != `0`)
1088	{ ADD_NEW(state_offset + `1`, `0`); }
1089	break;
1090
1091	/-----------------------------------------------------------------/
1092	case OP_NOT_DIGIT:
1093	case OP_NOT_WHITESPACE:
1094	case OP_NOT_WORDCHAR:
1095	if (clen > `0` && (c >= `256` \|\|
1096	((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != `0`))
1097	{ ADD_NEW(state_offset + `1`, `0`); }
1098	break;
1099
1100	/-----------------------------------------------------------------/
1101	case OP_WORD_BOUNDARY:
1102	case OP_NOT_WORD_BOUNDARY:
1103	{
1104	int left_word, right_word;
1105
1106	if (ptr > start_subject)
1107	{
1108	PCRE2_SPTR temp = ptr - `1`;
1109	if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
1110	#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1111	if (utf) { BACKCHAR(temp); }
1112	#endif
1113	GETCHARTEST(d, temp);
1114	#ifdef SUPPORT_UNICODE
1115	if ((mb->poptions & PCRE2_UCP) != `0`)
1116	{
1117	if (d == `'_'`) left_word = TRUE; else
1118	{
1119	uint32_t cat = UCD_CATEGORY(d);
1120	left_word = (cat == ucp_L \|\| cat == ucp_N);
1121	}
1122	}
1123	else
1124	#endif
1125	left_word = d < `256` && (ctypes[d] & ctype_word) != `0`;
1126	}
1127	else left_word = FALSE;
1128
1129	if (clen > `0`)
1130	{
1131	if (ptr >= mb->last_used_ptr)
1132	{
1133	PCRE2_SPTR temp = ptr + `1`;
1134	#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1135	if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
1136	#endif
1137	mb->last_used_ptr = temp;
1138	}
1139	#ifdef SUPPORT_UNICODE
1140	if ((mb->poptions & PCRE2_UCP) != `0`)
1141	{
1142	if (c == `'_'`) right_word = TRUE; else
1143	{
1144	uint32_t cat = UCD_CATEGORY(c);
1145	right_word = (cat == ucp_L \|\| cat == ucp_N);
1146	}
1147	}
1148	else
1149	#endif
1150	right_word = c < `256` && (ctypes[c] & ctype_word) != `0`;
1151	}
1152	else right_word = FALSE;
1153
1154	if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1155	{ ADD_ACTIVE(state_offset + `1`, `0`); }
1156	}
1157	break;
1158
1159
1160	/-----------------------------------------------------------------/
1161	/* Check the next character by Unicode property. We will get here only
1162	if the support is in the binary; otherwise a compile-time error occurs.
1163	*/
1164
1165	#ifdef SUPPORT_UNICODE
1166	case OP_PROP:
1167	case OP_NOTPROP:
1168	if (clen > `0`)
1169	{
1170	BOOL OK;
1171	const uint32_t *cp;
1172	const ucd_record * prop = GET_UCD(c);
1173	switch(code[`1`])
1174	{
1175	case PT_ANY:
1176	OK = TRUE;
1177	break;
1178
1179	case PT_LAMP:
1180	OK = prop->chartype == ucp_Lu \|\| prop->chartype == ucp_Ll \|\|
1181	prop->chartype == ucp_Lt;
1182	break;
1183
1184	case PT_GC:
1185	OK = PRIV(ucp_gentype)[prop->chartype] == code[`2`];
1186	break;
1187
1188	case PT_PC:
1189	OK = prop->chartype == code[`2`];
1190	break;
1191
1192	case PT_SC:
1193	OK = prop->script == code[`2`];
1194	break;
1195
1196	case PT_SCX:
1197	OK = (prop->script == code[`2`] \|\|
1198	MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[`2`]) != `0`);
1199	break;
1200
1201	/ These are specials for combination cases. /
1202
1203	case PT_ALNUM:
1204	OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L \|\|
1205	PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1206	break;
1207
1208	/* Perl space used to exclude VT, but from Perl 5.18 it is included,
1209	which means that Perl space and POSIX space are now identical. PCRE
1210	was changed at release 8.34. */
1211
1212	case PT_SPACE: / Perl space /
1213	case PT_PXSPACE: / POSIX space /
1214	switch(c)
1215	{
1216	HSPACE_CASES:
1217	VSPACE_CASES:
1218	OK = TRUE;
1219	break;
1220
1221	default:
1222	OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1223	break;
1224	}
1225	break;
1226
1227	case PT_WORD:
1228	OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L \|\|
1229	PRIV(ucp_gentype)[prop->chartype] == ucp_N \|\|
1230	c == CHAR_UNDERSCORE;
1231	break;
1232
1233	case PT_CLIST:
1234	cp = PRIV(ucd_caseless_sets) + code[`2`];
1235	for (;;)
1236	{
1237	if (c < cp) { OK = FALSE; break*; }
1238	if (c == cp++) { OK = TRUE; break*; }
1239	}
1240	break;
1241
1242	case PT_UCNC:
1243	OK = c == CHAR_DOLLAR_SIGN \|\| c == CHAR_COMMERCIAL_AT \|\|
1244	c == CHAR_GRAVE_ACCENT \|\| (c >= `0xa0` && c <= `0xd7ff`) \|\|
1245	c >= `0xe000`;
1246	break;
1247
1248	case PT_BIDICL:
1249	OK = UCD_BIDICLASS(c) == code[`2`];
1250	break;
1251
1252	case PT_BOOL:
1253	OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1254	UCD_BPROPS_PROP(prop), code[`2`]) != `0`;
1255	break;
1256
1257	/ Should never occur, but keep compilers from grumbling. /
1258
1259	default:
1260	OK = codevalue != OP_PROP;
1261	break;
1262	}
1263
1264	if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + `3`, `0`); }
1265	}
1266	break;
1267	#endif
1268
1269
1270
1271	/ ========================================================================== /
1272	/* These opcodes likewise inspect the subject character, but have an
1273	argument that is not a data character. It is one of these opcodes:
1274	OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1275	OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1276
1277	case OP_TYPEPLUS:
1278	case OP_TYPEMINPLUS:
1279	case OP_TYPEPOSPLUS:
1280	count = current_state->count; / Already matched /
1281	if (count > `0`) { ADD_ACTIVE(state_offset + `2`, `0`); }
1282	if (clen > `0`)
1283	{
1284	if (d == OP_ANY && ptr + `1` >= mb->end_subject &&
1285	(mb->moptions & (PCRE2_PARTIAL_HARD)) != `0` &&
1286	NLBLOCK->nltype == NLTYPE_FIXED &&
1287	NLBLOCK->nllen == `2` &&
1288	c == NLBLOCK->nl[`0`])
1289	{
1290	could_continue = partial_newline = TRUE;
1291	}
1292	else if ((c >= `256` && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) \|\|
1293	(c < `256` &&
1294	(d != OP_ANY \|\| !IS_NEWLINE(ptr)) &&
1295	((ctypes[c] & toptable1[d]) ^ toptable2[d]) != `0`))
1296	{
1297	if (count > `0` && codevalue == OP_TYPEPOSPLUS)
1298	{
1299	active_count--; / Remove non-match possibility /
1300	next_active_state--;
1301	}
1302	count++;
1303	ADD_NEW(state_offset, count);
1304	}
1305	}
1306	break;
1307
1308	/-----------------------------------------------------------------/
1309	case OP_TYPEQUERY:
1310	case OP_TYPEMINQUERY:
1311	case OP_TYPEPOSQUERY:
1312	ADD_ACTIVE(state_offset + `2`, `0`);
1313	if (clen > `0`)
1314	{
1315	if (d == OP_ANY && ptr + `1` >= mb->end_subject &&
1316	(mb->moptions & (PCRE2_PARTIAL_HARD)) != `0` &&
1317	NLBLOCK->nltype == NLTYPE_FIXED &&
1318	NLBLOCK->nllen == `2` &&
1319	c == NLBLOCK->nl[`0`])
1320	{
1321	could_continue = partial_newline = TRUE;
1322	}
1323	else if ((c >= `256` && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) \|\|
1324	(c < `256` &&
1325	(d != OP_ANY \|\| !IS_NEWLINE(ptr)) &&
1326	((ctypes[c] & toptable1[d]) ^ toptable2[d]) != `0`))
1327	{
1328	if (codevalue == OP_TYPEPOSQUERY)
1329	{
1330	active_count--; / Remove non-match possibility /
1331	next_active_state--;
1332	}
1333	ADD_NEW(state_offset + `2`, `0`);
1334	}
1335	}
1336	break;
1337
1338	/-----------------------------------------------------------------/
1339	case OP_TYPESTAR:
1340	case OP_TYPEMINSTAR:
1341	case OP_TYPEPOSSTAR:
1342	ADD_ACTIVE(state_offset + `2`, `0`);
1343	if (clen > `0`)
1344	{
1345	if (d == OP_ANY && ptr + `1` >= mb->end_subject &&
1346	(mb->moptions & (PCRE2_PARTIAL_HARD)) != `0` &&
1347	NLBLOCK->nltype == NLTYPE_FIXED &&
1348	NLBLOCK->nllen == `2` &&
1349	c == NLBLOCK->nl[`0`])
1350	{
1351	could_continue = partial_newline = TRUE;
1352	}
1353	else if ((c >= `256` && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) \|\|
1354	(c < `256` &&
1355	(d != OP_ANY \|\| !IS_NEWLINE(ptr)) &&
1356	((ctypes[c] & toptable1[d]) ^ toptable2[d]) != `0`))
1357	{
1358	if (codevalue == OP_TYPEPOSSTAR)
1359	{
1360	active_count--; / Remove non-match possibility /
1361	next_active_state--;
1362	}
1363	ADD_NEW(state_offset, `0`);
1364	}
1365	}
1366	break;
1367
1368	/-----------------------------------------------------------------/
1369	case OP_TYPEEXACT:
1370	count = current_state->count; / Number already matched /
1371	if (clen > `0`)
1372	{
1373	if (d == OP_ANY && ptr + `1` >= mb->end_subject &&
1374	(mb->moptions & (PCRE2_PARTIAL_HARD)) != `0` &&
1375	NLBLOCK->nltype == NLTYPE_FIXED &&
1376	NLBLOCK->nllen == `2` &&
1377	c == NLBLOCK->nl[`0`])
1378	{
1379	could_continue = partial_newline = TRUE;
1380	}
1381	else if ((c >= `256` && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) \|\|
1382	(c < `256` &&
1383	(d != OP_ANY \|\| !IS_NEWLINE(ptr)) &&
1384	((ctypes[c] & toptable1[d]) ^ toptable2[d]) != `0`))
1385	{
1386	if (++count >= (int)GET2(code, `1`))
1387	{ ADD_NEW(state_offset + `1` + IMM2_SIZE + `1`, `0`); }
1388	else
1389	{ ADD_NEW(state_offset, count); }
1390	}
1391	}
1392	break;
1393
1394	/-----------------------------------------------------------------/
1395	case OP_TYPEUPTO:
1396	case OP_TYPEMINUPTO:
1397	case OP_TYPEPOSUPTO:
1398	ADD_ACTIVE(state_offset + `2` + IMM2_SIZE, `0`);
1399	count = current_state->count; / Number already matched /
1400	if (clen > `0`)
1401	{
1402	if (d == OP_ANY && ptr + `1` >= mb->end_subject &&
1403	(mb->moptions & (PCRE2_PARTIAL_HARD)) != `0` &&
1404	NLBLOCK->nltype == NLTYPE_FIXED &&
1405	NLBLOCK->nllen == `2` &&
1406	c == NLBLOCK->nl[`0`])
1407	{
1408	could_continue = partial_newline = TRUE;
1409	}
1410	else if ((c >= `256` && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) \|\|
1411	(c < `256` &&
1412	(d != OP_ANY \|\| !IS_NEWLINE(ptr)) &&
1413	((ctypes[c] & toptable1[d]) ^ toptable2[d]) != `0`))
1414	{
1415	if (codevalue == OP_TYPEPOSUPTO)
1416	{
1417	active_count--; / Remove non-match possibility /
1418	next_active_state--;
1419	}
1420	if (++count >= (int)GET2(code, `1`))
1421	{ ADD_NEW(state_offset + `2` + IMM2_SIZE, `0`); }
1422	else
1423	{ ADD_NEW(state_offset, count); }
1424	}
1425	}
1426	break;
1427
1428	/ ========================================================================== /
1429	/* These are virtual opcodes that are used when something like
1430	OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1431	argument. It keeps the code above fast for the other cases. The argument
1432	is in the d variable. */
1433
1434	#ifdef SUPPORT_UNICODE
1435	case OP_PROP_EXTRA + OP_TYPEPLUS:
1436	case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1437	case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1438	count = current_state->count; / Already matched /
1439	if (count > `0`) { ADD_ACTIVE(state_offset + `4`, `0`); }
1440	if (clen > `0`)
1441	{
1442	BOOL OK;
1443	const uint32_t *cp;
1444	const ucd_record * prop = GET_UCD(c);
1445	switch(code[`2`])
1446	{
1447	case PT_ANY:
1448	OK = TRUE;
1449	break;
1450
1451	case PT_LAMP:
1452	OK = prop->chartype == ucp_Lu \|\| prop->chartype == ucp_Ll \|\|
1453	prop->chartype == ucp_Lt;
1454	break;
1455
1456	case PT_GC:
1457	OK = PRIV(ucp_gentype)[prop->chartype] == code[`3`];
1458	break;
1459
1460	case PT_PC:
1461	OK = prop->chartype == code[`3`];
1462	break;
1463
1464	case PT_SC:
1465	OK = prop->script == code[`3`];
1466	break;
1467
1468	case PT_SCX:
1469	OK = (prop->script == code[`3`] \|\|
1470	MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[`3`]) != `0`);
1471	break;
1472
1473	/ These are specials for combination cases. /
1474
1475	case PT_ALNUM:
1476	OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L \|\|
1477	PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1478	break;
1479
1480	/* Perl space used to exclude VT, but from Perl 5.18 it is included,
1481	which means that Perl space and POSIX space are now identical. PCRE
1482	was changed at release 8.34. */
1483
1484	case PT_SPACE: / Perl space /
1485	case PT_PXSPACE: / POSIX space /
1486	switch(c)
1487	{
1488	HSPACE_CASES:
1489	VSPACE_CASES:
1490	OK = TRUE;
1491	break;
1492
1493	default:
1494	OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1495	break;
1496	}
1497	break;
1498
1499	case PT_WORD:
1500	OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L \|\|
1501	PRIV(ucp_gentype)[prop->chartype] == ucp_N \|\|
1502	c == CHAR_UNDERSCORE;
1503	break;
1504
1505	case PT_CLIST:
1506	cp = PRIV(ucd_caseless_sets) + code[`3`];
1507	for (;;)
1508	{
1509	if (c < cp) { OK = FALSE; break*; }
1510	if (c == cp++) { OK = TRUE; break*; }
1511	}
1512	break;
1513
1514	case PT_UCNC:
1515	OK = c == CHAR_DOLLAR_SIGN \|\| c == CHAR_COMMERCIAL_AT \|\|
1516	c == CHAR_GRAVE_ACCENT \|\| (c >= `0xa0` && c <= `0xd7ff`) \|\|
1517	c >= `0xe000`;
1518	break;
1519
1520	case PT_BIDICL:
1521	OK = UCD_BIDICLASS(c) == code[`3`];
1522	break;
1523
1524	case PT_BOOL:
1525	OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1526	UCD_BPROPS_PROP(prop), code[`3`]) != `0`;
1527	break;
1528
1529	/ Should never occur, but keep compilers from grumbling. /
1530
1531	default:
1532	OK = codevalue != OP_PROP;
1533	break;
1534	}
1535
1536	if (OK == (d == OP_PROP))
1537	{
1538	if (count > `0` && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1539	{
1540	active_count--; / Remove non-match possibility /
1541	next_active_state--;
1542	}
1543	count++;
1544	ADD_NEW(state_offset, count);
1545	}
1546	}
1547	break;
1548
1549	/-----------------------------------------------------------------/
1550	case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1551	case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1552	case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1553	count = current_state->count; / Already matched /
1554	if (count > `0`) { ADD_ACTIVE(state_offset + `2`, `0`); }
1555	if (clen > `0`)
1556	{
1557	int ncount = `0`;
1558	if (count > `0` && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1559	{
1560	active_count--; / Remove non-match possibility /
1561	next_active_state--;
1562	}
1563	(void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1564	&ncount);
1565	count++;
1566	ADD_NEW_DATA(-state_offset, count, ncount);
1567	}
1568	break;
1569	#endif
1570
1571	/-----------------------------------------------------------------/
1572	case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1573	case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1574	case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1575	count = current_state->count; / Already matched /
1576	if (count > `0`) { ADD_ACTIVE(state_offset + `2`, `0`); }
1577	if (clen > `0`)
1578	{
1579	int ncount = `0`;
1580	switch (c)
1581	{
1582	case CHAR_VT:
1583	case CHAR_FF:
1584	case CHAR_NEL:
1585	#ifndef EBCDIC
1586	case `0x2028`:
1587	case `0x2029`:
1588	#endif /* Not EBCDIC */
1589	if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1590	goto ANYNL01;
1591
1592	case CHAR_CR:
1593	if (ptr + `1` < end_subject && UCHAR21TEST(ptr + `1`) == CHAR_LF) ncount = `1`;
1594	/ Fall through /
1595
1596	ANYNL01:
1597	case CHAR_LF:
1598	if (count > `0` && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1599	{
1600	active_count--; / Remove non-match possibility /
1601	next_active_state--;
1602	}
1603	count++;
1604	ADD_NEW_DATA(-state_offset, count, ncount);
1605	break;
1606
1607	default:
1608	break;
1609	}
1610	}
1611	break;
1612
1613	/-----------------------------------------------------------------/
1614	case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1615	case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1616	case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1617	count = current_state->count; / Already matched /
1618	if (count > `0`) { ADD_ACTIVE(state_offset + `2`, `0`); }
1619	if (clen > `0`)
1620	{
1621	BOOL OK;
1622	switch (c)
1623	{
1624	VSPACE_CASES:
1625	OK = TRUE;
1626	break;
1627
1628	default:
1629	OK = FALSE;
1630	break;
1631	}
1632
1633	if (OK == (d == OP_VSPACE))
1634	{
1635	if (count > `0` && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1636	{
1637	active_count--; / Remove non-match possibility /
1638	next_active_state--;
1639	}
1640	count++;
1641	ADD_NEW_DATA(-state_offset, count, `0`);
1642	}
1643	}
1644	break;
1645
1646	/-----------------------------------------------------------------/
1647	case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1648	case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1649	case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1650	count = current_state->count; / Already matched /
1651	if (count > `0`) { ADD_ACTIVE(state_offset + `2`, `0`); }
1652	if (clen > `0`)
1653	{
1654	BOOL OK;
1655	switch (c)
1656	{
1657	HSPACE_CASES:
1658	OK = TRUE;
1659	break;
1660
1661	default:
1662	OK = FALSE;
1663	break;
1664	}
1665
1666	if (OK == (d == OP_HSPACE))
1667	{
1668	if (count > `0` && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1669	{
1670	active_count--; / Remove non-match possibility /
1671	next_active_state--;
1672	}
1673	count++;
1674	ADD_NEW_DATA(-state_offset, count, `0`);
1675	}
1676	}
1677	break;
1678
1679	/-----------------------------------------------------------------/
1680	#ifdef SUPPORT_UNICODE
1681	case OP_PROP_EXTRA + OP_TYPEQUERY:
1682	case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1683	case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1684	count = `4`;
1685	goto QS1;
1686
1687	case OP_PROP_EXTRA + OP_TYPESTAR:
1688	case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1689	case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1690	count = `0`;
1691
1692	QS1:
1693
1694	ADD_ACTIVE(state_offset + `4`, `0`);
1695	if (clen > `0`)
1696	{
1697	BOOL OK;
1698	const uint32_t *cp;
1699	const ucd_record * prop = GET_UCD(c);
1700	switch(code[`2`])
1701	{
1702	case PT_ANY:
1703	OK = TRUE;
1704	break;
1705
1706	case PT_LAMP:
1707	OK = prop->chartype == ucp_Lu \|\| prop->chartype == ucp_Ll \|\|
1708	prop->chartype == ucp_Lt;
1709	break;
1710
1711	case PT_GC:
1712	OK = PRIV(ucp_gentype)[prop->chartype] == code[`3`];
1713	break;
1714
1715	case PT_PC:
1716	OK = prop->chartype == code[`3`];
1717	break;
1718
1719	case PT_SC:
1720	OK = prop->script == code[`3`];
1721	break;
1722
1723	case PT_SCX:
1724	OK = (prop->script == code[`3`] \|\|
1725	MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[`3`]) != `0`);
1726	break;
1727
1728	/ These are specials for combination cases. /
1729
1730	case PT_ALNUM:
1731	OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L \|\|
1732	PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1733	break;
1734
1735	/* Perl space used to exclude VT, but from Perl 5.18 it is included,
1736	which means that Perl space and POSIX space are now identical. PCRE
1737	was changed at release 8.34. */
1738
1739	case PT_SPACE: / Perl space /
1740	case PT_PXSPACE: / POSIX space /
1741	switch(c)
1742	{
1743	HSPACE_CASES:
1744	VSPACE_CASES:
1745	OK = TRUE;
1746	break;
1747
1748	default:
1749	OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1750	break;
1751	}
1752	break;
1753
1754	case PT_WORD:
1755	OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L \|\|
1756	PRIV(ucp_gentype)[prop->chartype] == ucp_N \|\|
1757	c == CHAR_UNDERSCORE;
1758	break;
1759
1760	case PT_CLIST:
1761	cp = PRIV(ucd_caseless_sets) + code[`3`];
1762	for (;;)
1763	{
1764	if (c < cp) { OK = FALSE; break*; }
1765	if (c == cp++) { OK = TRUE; break*; }
1766	}
1767	break;
1768
1769	case PT_UCNC:
1770	OK = c == CHAR_DOLLAR_SIGN \|\| c == CHAR_COMMERCIAL_AT \|\|
1771	c == CHAR_GRAVE_ACCENT \|\| (c >= `0xa0` && c <= `0xd7ff`) \|\|
1772	c >= `0xe000`;
1773	break;
1774
1775	case PT_BIDICL:
1776	OK = UCD_BIDICLASS(c) == code[`3`];
1777	break;
1778
1779	case PT_BOOL:
1780	OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1781	UCD_BPROPS_PROP(prop), code[`3`]) != `0`;
1782	break;
1783
1784	/ Should never occur, but keep compilers from grumbling. /
1785
1786	default:
1787	OK = codevalue != OP_PROP;
1788	break;
1789	}
1790
1791	if (OK == (d == OP_PROP))
1792	{
1793	if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR \|\|
1794	codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1795	{
1796	active_count--; / Remove non-match possibility /
1797	next_active_state--;
1798	}
1799	ADD_NEW(state_offset + count, `0`);
1800	}
1801	}
1802	break;
1803
1804	/-----------------------------------------------------------------/
1805	case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1806	case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1807	case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1808	count = `2`;
1809	goto QS2;
1810
1811	case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1812	case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1813	case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1814	count = `0`;
1815
1816	QS2:
1817
1818	ADD_ACTIVE(state_offset + `2`, `0`);
1819	if (clen > `0`)
1820	{
1821	int ncount = `0`;
1822	if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR \|\|
1823	codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1824	{
1825	active_count--; / Remove non-match possibility /
1826	next_active_state--;
1827	}
1828	(void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1829	&ncount);
1830	ADD_NEW_DATA(-(state_offset + count), `0`, ncount);
1831	}
1832	break;
1833	#endif
1834
1835	/-----------------------------------------------------------------/
1836	case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1837	case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1838	case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1839	count = `2`;
1840	goto QS3;
1841
1842	case OP_ANYNL_EXTRA + OP_TYPESTAR:
1843	case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1844	case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1845	count = `0`;
1846
1847	QS3:
1848	ADD_ACTIVE(state_offset + `2`, `0`);
1849	if (clen > `0`)
1850	{
1851	int ncount = `0`;
1852	switch (c)
1853	{
1854	case CHAR_VT:
1855	case CHAR_FF:
1856	case CHAR_NEL:
1857	#ifndef EBCDIC
1858	case `0x2028`:
1859	case `0x2029`:
1860	#endif /* Not EBCDIC */
1861	if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1862	goto ANYNL02;
1863
1864	case CHAR_CR:
1865	if (ptr + `1` < end_subject && UCHAR21TEST(ptr + `1`) == CHAR_LF) ncount = `1`;
1866	/ Fall through /
1867
1868	ANYNL02:
1869	case CHAR_LF:
1870	if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR \|\|
1871	codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1872	{
1873	active_count--; / Remove non-match possibility /
1874	next_active_state--;
1875	}
1876	ADD_NEW_DATA(-(state_offset + (int)count), `0`, ncount);
1877	break;
1878
1879	default:
1880	break;
1881	}
1882	}
1883	break;
1884
1885	/-----------------------------------------------------------------/
1886	case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1887	case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1888	case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1889	count = `2`;
1890	goto QS4;
1891
1892	case OP_VSPACE_EXTRA + OP_TYPESTAR:
1893	case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1894	case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1895	count = `0`;
1896
1897	QS4:
1898	ADD_ACTIVE(state_offset + `2`, `0`);
1899	if (clen > `0`)
1900	{
1901	BOOL OK;
1902	switch (c)
1903	{
1904	VSPACE_CASES:
1905	OK = TRUE;
1906	break;
1907
1908	default:
1909	OK = FALSE;
1910	break;
1911	}
1912	if (OK == (d == OP_VSPACE))
1913	{
1914	if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR \|\|
1915	codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1916	{
1917	active_count--; / Remove non-match possibility /
1918	next_active_state--;
1919	}
1920	ADD_NEW_DATA(-(state_offset + (int)count), `0`, `0`);
1921	}
1922	}
1923	break;
1924
1925	/-----------------------------------------------------------------/
1926	case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1927	case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1928	case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1929	count = `2`;
1930	goto QS5;
1931
1932	case OP_HSPACE_EXTRA + OP_TYPESTAR:
1933	case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1934	case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1935	count = `0`;
1936
1937	QS5:
1938	ADD_ACTIVE(state_offset + `2`, `0`);
1939	if (clen > `0`)
1940	{
1941	BOOL OK;
1942	switch (c)
1943	{
1944	HSPACE_CASES:
1945	OK = TRUE;
1946	break;
1947
1948	default:
1949	OK = FALSE;
1950	break;
1951	}
1952
1953	if (OK == (d == OP_HSPACE))
1954	{
1955	if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR \|\|
1956	codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1957	{
1958	active_count--; / Remove non-match possibility /
1959	next_active_state--;
1960	}
1961	ADD_NEW_DATA(-(state_offset + (int)count), `0`, `0`);
1962	}
1963	}
1964	break;
1965
1966	/-----------------------------------------------------------------/
1967	#ifdef SUPPORT_UNICODE
1968	case OP_PROP_EXTRA + OP_TYPEEXACT:
1969	case OP_PROP_EXTRA + OP_TYPEUPTO:
1970	case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1971	case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1972	if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1973	{ ADD_ACTIVE(state_offset + `1` + IMM2_SIZE + `3`, `0`); }
1974	count = current_state->count; / Number already matched /
1975	if (clen > `0`)
1976	{
1977	BOOL OK;
1978	const uint32_t *cp;
1979	const ucd_record * prop = GET_UCD(c);
1980	switch(code[`1` + IMM2_SIZE + `1`])
1981	{
1982	case PT_ANY:
1983	OK = TRUE;
1984	break;
1985
1986	case PT_LAMP:
1987	OK = prop->chartype == ucp_Lu \|\| prop->chartype == ucp_Ll \|\|
1988	prop->chartype == ucp_Lt;
1989	break;
1990
1991	case PT_GC:
1992	OK = PRIV(ucp_gentype)[prop->chartype] == code[`1` + IMM2_SIZE + `2`];
1993	break;
1994
1995	case PT_PC:
1996	OK = prop->chartype == code[`1` + IMM2_SIZE + `2`];
1997	break;
1998
1999	case PT_SC:
2000	OK = prop->script == code[`1` + IMM2_SIZE + `2`];
2001	break;
2002
2003	case PT_SCX:
2004	OK = (prop->script == code[`1` + IMM2_SIZE + `2`] \|\|
2005	MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop),
2006	code[`1` + IMM2_SIZE + `2`]) != `0`);
2007	break;
2008
2009	/ These are specials for combination cases. /
2010
2011	case PT_ALNUM:
2012	OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L \|\|
2013	PRIV(ucp_gentype)[prop->chartype] == ucp_N;
2014	break;
2015
2016	/* Perl space used to exclude VT, but from Perl 5.18 it is included,
2017	which means that Perl space and POSIX space are now identical. PCRE
2018	was changed at release 8.34. */
2019
2020	case PT_SPACE: / Perl space /
2021	case PT_PXSPACE: / POSIX space /
2022	switch(c)
2023	{
2024	HSPACE_CASES:
2025	VSPACE_CASES:
2026	OK = TRUE;
2027	break;
2028
2029	default:
2030	OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
2031	break;
2032	}
2033	break;
2034
2035	case PT_WORD:
2036	OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L \|\|
2037	PRIV(ucp_gentype)[prop->chartype] == ucp_N \|\|
2038	c == CHAR_UNDERSCORE;
2039	break;
2040
2041	case PT_CLIST:
2042	cp = PRIV(ucd_caseless_sets) + code[`1` + IMM2_SIZE + `2`];
2043	for (;;)
2044	{
2045	if (c < cp) { OK = FALSE; break*; }
2046	if (c == cp++) { OK = TRUE; break*; }
2047	}
2048	break;
2049
2050	case PT_UCNC:
2051	OK = c == CHAR_DOLLAR_SIGN \|\| c == CHAR_COMMERCIAL_AT \|\|
2052	c == CHAR_GRAVE_ACCENT \|\| (c >= `0xa0` && c <= `0xd7ff`) \|\|
2053	c >= `0xe000`;
2054	break;
2055
2056	case PT_BIDICL:
2057	OK = UCD_BIDICLASS(c) == code[`1` + IMM2_SIZE + `2`];
2058	break;
2059
2060	case PT_BOOL:
2061	OK = MAPBIT(PRIV(ucd_boolprop_sets) +
2062	UCD_BPROPS_PROP(prop), code[`1` + IMM2_SIZE + `2`]) != `0`;
2063	break;
2064
2065	/ Should never occur, but keep compilers from grumbling. /
2066
2067	default:
2068	OK = codevalue != OP_PROP;
2069	break;
2070	}
2071
2072	if (OK == (d == OP_PROP))
2073	{
2074	if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
2075	{
2076	active_count--; / Remove non-match possibility /
2077	next_active_state--;
2078	}
2079	if (++count >= (int)GET2(code, `1`))
2080	{ ADD_NEW(state_offset + `1` + IMM2_SIZE + `3`, `0`); }
2081	else
2082	{ ADD_NEW(state_offset, count); }
2083	}
2084	}
2085	break;
2086
2087	/-----------------------------------------------------------------/
2088	case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
2089	case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
2090	case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
2091	case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
2092	if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
2093	{ ADD_ACTIVE(state_offset + `2` + IMM2_SIZE, `0`); }
2094	count = current_state->count; / Number already matched /
2095	if (clen > `0`)
2096	{
2097	PCRE2_SPTR nptr;
2098	int ncount = `0`;
2099	if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
2100	{
2101	active_count--; / Remove non-match possibility /
2102	next_active_state--;
2103	}
2104	nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
2105	&ncount);
2106	if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != `0`)
2107	reset_could_continue = TRUE;
2108	if (++count >= (int)GET2(code, `1`))
2109	{ ADD_NEW_DATA(-(state_offset + `2` + IMM2_SIZE), `0`, ncount); }
2110	else
2111	{ ADD_NEW_DATA(-state_offset, count, ncount); }
2112	}
2113	break;
2114	#endif
2115
2116	/-----------------------------------------------------------------/
2117	case OP_ANYNL_EXTRA + OP_TYPEEXACT:
2118	case OP_ANYNL_EXTRA + OP_TYPEUPTO:
2119	case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
2120	case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
2121	if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
2122	{ ADD_ACTIVE(state_offset + `2` + IMM2_SIZE, `0`); }
2123	count = current_state->count; / Number already matched /
2124	if (clen > `0`)
2125	{
2126	int ncount = `0`;
2127	switch (c)
2128	{
2129	case CHAR_VT:
2130	case CHAR_FF:
2131	case CHAR_NEL:
2132	#ifndef EBCDIC
2133	case `0x2028`:
2134	case `0x2029`:
2135	#endif /* Not EBCDIC */
2136	if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2137	goto ANYNL03;
2138
2139	case CHAR_CR:
2140	if (ptr + `1` < end_subject && UCHAR21TEST(ptr + `1`) == CHAR_LF) ncount = `1`;
2141	/ Fall through /
2142
2143	ANYNL03:
2144	case CHAR_LF:
2145	if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2146	{
2147	active_count--; / Remove non-match possibility /
2148	next_active_state--;
2149	}
2150	if (++count >= (int)GET2(code, `1`))
2151	{ ADD_NEW_DATA(-(state_offset + `2` + IMM2_SIZE), `0`, ncount); }
2152	else
2153	{ ADD_NEW_DATA(-state_offset, count, ncount); }
2154	break;
2155
2156	default:
2157	break;
2158	}
2159	}
2160	break;
2161
2162	/-----------------------------------------------------------------/
2163	case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2164	case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2165	case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2166	case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2167	if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2168	{ ADD_ACTIVE(state_offset + `2` + IMM2_SIZE, `0`); }
2169	count = current_state->count; / Number already matched /
2170	if (clen > `0`)
2171	{
2172	BOOL OK;
2173	switch (c)
2174	{
2175	VSPACE_CASES:
2176	OK = TRUE;
2177	break;
2178
2179	default:
2180	OK = FALSE;
2181	}
2182
2183	if (OK == (d == OP_VSPACE))
2184	{
2185	if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2186	{
2187	active_count--; / Remove non-match possibility /
2188	next_active_state--;
2189	}
2190	if (++count >= (int)GET2(code, `1`))
2191	{ ADD_NEW_DATA(-(state_offset + `2` + IMM2_SIZE), `0`, `0`); }
2192	else
2193	{ ADD_NEW_DATA(-state_offset, count, `0`); }
2194	}
2195	}
2196	break;
2197
2198	/-----------------------------------------------------------------/
2199	case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2200	case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2201	case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2202	case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2203	if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2204	{ ADD_ACTIVE(state_offset + `2` + IMM2_SIZE, `0`); }
2205	count = current_state->count; / Number already matched /
2206	if (clen > `0`)
2207	{
2208	BOOL OK;
2209	switch (c)
2210	{
2211	HSPACE_CASES:
2212	OK = TRUE;
2213	break;
2214
2215	default:
2216	OK = FALSE;
2217	break;
2218	}
2219
2220	if (OK == (d == OP_HSPACE))
2221	{
2222	if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2223	{
2224	active_count--; / Remove non-match possibility /
2225	next_active_state--;
2226	}
2227	if (++count >= (int)GET2(code, `1`))
2228	{ ADD_NEW_DATA(-(state_offset + `2` + IMM2_SIZE), `0`, `0`); }
2229	else
2230	{ ADD_NEW_DATA(-state_offset, count, `0`); }
2231	}
2232	}
2233	break;
2234
2235	/ ========================================================================== /
2236	/ These opcodes are followed by a character that is usually compared*
2237	to the current subject character; it is loaded into d. We still get
2238	here even if there is no subject character, because in some cases zero
2239	repetitions are permitted. /*
2240
2241	/-----------------------------------------------------------------/
2242	case OP_CHAR:
2243	if (clen > `0` && c == d) { ADD_NEW(state_offset + dlen + `1`, `0`); }
2244	break;
2245
2246	/-----------------------------------------------------------------/
2247	case OP_CHARI:
2248	if (clen == `0`) break;
2249
2250	#ifdef SUPPORT_UNICODE
2251	if (utf_or_ucp)
2252	{
2253	if (c == d) { ADD_NEW(state_offset + dlen + `1`, `0`); } else
2254	{
2255	unsigned int othercase;
2256	if (c < `128`)
2257	othercase = fcc[c];
2258	else
2259	othercase = UCD_OTHERCASE(c);
2260	if (d == othercase) { ADD_NEW(state_offset + dlen + `1`, `0`); }
2261	}
2262	}
2263	else
2264	#endif /* SUPPORT_UNICODE */
2265	/ Not UTF or UCP mode /
2266	{
2267	if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2268	{ ADD_NEW(state_offset + `2`, `0`); }
2269	}
2270	break;
2271
2272
2273	#ifdef SUPPORT_UNICODE
2274	/-----------------------------------------------------------------/
2275	/ This is a tricky one because it can match more than one character.*
2276	Find out how many characters to skip, and then set up a negative state
2277	to wait for them to pass before continuing. /*
2278
2279	case OP_EXTUNI:
2280	if (clen > `0`)
2281	{
2282	int ncount = `0`;
2283	PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject,
2284	end_subject, utf, &ncount);
2285	if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != `0`)
2286	reset_could_continue = TRUE;
2287	ADD_NEW_DATA(-(state_offset + `1`), `0`, ncount);
2288	}
2289	break;
2290	#endif
2291
2292	/-----------------------------------------------------------------/
2293	/ This is a tricky like EXTUNI because it too can match more than one*
2294	character (when CR is followed by LF). In this case, set up a negative
2295	state to wait for one character to pass before continuing. /*
2296
2297	case OP_ANYNL:
2298	if (clen > `0`) switch(c)
2299	{
2300	case CHAR_VT:
2301	case CHAR_FF:
2302	case CHAR_NEL:
2303	#ifndef EBCDIC
2304	case `0x2028`:
2305	case `0x2029`:
2306	#endif /* Not EBCDIC */
2307	if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2308	/ Fall through /
2309
2310	case CHAR_LF:
2311	ADD_NEW(state_offset + `1`, `0`);
2312	break;
2313
2314	case CHAR_CR:
2315	if (ptr + `1` >= end_subject)
2316	{
2317	ADD_NEW(state_offset + `1`, `0`);
2318	if ((mb->moptions & PCRE2_PARTIAL_HARD) != `0`)
2319	reset_could_continue = TRUE;
2320	}
2321	else if (UCHAR21TEST(ptr + `1`) == CHAR_LF)
2322	{
2323	ADD_NEW_DATA(-(state_offset + `1`), `0`, `1`);
2324	}
2325	else
2326	{
2327	ADD_NEW(state_offset + `1`, `0`);
2328	}
2329	break;
2330	}
2331	break;
2332
2333	/-----------------------------------------------------------------/
2334	case OP_NOT_VSPACE:
2335	if (clen > `0`) switch(c)
2336	{
2337	VSPACE_CASES:
2338	break;
2339
2340	default:
2341	ADD_NEW(state_offset + `1`, `0`);
2342	break;
2343	}
2344	break;
2345
2346	/-----------------------------------------------------------------/
2347	case OP_VSPACE:
2348	if (clen > `0`) switch(c)
2349	{
2350	VSPACE_CASES:
2351	ADD_NEW(state_offset + `1`, `0`);
2352	break;
2353
2354	default:
2355	break;
2356	}
2357	break;
2358
2359	/-----------------------------------------------------------------/
2360	case OP_NOT_HSPACE:
2361	if (clen > `0`) switch(c)
2362	{
2363	HSPACE_CASES:
2364	break;
2365
2366	default:
2367	ADD_NEW(state_offset + `1`, `0`);
2368	break;
2369	}
2370	break;
2371
2372	/-----------------------------------------------------------------/
2373	case OP_HSPACE:
2374	if (clen > `0`) switch(c)
2375	{
2376	HSPACE_CASES:
2377	ADD_NEW(state_offset + `1`, `0`);
2378	break;
2379
2380	default:
2381	break;
2382	}
2383	break;
2384
2385	/-----------------------------------------------------------------/
2386	/ Match a negated single character casefully. /
2387
2388	case OP_NOT:
2389	if (clen > `0` && c != d) { ADD_NEW(state_offset + dlen + `1`, `0`); }
2390	break;
2391
2392	/-----------------------------------------------------------------/
2393	/ Match a negated single character caselessly. /
2394
2395	case OP_NOTI:
2396	if (clen > `0`)
2397	{
2398	uint32_t otherd;
2399	#ifdef SUPPORT_UNICODE
2400	if (utf_or_ucp && d >= `128`)
2401	otherd = UCD_OTHERCASE(d);
2402	else
2403	#endif /* SUPPORT_UNICODE */
2404	otherd = TABLE_GET(d, fcc, d);
2405	if (c != d && c != otherd)
2406	{ ADD_NEW(state_offset + dlen + `1`, `0`); }
2407	}
2408	break;
2409
2410	/-----------------------------------------------------------------/
2411	case OP_PLUSI:
2412	case OP_MINPLUSI:
2413	case OP_POSPLUSI:
2414	case OP_NOTPLUSI:
2415	case OP_NOTMINPLUSI:
2416	case OP_NOTPOSPLUSI:
2417	caseless = TRUE;
2418	codevalue -= OP_STARI - OP_STAR;
2419
2420	/ Fall through /
2421	case OP_PLUS:
2422	case OP_MINPLUS:
2423	case OP_POSPLUS:
2424	case OP_NOTPLUS:
2425	case OP_NOTMINPLUS:
2426	case OP_NOTPOSPLUS:
2427	count = current_state->count; / Already matched /
2428	if (count > `0`) { ADD_ACTIVE(state_offset + dlen + `1`, `0`); }
2429	if (clen > `0`)
2430	{
2431	uint32_t otherd = NOTACHAR;
2432	if (caseless)
2433	{
2434	#ifdef SUPPORT_UNICODE
2435	if (utf_or_ucp && d >= `128`)
2436	otherd = UCD_OTHERCASE(d);
2437	else
2438	#endif /* SUPPORT_UNICODE */
2439	otherd = TABLE_GET(d, fcc, d);
2440	}
2441	if ((c == d \|\| c == otherd) == (codevalue < OP_NOTSTAR))
2442	{
2443	if (count > `0` &&
2444	(codevalue == OP_POSPLUS \|\| codevalue == OP_NOTPOSPLUS))
2445	{
2446	active_count--; / Remove non-match possibility /
2447	next_active_state--;
2448	}
2449	count++;
2450	ADD_NEW(state_offset, count);
2451	}
2452	}
2453	break;
2454
2455	/-----------------------------------------------------------------/
2456	case OP_QUERYI:
2457	case OP_MINQUERYI:
2458	case OP_POSQUERYI:
2459	case OP_NOTQUERYI:
2460	case OP_NOTMINQUERYI:
2461	case OP_NOTPOSQUERYI:
2462	caseless = TRUE;
2463	codevalue -= OP_STARI - OP_STAR;
2464	/ Fall through /
2465	case OP_QUERY:
2466	case OP_MINQUERY:
2467	case OP_POSQUERY:
2468	case OP_NOTQUERY:
2469	case OP_NOTMINQUERY:
2470	case OP_NOTPOSQUERY:
2471	ADD_ACTIVE(state_offset + dlen + `1`, `0`);
2472	if (clen > `0`)
2473	{
2474	uint32_t otherd = NOTACHAR;
2475	if (caseless)
2476	{
2477	#ifdef SUPPORT_UNICODE
2478	if (utf_or_ucp && d >= `128`)
2479	otherd = UCD_OTHERCASE(d);
2480	else
2481	#endif /* SUPPORT_UNICODE */
2482	otherd = TABLE_GET(d, fcc, d);
2483	}
2484	if ((c == d \|\| c == otherd) == (codevalue < OP_NOTSTAR))
2485	{
2486	if (codevalue == OP_POSQUERY \|\| codevalue == OP_NOTPOSQUERY)
2487	{
2488	active_count--; / Remove non-match possibility /
2489	next_active_state--;
2490	}
2491	ADD_NEW(state_offset + dlen + `1`, `0`);
2492	}
2493	}
2494	break;
2495
2496	/-----------------------------------------------------------------/
2497	case OP_STARI:
2498	case OP_MINSTARI:
2499	case OP_POSSTARI:
2500	case OP_NOTSTARI:
2501	case OP_NOTMINSTARI:
2502	case OP_NOTPOSSTARI:
2503	caseless = TRUE;
2504	codevalue -= OP_STARI - OP_STAR;
2505	/ Fall through /
2506	case OP_STAR:
2507	case OP_MINSTAR:
2508	case OP_POSSTAR:
2509	case OP_NOTSTAR:
2510	case OP_NOTMINSTAR:
2511	case OP_NOTPOSSTAR:
2512	ADD_ACTIVE(state_offset + dlen + `1`, `0`);
2513	if (clen > `0`)
2514	{
2515	uint32_t otherd = NOTACHAR;
2516	if (caseless)
2517	{
2518	#ifdef SUPPORT_UNICODE
2519	if (utf_or_ucp && d >= `128`)
2520	otherd = UCD_OTHERCASE(d);
2521	else
2522	#endif /* SUPPORT_UNICODE */
2523	otherd = TABLE_GET(d, fcc, d);
2524	}
2525	if ((c == d \|\| c == otherd) == (codevalue < OP_NOTSTAR))
2526	{
2527	if (codevalue == OP_POSSTAR \|\| codevalue == OP_NOTPOSSTAR)
2528	{
2529	active_count--; / Remove non-match possibility /
2530	next_active_state--;
2531	}
2532	ADD_NEW(state_offset, `0`);
2533	}
2534	}
2535	break;
2536
2537	/-----------------------------------------------------------------/
2538	case OP_EXACTI:
2539	case OP_NOTEXACTI:
2540	caseless = TRUE;
2541	codevalue -= OP_STARI - OP_STAR;
2542	/ Fall through /
2543	case OP_EXACT:
2544	case OP_NOTEXACT:
2545	count = current_state->count; / Number already matched /
2546	if (clen > `0`)
2547	{
2548	uint32_t otherd = NOTACHAR;
2549	if (caseless)
2550	{
2551	#ifdef SUPPORT_UNICODE
2552	if (utf_or_ucp && d >= `128`)
2553	otherd = UCD_OTHERCASE(d);
2554	else
2555	#endif /* SUPPORT_UNICODE */
2556	otherd = TABLE_GET(d, fcc, d);
2557	}
2558	if ((c == d \|\| c == otherd) == (codevalue < OP_NOTSTAR))
2559	{
2560	if (++count >= (int)GET2(code, `1`))
2561	{ ADD_NEW(state_offset + dlen + `1` + IMM2_SIZE, `0`); }
2562	else
2563	{ ADD_NEW(state_offset, count); }
2564	}
2565	}
2566	break;
2567
2568	/-----------------------------------------------------------------/
2569	case OP_UPTOI:
2570	case OP_MINUPTOI:
2571	case OP_POSUPTOI:
2572	case OP_NOTUPTOI:
2573	case OP_NOTMINUPTOI:
2574	case OP_NOTPOSUPTOI:
2575	caseless = TRUE;
2576	codevalue -= OP_STARI - OP_STAR;
2577	/ Fall through /
2578	case OP_UPTO:
2579	case OP_MINUPTO:
2580	case OP_POSUPTO:
2581	case OP_NOTUPTO:
2582	case OP_NOTMINUPTO:
2583	case OP_NOTPOSUPTO:
2584	ADD_ACTIVE(state_offset + dlen + `1` + IMM2_SIZE, `0`);
2585	count = current_state->count; / Number already matched /
2586	if (clen > `0`)
2587	{
2588	uint32_t otherd = NOTACHAR;
2589	if (caseless)
2590	{
2591	#ifdef SUPPORT_UNICODE
2592	if (utf_or_ucp && d >= `128`)
2593	otherd = UCD_OTHERCASE(d);
2594	else
2595	#endif /* SUPPORT_UNICODE */
2596	otherd = TABLE_GET(d, fcc, d);
2597	}
2598	if ((c == d \|\| c == otherd) == (codevalue < OP_NOTSTAR))
2599	{
2600	if (codevalue == OP_POSUPTO \|\| codevalue == OP_NOTPOSUPTO)
2601	{
2602	active_count--; / Remove non-match possibility /
2603	next_active_state--;
2604	}
2605	if (++count >= (int)GET2(code, `1`))
2606	{ ADD_NEW(state_offset + dlen + `1` + IMM2_SIZE, `0`); }
2607	else
2608	{ ADD_NEW(state_offset, count); }
2609	}
2610	}
2611	break;
2612
2613
2614	/ ========================================================================== /
2615	/ These are the class-handling opcodes /
2616
2617	case OP_CLASS:
2618	case OP_NCLASS:
2619	case OP_XCLASS:
2620	{
2621	BOOL isinclass = FALSE;
2622	int next_state_offset;
2623	PCRE2_SPTR ecode;
2624
2625	/ For a simple class, there is always just a 32-byte table, and we*
2626	can set isinclass from it. /*
2627
2628	if (codevalue != OP_XCLASS)
2629	{
2630	ecode = code + `1` + (`32` / sizeof(PCRE2_UCHAR));
2631	if (clen > `0`)
2632	{
2633	isinclass = (c > `255`)? (codevalue == OP_NCLASS) :
2634	((((uint8_t *)(code + `1`))[c/`8`] & (`1u` << (c&`7`))) != `0`);
2635	}
2636	}
2637
2638	/ An extended class may have a table or a list of single characters,*
2639	ranges, or both, and it may be positive or negative. There's a
2640	function that sorts all this out. /*
2641
2642	else
2643	{
2644	ecode = code + GET(code, `1`);
2645	if (clen > `0`) isinclass = PRIV(xclass)(c, code + `1` + LINK_SIZE, utf);
2646	}
2647
2648	/ At this point, isinclass is set for all kinds of class, and ecode*
2649	points to the byte after the end of the class. If there is a
2650	quantifier, this is where it will be. /*
2651
2652	next_state_offset = (int)(ecode - start_code);
2653
2654	switch (*ecode)
2655	{
2656	case OP_CRSTAR:
2657	case OP_CRMINSTAR:
2658	case OP_CRPOSSTAR:
2659	ADD_ACTIVE(next_state_offset + `1`, `0`);
2660	if (isinclass)
2661	{
2662	if (*ecode == OP_CRPOSSTAR)
2663	{
2664	active_count--; / Remove non-match possibility /
2665	next_active_state--;
2666	}
2667	ADD_NEW(state_offset, `0`);
2668	}
2669	break;
2670
2671	case OP_CRPLUS:
2672	case OP_CRMINPLUS:
2673	case OP_CRPOSPLUS:
2674	count = current_state->count; / Already matched /
2675	if (count > `0`) { ADD_ACTIVE(next_state_offset + `1`, `0`); }
2676	if (isinclass)
2677	{
2678	if (count > `0` && *ecode == OP_CRPOSPLUS)
2679	{
2680	active_count--; / Remove non-match possibility /
2681	next_active_state--;
2682	}
2683	count++;
2684	ADD_NEW(state_offset, count);
2685	}
2686	break;
2687
2688	case OP_CRQUERY:
2689	case OP_CRMINQUERY:
2690	case OP_CRPOSQUERY:
2691	ADD_ACTIVE(next_state_offset + `1`, `0`);
2692	if (isinclass)
2693	{
2694	if (*ecode == OP_CRPOSQUERY)
2695	{
2696	active_count--; / Remove non-match possibility /
2697	next_active_state--;
2698	}
2699	ADD_NEW(next_state_offset + `1`, `0`);
2700	}
2701	break;
2702
2703	case OP_CRRANGE:
2704	case OP_CRMINRANGE:
2705	case OP_CRPOSRANGE:
2706	count = current_state->count; / Already matched /
2707	if (count >= (int)GET2(ecode, `1`))
2708	{ ADD_ACTIVE(next_state_offset + `1` + `2` * IMM2_SIZE, `0`); }
2709	if (isinclass)
2710	{
2711	int max = (int)GET2(ecode, `1` + IMM2_SIZE);
2712
2713	if (ecode == OP_CRPOSRANGE && count >= (int*)GET2(ecode, `1`))
2714	{
2715	active_count--; / Remove non-match possibility /
2716	next_active_state--;
2717	}
2718
2719	if (++count >= max && max != `0`) / Max 0 => no limit /
2720	{ ADD_NEW(next_state_offset + `1` + `2` * IMM2_SIZE, `0`); }
2721	else
2722	{ ADD_NEW(state_offset, count); }
2723	}
2724	break;
2725
2726	default:
2727	if (isinclass) { ADD_NEW(next_state_offset, `0`); }
2728	break;
2729	}
2730	}
2731	break;
2732
2733	/ ========================================================================== /
2734	/ These are the opcodes for fancy brackets of various kinds. We have*
2735	to use recursion in order to handle them. The "always failing" assertion
2736	(?!) is optimised to OP_FAIL when compiling, so we have to support that,
2737	though the other "backtracking verbs" are not supported. /*
2738
2739	case OP_FAIL:
2740	forced_fail++; / Count FAILs for multiple states /
2741	break;
2742
2743	case OP_ASSERT:
2744	case OP_ASSERT_NOT:
2745	case OP_ASSERTBACK:
2746	case OP_ASSERTBACK_NOT:
2747	{
2748	int rc;
2749	int *local_workspace;
2750	PCRE2_SIZE *local_offsets;
2751	PCRE2_SPTR endasscode = code + GET(code, `1`);
2752	RWS_anchor rws = (RWS_anchor )RWS;
2753
2754	if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2755	{
2756	rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2757	if (rc != `0`) return rc;
2758	RWS = (int *)rws;
2759	}
2760
2761	local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2762	local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2763	rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2764
2765	while (*endasscode == OP_ALT) endasscode += GET(endasscode, `1`);
2766
2767	rc = internal_dfa_match(
2768	mb, / static match data /
2769	code, / this subexpression's code /
2770	ptr, / where we currently are /
2771	(PCRE2_SIZE)(ptr - start_subject), / start offset /
2772	local_offsets, / offset vector /
2773	RWS_OVEC_OSIZE/OVEC_UNIT, / size of same /
2774	local_workspace, / workspace vector /
2775	RWS_RSIZE, / size of same /
2776	rlevel, / function recursion level /
2777	RWS); / recursion workspace /
2778
2779	rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2780
2781	if (rc < `0` && rc != PCRE2_ERROR_NOMATCH) return rc;
2782	if ((rc >= `0`) == (codevalue == OP_ASSERT \|\| codevalue == OP_ASSERTBACK))
2783	{ ADD_ACTIVE((int)(endasscode + LINK_SIZE + `1` - start_code), `0`); }
2784	}
2785	break;
2786
2787	/-----------------------------------------------------------------/
2788	case OP_COND:
2789	case OP_SCOND:
2790	{
2791	int codelink = (int)GET(code, `1`);
2792	PCRE2_UCHAR condcode;
2793
2794	/ Because of the way auto-callout works during compile, a callout item*
2795	is inserted between OP_COND and an assertion condition. This does not
2796	happen for the other conditions. /*
2797
2798	if (code[LINK_SIZE + `1`] == OP_CALLOUT
2799	\|\| code[LINK_SIZE + `1`] == OP_CALLOUT_STR)
2800	{
2801	PCRE2_SIZE callout_length;
2802	rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb,
2803	`1` + LINK_SIZE, &callout_length);
2804	if (rrc < `0`) return rrc; / Abandon /
2805	if (rrc > `0`) break; / Fail this thread /
2806	code += callout_length; / Skip callout data /
2807	}
2808
2809	condcode = code[LINK_SIZE+`1`];
2810
2811	/ Back reference conditions and duplicate named recursion conditions*
2812	are not supported /*
2813
2814	if (condcode == OP_CREF \|\| condcode == OP_DNCREF \|\|
2815	condcode == OP_DNRREF)
2816	return PCRE2_ERROR_DFA_UCOND;
2817
2818	/ The DEFINE condition is always false, and the assertion (?!) is*
2819	converted to OP_FAIL. /*
2820
2821	if (condcode == OP_FALSE \|\| condcode == OP_FAIL)
2822	{ ADD_ACTIVE(state_offset + codelink + LINK_SIZE + `1`, `0`); }
2823
2824	/ There is also an always-true condition /
2825
2826	else if (condcode == OP_TRUE)
2827	{ ADD_ACTIVE(state_offset + LINK_SIZE + `2`, `0`); }
2828
2829	/ The only supported version of OP_RREF is for the value RREF_ANY,*
2830	which means "test if in any recursion". We can't test for specifically
2831	recursed groups. /*
2832
2833	else if (condcode == OP_RREF)
2834	{
2835	unsigned int value = GET2(code, LINK_SIZE + `2`);
2836	if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2837	if (mb->recursive != NULL)
2838	{ ADD_ACTIVE(state_offset + LINK_SIZE + `2` + IMM2_SIZE, `0`); }
2839	else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + `1`, `0`); }
2840	}
2841
2842	/ Otherwise, the condition is an assertion /
2843
2844	else
2845	{
2846	int rc;
2847	int *local_workspace;
2848	PCRE2_SIZE *local_offsets;
2849	PCRE2_SPTR asscode = code + LINK_SIZE + `1`;
2850	PCRE2_SPTR endasscode = asscode + GET(asscode, `1`);
2851	RWS_anchor rws = (RWS_anchor )RWS;
2852
2853	if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2854	{
2855	rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2856	if (rc != `0`) return rc;
2857	RWS = (int *)rws;
2858	}
2859
2860	local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2861	local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2862	rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2863
2864	while (*endasscode == OP_ALT) endasscode += GET(endasscode, `1`);
2865
2866	rc = internal_dfa_match(
2867	mb, / fixed match data /
2868	asscode, / this subexpression's code /
2869	ptr, / where we currently are /
2870	(PCRE2_SIZE)(ptr - start_subject), / start offset /
2871	local_offsets, / offset vector /
2872	RWS_OVEC_OSIZE/OVEC_UNIT, / size of same /
2873	local_workspace, / workspace vector /
2874	RWS_RSIZE, / size of same /
2875	rlevel, / function recursion level /
2876	RWS); / recursion workspace /
2877
2878	rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2879
2880	if (rc < `0` && rc != PCRE2_ERROR_NOMATCH) return rc;
2881	if ((rc >= `0`) ==
2882	(condcode == OP_ASSERT \|\| condcode == OP_ASSERTBACK))
2883	{ ADD_ACTIVE((int)(endasscode + LINK_SIZE + `1` - start_code), `0`); }
2884	else
2885	{ ADD_ACTIVE(state_offset + codelink + LINK_SIZE + `1`, `0`); }
2886	}
2887	}
2888	break;
2889
2890	/-----------------------------------------------------------------/
2891	case OP_RECURSE:
2892	{
2893	int rc;
2894	int *local_workspace;
2895	PCRE2_SIZE *local_offsets;
2896	RWS_anchor rws = (RWS_anchor )RWS;
2897	dfa_recursion_info *ri;
2898	PCRE2_SPTR callpat = start_code + GET(code, `1`);
2899	uint32_t recno = (callpat == mb->start_code)? `0` :
2900	GET2(callpat, `1` + LINK_SIZE);
2901
2902	if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE)
2903	{
2904	rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb);
2905	if (rc != `0`) return rc;
2906	RWS = (int *)rws;
2907	}
2908
2909	local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2910	local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE;
2911	rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
2912
2913	/ Check for repeating a recursion without advancing the subject*
2914	pointer. This should catch convoluted mutual recursions. (Some simple
2915	cases are caught at compile time.) /*
2916
2917	for (ri = mb->recursive; ri != NULL; ri = ri->prevrec)
2918	if (recno == ri->group_num && ptr == ri->subject_position)
2919	return PCRE2_ERROR_RECURSELOOP;
2920
2921	/ Remember this recursion and where we started it so as to*
2922	catch infinite loops. /*
2923
2924	new_recursive.group_num = recno;
2925	new_recursive.subject_position = ptr;
2926	new_recursive.prevrec = mb->recursive;
2927	mb->recursive = &new_recursive;
2928
2929	rc = internal_dfa_match(
2930	mb, / fixed match data /
2931	callpat, / this subexpression's code /
2932	ptr, / where we currently are /
2933	(PCRE2_SIZE)(ptr - start_subject), / start offset /
2934	local_offsets, / offset vector /
2935	RWS_OVEC_RSIZE/OVEC_UNIT, / size of same /
2936	local_workspace, / workspace vector /
2937	RWS_RSIZE, / size of same /
2938	rlevel, / function recursion level /
2939	RWS); / recursion workspace /
2940
2941	rws->free += RWS_RSIZE + RWS_OVEC_RSIZE;
2942	mb->recursive = new_recursive.prevrec; / Done this recursion /
2943
2944	/ Ran out of internal offsets /
2945
2946	if (rc == `0`) return PCRE2_ERROR_DFA_RECURSE;
2947
2948	/ For each successful matched substring, set up the next state with a*
2949	count of characters to skip before trying it. Note that the count is in
2950	characters, not bytes. /*
2951
2952	if (rc > `0`)
2953	{
2954	for (rc = rc*`2` - `2`; rc >= `0`; rc -= `2`)
2955	{
2956	PCRE2_SIZE charcount = local_offsets[rc+`1`] - local_offsets[rc];
2957	#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2958	if (utf)
2959	{
2960	PCRE2_SPTR p = start_subject + local_offsets[rc];
2961	PCRE2_SPTR pp = start_subject + local_offsets[rc+`1`];
2962	while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
2963	}
2964	#endif
2965	if (charcount > `0`)
2966	{
2967	ADD_NEW_DATA(-(state_offset + LINK_SIZE + `1`), `0`,
2968	(int)(charcount - `1`));
2969	}
2970	else
2971	{
2972	ADD_ACTIVE(state_offset + LINK_SIZE + `1`, `0`);
2973	}
2974	}
2975	}
2976	else if (rc != PCRE2_ERROR_NOMATCH) return rc;
2977	}
2978	break;
2979
2980	/-----------------------------------------------------------------/
2981	case OP_BRAPOS:
2982	case OP_SBRAPOS:
2983	case OP_CBRAPOS:
2984	case OP_SCBRAPOS:
2985	case OP_BRAPOSZERO:
2986	{
2987	int rc;
2988	int *local_workspace;
2989	PCRE2_SIZE *local_offsets;
2990	PCRE2_SIZE charcount, matched_count;
2991	PCRE2_SPTR local_ptr = ptr;
2992	RWS_anchor rws = (RWS_anchor )RWS;
2993	BOOL allow_zero;
2994
2995	if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2996	{
2997	rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2998	if (rc != `0`) return rc;
2999	RWS = (int *)rws;
3000	}
3001
3002	local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3003	local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3004	rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3005
3006	if (codevalue == OP_BRAPOSZERO)
3007	{
3008	allow_zero = TRUE;
3009	codevalue = (++code); /* Codevalue will be one of above BRAs /
3010	}
3011	else allow_zero = FALSE;
3012
3013	/ Loop to match the subpattern as many times as possible as if it were*
3014	a complete pattern. /*
3015
3016	for (matched_count = `0`;; matched_count++)
3017	{
3018	rc = internal_dfa_match(
3019	mb, / fixed match data /
3020	code, / this subexpression's code /
3021	local_ptr, / where we currently are /
3022	(PCRE2_SIZE)(ptr - start_subject), / start offset /
3023	local_offsets, / offset vector /
3024	RWS_OVEC_OSIZE/OVEC_UNIT, / size of same /
3025	local_workspace, / workspace vector /
3026	RWS_RSIZE, / size of same /
3027	rlevel, / function recursion level /
3028	RWS); / recursion workspace /
3029
3030	/ Failed to match /
3031
3032	if (rc < `0`)
3033	{
3034	if (rc != PCRE2_ERROR_NOMATCH) return rc;
3035	break;
3036	}
3037
3038	/ Matched: break the loop if zero characters matched. /
3039
3040	charcount = local_offsets[`1`] - local_offsets[`0`];
3041	if (charcount == `0`) break;
3042	local_ptr += charcount; / Advance temporary position ptr /
3043	}
3044
3045	rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3046
3047	/ At this point we have matched the subpattern matched_count*
3048	times, and local_ptr is pointing to the character after the end of the
3049	last match. /*
3050
3051	if (matched_count > `0` \|\| allow_zero)
3052	{
3053	PCRE2_SPTR end_subpattern = code;
3054	int next_state_offset;
3055
3056	do { end_subpattern += GET(end_subpattern, `1`); }
3057	while (*end_subpattern == OP_ALT);
3058	next_state_offset =
3059	(int)(end_subpattern - start_code + LINK_SIZE + `1`);
3060
3061	/ Optimization: if there are no more active states, and there*
3062	are no new states yet set up, then skip over the subject string
3063	right here, to save looping. Otherwise, set up the new state to swing
3064	into action when the end of the matched substring is reached. /*
3065
3066	if (i + `1` >= active_count && new_count == `0`)
3067	{
3068	ptr = local_ptr;
3069	clen = `0`;
3070	ADD_NEW(next_state_offset, `0`);
3071	}
3072	else
3073	{
3074	PCRE2_SPTR p = ptr;
3075	PCRE2_SPTR pp = local_ptr;
3076	charcount = (PCRE2_SIZE)(pp - p);
3077	#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3078	if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3079	#endif
3080	ADD_NEW_DATA(-next_state_offset, `0`, (int)(charcount - `1`));
3081	}
3082	}
3083	}
3084	break;
3085
3086	/-----------------------------------------------------------------/
3087	case OP_ONCE:
3088	{
3089	int rc;
3090	int *local_workspace;
3091	PCRE2_SIZE *local_offsets;
3092	RWS_anchor rws = (RWS_anchor )RWS;
3093
3094	if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3095	{
3096	rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3097	if (rc != `0`) return rc;
3098	RWS = (int *)rws;
3099	}
3100
3101	local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3102	local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3103	rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3104
3105	rc = internal_dfa_match(
3106	mb, / fixed match data /
3107	code, / this subexpression's code /
3108	ptr, / where we currently are /
3109	(PCRE2_SIZE)(ptr - start_subject), / start offset /
3110	local_offsets, / offset vector /
3111	RWS_OVEC_OSIZE/OVEC_UNIT, / size of same /
3112	local_workspace, / workspace vector /
3113	RWS_RSIZE, / size of same /
3114	rlevel, / function recursion level /
3115	RWS); / recursion workspace /
3116
3117	rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3118
3119	if (rc >= `0`)
3120	{
3121	PCRE2_SPTR end_subpattern = code;
3122	PCRE2_SIZE charcount = local_offsets[`1`] - local_offsets[`0`];
3123	int next_state_offset, repeat_state_offset;
3124
3125	do { end_subpattern += GET(end_subpattern, `1`); }
3126	while (*end_subpattern == OP_ALT);
3127	next_state_offset =
3128	(int)(end_subpattern - start_code + LINK_SIZE + `1`);
3129
3130	/ If the end of this subpattern is KETRMAX or KETRMIN, we must*
3131	arrange for the repeat state also to be added to the relevant list.
3132	Calculate the offset, or set -1 for no repeat. /*
3133
3134	repeat_state_offset = (*end_subpattern == OP_KETRMAX \|\|
3135	*end_subpattern == OP_KETRMIN)?
3136	(int)(end_subpattern - start_code - GET(end_subpattern, `1`)) : -`1`;
3137
3138	/ If we have matched an empty string, add the next state at the*
3139	current character pointer. This is important so that the duplicate
3140	checking kicks in, which is what breaks infinite loops that match an
3141	empty string. /*
3142
3143	if (charcount == `0`)
3144	{
3145	ADD_ACTIVE(next_state_offset, `0`);
3146	}
3147
3148	/ Optimization: if there are no more active states, and there*
3149	are no new states yet set up, then skip over the subject string
3150	right here, to save looping. Otherwise, set up the new state to swing
3151	into action when the end of the matched substring is reached. /*
3152
3153	else if (i + `1` >= active_count && new_count == `0`)
3154	{
3155	ptr += charcount;
3156	clen = `0`;
3157	ADD_NEW(next_state_offset, `0`);
3158
3159	/ If we are adding a repeat state at the new character position,*
3160	we must fudge things so that it is the only current state.
3161	Otherwise, it might be a duplicate of one we processed before, and
3162	that would cause it to be skipped. /*
3163
3164	if (repeat_state_offset >= `0`)
3165	{
3166	next_active_state = active_states;
3167	active_count = `0`;
3168	i = -`1`;
3169	ADD_ACTIVE(repeat_state_offset, `0`);
3170	}
3171	}
3172	else
3173	{
3174	#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3175	if (utf)
3176	{
3177	PCRE2_SPTR p = start_subject + local_offsets[`0`];
3178	PCRE2_SPTR pp = start_subject + local_offsets[`1`];
3179	while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3180	}
3181	#endif
3182	ADD_NEW_DATA(-next_state_offset, `0`, (int)(charcount - `1`));
3183	if (repeat_state_offset >= `0`)
3184	{ ADD_NEW_DATA(-repeat_state_offset, `0`, (int)(charcount - `1`)); }
3185	}
3186	}
3187	else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3188	}
3189	break;
3190
3191
3192	/ ========================================================================== /
3193	/ Handle callouts /
3194
3195	case OP_CALLOUT:
3196	case OP_CALLOUT_STR:
3197	{
3198	PCRE2_SIZE callout_length;
3199	rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb, `0`,
3200	&callout_length);
3201	if (rrc < `0`) return rrc; / Abandon /
3202	if (rrc == `0`)
3203	{ ADD_ACTIVE(state_offset + (int)callout_length, `0`); }
3204	}
3205	break;
3206
3207
3208	/ ========================================================================== /
3209	default: / Unsupported opcode /
3210	return PCRE2_ERROR_DFA_UITEM;
3211	}
3212
3213	NEXT_ACTIVE_STATE: continue;
3214
3215	} / End of loop scanning active states /
3216
3217	/ We have finished the processing at the current subject character. If no*
3218	new states have been set for the next character, we have found all the
3219	matches that we are going to find. If partial matching has been requested,
3220	check for appropriate conditions.
3221
3222	The "forced_ fail" variable counts the number of (F) encountered for the*
3223	character. If it is equal to the original active_count (saved in
3224	workspace[1]) it means that (F) was found on every active state. In this*
3225	case we don't want to give a partial match.
3226
3227	The "could_continue" variable is true if a state could have continued but
3228	for the fact that the end of the subject was reached. /*
3229
3230	if (new_count <= `0`)
3231	{
3232	if (could_continue && / Some could go on, and /
3233	forced_fail != workspace[`1`] && / Not all forced fail & /
3234	( / either... /
3235	(mb->moptions & PCRE2_PARTIAL_HARD) != `0` / Hard partial /
3236	\|\| / or... /
3237	((mb->moptions & PCRE2_PARTIAL_SOFT) != `0` && / Soft partial and /
3238	match_count < `0`) / no matches /
3239	) && / And... /
3240	(
3241	partial_newline \|\| / Either partial NL /
3242	( / or ... /
3243	ptr >= end_subject && / End of subject and /
3244	( / either /
3245	ptr > mb->start_used_ptr \|\| / Inspected non-empty string /
3246	mb->allowemptypartial / or pattern has lookbehind /
3247	) / or could match empty /
3248	)
3249	))
3250	match_count = PCRE2_ERROR_PARTIAL;
3251	break; / Exit from loop along the subject string /
3252	}
3253
3254	/ One or more states are active for the next character. /
3255
3256	ptr += clen; / Advance to next subject character /
3257	} / Loop to move along the subject string /
3258
3259	/ Control gets here from "break" a few lines above. If we have a match and*
3260	PCRE2_ENDANCHORED is set, the match fails. /*
3261
3262	if (match_count >= `0` &&
3263	((mb->moptions \| mb->poptions) & PCRE2_ENDANCHORED) != `0` &&
3264	ptr < end_subject)
3265	match_count = PCRE2_ERROR_NOMATCH;
3266
3267	return match_count;
3268	}
3269
3270
3271
3272	/*************************************************
3273	* Match a pattern using the DFA algorithm *
3274	*************************************************/
3275
3276	/ This function matches a compiled pattern to a subject string, using the*
3277	alternate matching algorithm that finds all matches at once.
3278
3279	Arguments:
3280	code points to the compiled pattern
3281	subject subject string
3282	length length of subject string
3283	startoffset where to start matching in the subject
3284	options option bits
3285	match_data points to a match data structure
3286	gcontext points to a match context
3287	workspace pointer to workspace
3288	wscount size of workspace
3289
3290	Returns: > 0 => number of match offset pairs placed in offsets
3291	= 0 => offsets overflowed; longest matches are present
3292	-1 => failed to match
3293	< -1 => some kind of unexpected problem
3294	*/
3295
3296	PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
3297	pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3298	PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3299	pcre2_match_context mcontext, int* *workspace, PCRE2_SIZE wscount)
3300	{
3301	int rc;
3302	int was_zero_terminated = `0`;
3303
3304	const pcre2_real_code re = (const* pcre2_real_code *)code;
3305
3306	PCRE2_SPTR start_match;
3307	PCRE2_SPTR end_subject;
3308	PCRE2_SPTR bumpalong_limit;
3309	PCRE2_SPTR req_cu_ptr;
3310
3311	BOOL utf, anchored, startline, firstline;
3312	BOOL has_first_cu = FALSE;
3313	BOOL has_req_cu = FALSE;
3314
3315	#if PCRE2_CODE_UNIT_WIDTH == 8
3316	PCRE2_SPTR memchr_found_first_cu = NULL;
3317	PCRE2_SPTR memchr_found_first_cu2 = NULL;
3318	#endif
3319
3320	PCRE2_UCHAR first_cu = `0`;
3321	PCRE2_UCHAR first_cu2 = `0`;
3322	PCRE2_UCHAR req_cu = `0`;
3323	PCRE2_UCHAR req_cu2 = `0`;
3324
3325	const uint8_t *start_bits = NULL;
3326
3327	/ We need to have mb pointing to a match block, because the IS_NEWLINE macro*
3328	is used below, and it expects NLBLOCK to be defined as a pointer. /*
3329
3330	pcre2_callout_block cb;
3331	dfa_match_block actual_match_block;
3332	dfa_match_block *mb = &actual_match_block;
3333
3334	/ Set up a starting block of memory for use during recursive calls to*
3335	internal_dfa_match(). By putting this on the stack, it minimizes resource use
3336	in the case when it is not needed. If this is too small, more memory is
3337	obtained from the heap. At the start of each block is an anchor structure./*
3338
3339	int base_recursion_workspace[RWS_BASE_SIZE];
3340	RWS_anchor rws = (RWS_anchor )base_recursion_workspace;
3341	rws->next = NULL;
3342	rws->size = RWS_BASE_SIZE;
3343	rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;
3344
3345	/ Recognize NULL, length 0 as an empty string. /
3346
3347	if (subject == NULL && length == `0`) subject = (PCRE2_SPTR)"";
3348
3349	/ Plausibility checks /
3350
3351	if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != `0`) return PCRE2_ERROR_BADOPTION;
3352	if (re == NULL \|\| subject == NULL \|\| workspace == NULL \|\| match_data == NULL)
3353	return PCRE2_ERROR_NULL;
3354
3355	if (length == PCRE2_ZERO_TERMINATED)
3356	{
3357	length = PRIV(strlen)(subject);
3358	was_zero_terminated = `1`;
3359	}
3360
3361	if (wscount < `20`) return PCRE2_ERROR_DFA_WSSIZE;
3362	if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3363
3364	/ Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same*
3365	time. /*
3366
3367	if ((options & (PCRE2_PARTIAL_HARD\|PCRE2_PARTIAL_SOFT)) != `0` &&
3368	((re->overall_options \| options) & PCRE2_ENDANCHORED) != `0`)
3369	return PCRE2_ERROR_BADOPTION;
3370
3371	/ Invalid UTF support is not available for DFA matching. /
3372
3373	if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != `0`)
3374	return PCRE2_ERROR_DFA_UINVALID_UTF;
3375
3376	/ Check that the first field in the block is the magic number. If it is not,*
3377	return with PCRE2_ERROR_BADMAGIC. /*
3378
3379	if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3380
3381	/ Check the code unit width. /
3382
3383	if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/`8`)
3384	return PCRE2_ERROR_BADMODE;
3385
3386	/ PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the*
3387	options variable for this function. Users of PCRE2 who are not calling the
3388	function directly would like to have a way of setting these flags, in the same
3389	way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
3390	constructions like (NO_AUTOPOSSESS). To enable this, (NOTEMPTY) and
3391	(NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be*
3392	transferred to the options for this function. The bits are guaranteed to be
3393	adjacent, but do not have the same values. This bit of Boolean trickery assumes
3394	that the match-time bits are not more significant than the flag bits. If by
3395	accident this is not the case, a compile-time division by zero error will
3396	occur. /*
3397
3398	#define FF (PCRE2_NOTEMPTY_SET\|PCRE2_NE_ATST_SET)
3399	#define OO (PCRE2_NOTEMPTY\|PCRE2_NOTEMPTY_ATSTART)
3400	options \|= (re->flags & FF) / ((FF & (~FF+`1`)) / (OO & (~OO+`1`)));
3401	#undef FF
3402	#undef OO
3403
3404	/ If restarting after a partial match, do some sanity checks on the contents*
3405	of the workspace. /*
3406
3407	if ((options & PCRE2_DFA_RESTART) != `0`)
3408	{
3409	if ((workspace[`0`] & (-`2`)) != `0` \|\| workspace[`1`] < `1` \|\|
3410	workspace[`1`] > (int)((wscount - `2`)/INTS_PER_STATEBLOCK))
3411	return PCRE2_ERROR_DFA_BADRESTART;
3412	}
3413
3414	/ Set some local values /
3415
3416	utf = (re->overall_options & PCRE2_UTF) != `0`;
3417	start_match = subject + start_offset;
3418	end_subject = subject + length;
3419	req_cu_ptr = start_match - `1`;
3420	anchored = (options & (PCRE2_ANCHORED\|PCRE2_DFA_RESTART)) != `0` \|\|
3421	(re->overall_options & PCRE2_ANCHORED) != `0`;
3422
3423	/ The "must be at the start of a line" flags are used in a loop when finding*
3424	where to start. /*
3425
3426	startline = (re->flags & PCRE2_STARTLINE) != `0`;
3427	firstline = (re->overall_options & PCRE2_FIRSTLINE) != `0`;
3428	bumpalong_limit = end_subject;
3429
3430	/ Initialize and set up the fixed fields in the callout block, with a pointer*
3431	in the match block. /*
3432
3433	mb->cb = &cb;
3434	cb.version = `2`;
3435	cb.subject = subject;
3436	cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
3437	cb.callout_flags = `0`;
3438	cb.capture_top = `1`; / No capture support /
3439	cb.capture_last = `0`;
3440	cb.mark = NULL; / No (MARK) support /*
3441
3442	/ Get data from the match context, if present, and fill in the remaining*
3443	fields in the match block. It is an error to set an offset limit without
3444	setting the flag at compile time. /*
3445
3446	if (mcontext == NULL)
3447	{
3448	mb->callout = NULL;
3449	mb->memctl = re->memctl;
3450	mb->match_limit = PRIV(default_match_context).match_limit;
3451	mb->match_limit_depth = PRIV(default_match_context).depth_limit;
3452	mb->heap_limit = PRIV(default_match_context).heap_limit;
3453	}
3454	else
3455	{
3456	if (mcontext->offset_limit != PCRE2_UNSET)
3457	{
3458	if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == `0`)
3459	return PCRE2_ERROR_BADOFFSETLIMIT;
3460	bumpalong_limit = subject + mcontext->offset_limit;
3461	}
3462	mb->callout = mcontext->callout;
3463	mb->callout_data = mcontext->callout_data;
3464	mb->memctl = mcontext->memctl;
3465	mb->match_limit = mcontext->match_limit;
3466	mb->match_limit_depth = mcontext->depth_limit;
3467	mb->heap_limit = mcontext->heap_limit;
3468	}
3469
3470	if (mb->match_limit > re->limit_match)
3471	mb->match_limit = re->limit_match;
3472
3473	if (mb->match_limit_depth > re->limit_depth)
3474	mb->match_limit_depth = re->limit_depth;
3475
3476	if (mb->heap_limit > re->limit_heap)
3477	mb->heap_limit = re->limit_heap;
3478
3479	mb->start_code = (PCRE2_UCHAR )((uint8_t )re + sizeof(pcre2_real_code)) +
3480	re->name_count * re->name_entry_size;
3481	mb->tables = re->tables;
3482	mb->start_subject = subject;
3483	mb->end_subject = end_subject;
3484	mb->start_offset = start_offset;
3485	mb->allowemptypartial = (re->max_lookbehind > `0`) \|\|
3486	(re->flags & PCRE2_MATCH_EMPTY) != `0`;
3487	mb->moptions = options;
3488	mb->poptions = re->overall_options;
3489	mb->match_call_count = `0`;
3490	mb->heap_used = `0`;
3491
3492	/ Process the \R and newline settings. /
3493
3494	mb->bsr_convention = re->bsr_convention;
3495	mb->nltype = NLTYPE_FIXED;
3496	switch(re->newline_convention)
3497	{
3498	case PCRE2_NEWLINE_CR:
3499	mb->nllen = `1`;
3500	mb->nl[`0`] = CHAR_CR;
3501	break;
3502
3503	case PCRE2_NEWLINE_LF:
3504	mb->nllen = `1`;
3505	mb->nl[`0`] = CHAR_NL;
3506	break;
3507
3508	case PCRE2_NEWLINE_NUL:
3509	mb->nllen = `1`;
3510	mb->nl[`0`] = CHAR_NUL;
3511	break;
3512
3513	case PCRE2_NEWLINE_CRLF:
3514	mb->nllen = `2`;
3515	mb->nl[`0`] = CHAR_CR;
3516	mb->nl[`1`] = CHAR_NL;
3517	break;
3518
3519	case PCRE2_NEWLINE_ANY:
3520	mb->nltype = NLTYPE_ANY;
3521	break;
3522
3523	case PCRE2_NEWLINE_ANYCRLF:
3524	mb->nltype = NLTYPE_ANYCRLF;
3525	break;
3526
3527	default: return PCRE2_ERROR_INTERNAL;
3528	}
3529
3530	/ Check a UTF string for validity if required. For 8-bit and 16-bit strings,*
3531	we must also check that a starting offset does not point into the middle of a
3532	multiunit character. We check only the portion of the subject that is going to
3533	be inspected during matching - from the offset minus the maximum back reference
3534	to the given length. This saves time when a small part of a large subject is
3535	being matched by the use of a starting offset. Note that the maximum lookbehind
3536	is a number of characters, not code units. /*
3537
3538	#ifdef SUPPORT_UNICODE
3539	if (utf && (options & PCRE2_NO_UTF_CHECK) == `0`)
3540	{
3541	PCRE2_SPTR check_subject = start_match; / start_match includes offset /
3542
3543	if (start_offset > `0`)
3544	{
3545	#if PCRE2_CODE_UNIT_WIDTH != 32
3546	unsigned int i;
3547	if (start_match < end_subject && NOT_FIRSTCU(*start_match))
3548	return PCRE2_ERROR_BADUTFOFFSET;
3549	for (i = re->max_lookbehind; i > `0` && check_subject > subject; i--)
3550	{
3551	check_subject--;
3552	while (check_subject > subject &&
3553	#if PCRE2_CODE_UNIT_WIDTH == 8
3554	(*check_subject & `0xc0`) == `0x80`)
3555	#else /* 16-bit */
3556	(*check_subject & `0xfc00`) == `0xdc00`)
3557	#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3558	check_subject--;
3559	}
3560	#else /* In the 32-bit library, one code unit equals one character. */
3561	check_subject -= re->max_lookbehind;
3562	if (check_subject < subject) check_subject = subject;
3563	#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
3564	}
3565
3566	/ Validate the relevant portion of the subject. After an error, adjust the*
3567	offset to be an absolute offset in the whole string. /*
3568
3569	match_data->rc = PRIV(valid_utf)(check_subject,
3570	length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
3571	if (match_data->rc != `0`)
3572	{
3573	match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
3574	return match_data->rc;
3575	}
3576	}
3577	#endif /* SUPPORT_UNICODE */
3578
3579	/ Set up the first code unit to match, if available. If there's no first code*
3580	unit there may be a bitmap of possible first characters. /*
3581
3582	if ((re->flags & PCRE2_FIRSTSET) != `0`)
3583	{
3584	has_first_cu = TRUE;
3585	first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3586	if ((re->flags & PCRE2_FIRSTCASELESS) != `0`)
3587	{
3588	first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3589	#ifdef SUPPORT_UNICODE
3590	#if PCRE2_CODE_UNIT_WIDTH == 8
3591	if (first_cu > `127` && !utf && (re->overall_options & PCRE2_UCP) != `0`)
3592	first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3593	#else
3594	if (first_cu > `127` && (utf \|\| (re->overall_options & PCRE2_UCP) != `0`))
3595	first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3596	#endif
3597	#endif /* SUPPORT_UNICODE */
3598	}
3599	}
3600	else
3601	if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != `0`)
3602	start_bits = re->start_bitmap;
3603
3604	/ There may be a "last known required code unit" set. /
3605
3606	if ((re->flags & PCRE2_LASTSET) != `0`)
3607	{
3608	has_req_cu = TRUE;
3609	req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3610	if ((re->flags & PCRE2_LASTCASELESS) != `0`)
3611	{
3612	req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3613	#ifdef SUPPORT_UNICODE
3614	#if PCRE2_CODE_UNIT_WIDTH == 8
3615	if (req_cu > `127` && !utf && (re->overall_options & PCRE2_UCP) != `0`)
3616	req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3617	#else
3618	if (req_cu > `127` && (utf \|\| (re->overall_options & PCRE2_UCP) != `0`))
3619	req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3620	#endif
3621	#endif /* SUPPORT_UNICODE */
3622	}
3623	}
3624
3625	/ If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,*
3626	free the memory that was obtained. /*
3627
3628	if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != `0`)
3629	{
3630	match_data->memctl.free((void *)match_data->subject,
3631	match_data->memctl.memory_data);
3632	match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
3633	}
3634
3635	/ Fill in fields that are always returned in the match data. /
3636
3637	match_data->code = re;
3638	match_data->subject = NULL; / Default for no match /
3639	match_data->mark = NULL;
3640	match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3641
3642	/ Call the main matching function, looping for a non-anchored regex after a*
3643	failed match. If not restarting, perform certain optimizations at the start of
3644	a match. /*
3645
3646	for (;;)
3647	{
3648	/ ----------------- Start of match optimizations ---------------- /
3649
3650	/ There are some optimizations that avoid running the match if a known*
3651	starting point is not found, or if a known later code unit is not present.
3652	However, there is an option (settable at compile time) that disables
3653	these, for testing and for ensuring that all callouts do actually occur.
3654	The optimizations must also be avoided when restarting a DFA match. /*
3655
3656	if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == `0` &&
3657	(options & PCRE2_DFA_RESTART) == `0`)
3658	{
3659	/ If firstline is TRUE, the start of the match is constrained to the first*
3660	line of a multiline string. That is, the match must be before or at the
3661	first newline following the start of matching. Temporarily adjust
3662	end_subject so that we stop the optimization scans for a first code unit
3663	immediately after the first character of a newline (the first code unit can
3664	legitimately be a newline). If the match fails at the newline, later code
3665	breaks this loop. /*
3666
3667	if (firstline)
3668	{
3669	PCRE2_SPTR t = start_match;
3670	#ifdef SUPPORT_UNICODE
3671	if (utf)
3672	{
3673	while (t < end_subject && !IS_NEWLINE(t))
3674	{
3675	t++;
3676	ACROSSCHAR(t < end_subject, t, t++);
3677	}
3678	}
3679	else
3680	#endif
3681	while (t < end_subject && !IS_NEWLINE(t)) t++;
3682	end_subject = t;
3683	}
3684
3685	/ Anchored: check the first code unit if one is recorded. This may seem*
3686	pointless but it can help in detecting a no match case without scanning for
3687	the required code unit. /*
3688
3689	if (anchored)
3690	{
3691	if (has_first_cu \|\| start_bits != NULL)
3692	{
3693	BOOL ok = start_match < end_subject;
3694	if (ok)
3695	{
3696	PCRE2_UCHAR c = UCHAR21TEST(start_match);
3697	ok = has_first_cu && (c == first_cu \|\| c == first_cu2);
3698	if (!ok && start_bits != NULL)
3699	{
3700	#if PCRE2_CODE_UNIT_WIDTH != 8
3701	if (c > `255`) c = `255`;
3702	#endif
3703	ok = (start_bits[c/`8`] & (`1u` << (c&`7`))) != `0`;
3704	}
3705	}
3706	if (!ok) break;
3707	}
3708	}
3709
3710	/ Not anchored. Advance to a unique first code unit if there is one. /
3711
3712	else
3713	{
3714	if (has_first_cu)
3715	{
3716	if (first_cu != first_cu2) / Caseless /
3717	{
3718	/ In 16-bit and 32_bit modes we have to do our own search, so can*
3719	look for both cases at once. /*
3720
3721	#if PCRE2_CODE_UNIT_WIDTH != 8
3722	PCRE2_UCHAR smc;
3723	while (start_match < end_subject &&
3724	(smc = UCHAR21TEST(start_match)) != first_cu &&
3725	smc != first_cu2)
3726	start_match++;
3727	#else
3728	/ In 8-bit mode, the use of memchr() gives a big speed up, even*
3729	though we have to call it twice in order to find the earliest
3730	occurrence of the code unit in either of its cases. Caching is used
3731	to remember the positions of previously found code units. This can
3732	make a huge difference when the strings are very long and only one
3733	case is actually present. /*
3734
3735	PCRE2_SPTR pp1 = NULL;
3736	PCRE2_SPTR pp2 = NULL;
3737	PCRE2_SIZE searchlength = end_subject - start_match;
3738
3739	/ If we haven't got a previously found position for first_cu, or if*
3740	the current starting position is later, we need to do a search. If
3741	the code unit is not found, set it to the end. /*
3742
3743	if (memchr_found_first_cu == NULL \|\|
3744	start_match > memchr_found_first_cu)
3745	{
3746	pp1 = memchr(start_match, first_cu, searchlength);
3747	memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
3748	}
3749
3750	/ If the start is before a previously found position, use the*
3751	previous position, or NULL if a previous search failed. /*
3752
3753	else pp1 = (memchr_found_first_cu == end_subject)? NULL :
3754	memchr_found_first_cu;
3755
3756	/ Do the same thing for the other case. /
3757
3758	if (memchr_found_first_cu2 == NULL \|\|
3759	start_match > memchr_found_first_cu2)
3760	{
3761	pp2 = memchr(start_match, first_cu2, searchlength);
3762	memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
3763	}
3764
3765	else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
3766	memchr_found_first_cu2;
3767
3768	/ Set the start to the end of the subject if neither case was found.*
3769	Otherwise, use the earlier found point. /*
3770
3771	if (pp1 == NULL)
3772	start_match = (pp2 == NULL)? end_subject : pp2;
3773	else
3774	start_match = (pp2 == NULL \|\| pp1 < pp2)? pp1 : pp2;
3775
3776	#endif /* 8-bit handling */
3777	}
3778
3779	/ The caseful case is much simpler. /
3780
3781	else
3782	{
3783	#if PCRE2_CODE_UNIT_WIDTH != 8
3784	while (start_match < end_subject && UCHAR21TEST(start_match) !=
3785	first_cu)
3786	start_match++;
3787	#else /* 8-bit code units */
3788	start_match = memchr(start_match, first_cu, end_subject - start_match);
3789	if (start_match == NULL) start_match = end_subject;
3790	#endif
3791	}
3792
3793	/ If we can't find the required code unit, having reached the true end*
3794	of the subject, break the bumpalong loop, to force a match failure,
3795	except when doing partial matching, when we let the next cycle run at
3796	the end of the subject. To see why, consider the pattern /(?<=abc)def/,
3797	which partially matches "abc", even though the string does not contain
3798	the starting character "d". If we have not reached the true end of the
3799	subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
3800	we also let the cycle run, because the matching string is legitimately
3801	allowed to start with the first code unit of a newline. /*
3802
3803	if ((mb->moptions & (PCRE2_PARTIAL_HARD\|PCRE2_PARTIAL_SOFT)) == `0` &&
3804	start_match >= mb->end_subject)
3805	break;
3806	}
3807
3808	/ If there's no first code unit, advance to just after a linebreak for a*
3809	multiline match if required. /*
3810
3811	else if (startline)
3812	{
3813	if (start_match > mb->start_subject + start_offset)
3814	{
3815	#ifdef SUPPORT_UNICODE
3816	if (utf)
3817	{
3818	while (start_match < end_subject && !WAS_NEWLINE(start_match))
3819	{
3820	start_match++;
3821	ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3822	}
3823	}
3824	else
3825	#endif
3826	while (start_match < end_subject && !WAS_NEWLINE(start_match))
3827	start_match++;
3828
3829	/ If we have just passed a CR and the newline option is ANY or*
3830	ANYCRLF, and we are now at a LF, advance the match position by one
3831	more code unit. /*
3832
3833	if (start_match[-`1`] == CHAR_CR &&
3834	(mb->nltype == NLTYPE_ANY \|\| mb->nltype == NLTYPE_ANYCRLF) &&
3835	start_match < end_subject &&
3836	UCHAR21TEST(start_match) == CHAR_NL)
3837	start_match++;
3838	}
3839	}
3840
3841	/ If there's no first code unit or a requirement for a multiline line*
3842	start, advance to a non-unique first code unit if any have been
3843	identified. The bitmap contains only 256 bits. When code units are 16 or
3844	32 bits wide, all code units greater than 254 set the 255 bit. /*
3845
3846	else if (start_bits != NULL)
3847	{
3848	while (start_match < end_subject)
3849	{
3850	uint32_t c = UCHAR21TEST(start_match);
3851	#if PCRE2_CODE_UNIT_WIDTH != 8
3852	if (c > `255`) c = `255`;
3853	#endif
3854	if ((start_bits[c/`8`] & (`1u` << (c&`7`))) != `0`) break;
3855	start_match++;
3856	}
3857
3858	/ See comment above in first_cu checking about the next line. /
3859
3860	if ((mb->moptions & (PCRE2_PARTIAL_HARD\|PCRE2_PARTIAL_SOFT)) == `0` &&
3861	start_match >= mb->end_subject)
3862	break;
3863	}
3864	} / End of first code unit handling /
3865
3866	/ Restore fudged end_subject /
3867
3868	end_subject = mb->end_subject;
3869
3870	/ The following two optimizations are disabled for partial matching. /
3871
3872	if ((mb->moptions & (PCRE2_PARTIAL_HARD\|PCRE2_PARTIAL_SOFT)) == `0`)
3873	{
3874	PCRE2_SPTR p;
3875
3876	/ The minimum matching length is a lower bound; no actual string of that*
3877	length may actually match the pattern. Although the value is, strictly,
3878	in characters, we treat it as code units to avoid spending too much time
3879	in this optimization. /*
3880
3881	if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT;
3882
3883	/ If req_cu is set, we know that that code unit must appear in the*
3884	subject for the match to succeed. If the first code unit is set, req_cu
3885	must be later in the subject; otherwise the test starts at the match
3886	point. This optimization can save a huge amount of backtracking in
3887	patterns with nested unlimited repeats that aren't going to match.
3888	Writing separate code for cased/caseless versions makes it go faster, as
3889	does using an autoincrement and backing off on a match. As in the case of
3890	the first code unit, using memchr() in the 8-bit library gives a big
3891	speed up. Unlike the first_cu check above, we do not need to call
3892	memchr() twice in the caseless case because we only need to check for the
3893	presence of the character in either case, not find the first occurrence.
3894
3895	The search can be skipped if the code unit was found later than the
3896	current starting point in a previous iteration of the bumpalong loop.
3897
3898	HOWEVER: when the subject string is very, very long, searching to its end
3899	can take a long time, and give bad performance on quite ordinary
3900	patterns. This showed up when somebody was matching something like
3901	/^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3902	sufficiently long, but it's worth searching a lot more for unanchored
3903	patterns. /*
3904
3905	p = start_match + (has_first_cu? `1`:`0`);
3906	if (has_req_cu && p > req_cu_ptr)
3907	{
3908	PCRE2_SIZE check_length = end_subject - start_match;
3909
3910	if (check_length < REQ_CU_MAX \|\|
3911	(!anchored && check_length < REQ_CU_MAX * `1000`))
3912	{
3913	if (req_cu != req_cu2) / Caseless /
3914	{
3915	#if PCRE2_CODE_UNIT_WIDTH != 8
3916	while (p < end_subject)
3917	{
3918	uint32_t pp = UCHAR21INCTEST(p);
3919	if (pp == req_cu \|\| pp == req_cu2) { p--; break; }
3920	}
3921	#else /* 8-bit code units */
3922	PCRE2_SPTR pp = p;
3923	p = memchr(pp, req_cu, end_subject - pp);
3924	if (p == NULL)
3925	{
3926	p = memchr(pp, req_cu2, end_subject - pp);
3927	if (p == NULL) p = end_subject;
3928	}
3929	#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
3930	}
3931
3932	/ The caseful case /
3933
3934	else
3935	{
3936	#if PCRE2_CODE_UNIT_WIDTH != 8
3937	while (p < end_subject)
3938	{
3939	if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3940	}
3941
3942	#else /* 8-bit code units */
3943	p = memchr(p, req_cu, end_subject - p);
3944	if (p == NULL) p = end_subject;
3945	#endif
3946	}
3947
3948	/ If we can't find the required code unit, break the matching loop,*
3949	forcing a match failure. /*
3950
3951	if (p >= end_subject) break;
3952
3953	/ If we have found the required code unit, save the point where we*
3954	found it, so that we don't search again next time round the loop if
3955	the start hasn't passed this code unit yet. /*
3956
3957	req_cu_ptr = p;
3958	}
3959	}
3960	}
3961	}
3962
3963	/ ------------ End of start of match optimizations ------------ /
3964
3965	/ Give no match if we have passed the bumpalong limit. /
3966
3967	if (start_match > bumpalong_limit) break;
3968
3969	/ OK, now we can do the business /
3970
3971	mb->start_used_ptr = start_match;
3972	mb->last_used_ptr = start_match;
3973	mb->recursive = NULL;
3974
3975	rc = internal_dfa_match(
3976	mb, / fixed match data /
3977	mb->start_code, / this subexpression's code /
3978	start_match, / where we currently are /
3979	start_offset, / start offset in subject /
3980	match_data->ovector, / offset vector /
3981	(uint32_t)match_data->oveccount * `2`, / actual size of same /
3982	workspace, / workspace vector /
3983	(int)wscount, / size of same /
3984	`0`, / function recurse level /
3985	base_recursion_workspace); / initial workspace for recursion /
3986
3987	/ Anything other than "no match" means we are done, always; otherwise, carry*
3988	on only if not anchored. /*
3989
3990	if (rc != PCRE2_ERROR_NOMATCH \|\| anchored)
3991	{
3992	if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > `0`)
3993	{
3994	match_data->ovector[`0`] = (PCRE2_SIZE)(start_match - subject);
3995	match_data->ovector[`1`] = (PCRE2_SIZE)(end_subject - subject);
3996	}
3997	match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
3998	match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject);
3999	match_data->startchar = (PCRE2_SIZE)(start_match - subject);
4000	match_data->rc = rc;
4001
4002	if (rc >= `0` &&(options & PCRE2_COPY_MATCHED_SUBJECT) != `0`)
4003	{
4004	length = CU2BYTES(length + was_zero_terminated);
4005	match_data->subject = match_data->memctl.malloc(length,
4006	match_data->memctl.memory_data);
4007	if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
4008	memcpy((void *)match_data->subject, subject, length);
4009	match_data->flags \|= PCRE2_MD_COPIED_SUBJECT;
4010	}
4011	else
4012	{
4013	if (rc >= `0` \|\| rc == PCRE2_ERROR_PARTIAL) match_data->subject = subject;
4014	}
4015	goto EXIT;
4016	}
4017
4018	/ Advance to the next subject character unless we are at the end of a line*
4019	and firstline is set. /*
4020
4021	if (firstline && IS_NEWLINE(start_match)) break;
4022	start_match++;
4023	#ifdef SUPPORT_UNICODE
4024	if (utf)
4025	{
4026	ACROSSCHAR(start_match < end_subject, start_match, start_match++);
4027	}
4028	#endif
4029	if (start_match > end_subject) break;
4030
4031	/ If we have just passed a CR and we are now at a LF, and the pattern does*
4032	not contain any explicit matches for \r or \n, and the newline option is CRLF
4033	or ANY or ANYCRLF, advance the match position by one more character. /*
4034
4035	if (UCHAR21TEST(start_match - `1`) == CHAR_CR &&
4036	start_match < end_subject &&
4037	UCHAR21TEST(start_match) == CHAR_NL &&
4038	(re->flags & PCRE2_HASCRORLF) == `0` &&
4039	(mb->nltype == NLTYPE_ANY \|\|
4040	mb->nltype == NLTYPE_ANYCRLF \|\|
4041	mb->nllen == `2`))
4042	start_match++;
4043
4044	} / "Bumpalong" loop /
4045
4046	NOMATCH_EXIT:
4047	rc = PCRE2_ERROR_NOMATCH;
4048
4049	EXIT:
4050	while (rws->next != NULL)
4051	{
4052	RWS_anchor *next = rws->next;
4053	rws->next = next->next;
4054	mb->memctl.free(next, mb->memctl.memory_data);
4055	}
4056
4057	return rc;
4058	}
4059
/* These #undefs are here to enable unity builds with CMake. */

#undef NLBLOCK /* Block containing newline information */
#undef PSSTART /* Field containing processed string start */
#undef PSEND   /* Field containing processed string end */

/* End of pcre2_dfa_match.c */
4067

Browse the source code of Godot/thirdparty/pcre2/src/pcre2_dfa_match.c