pcre_dfa_exec.c source code [ClickHouse/contrib/poco/Foundation/src/pcre_dfa_exec.c]

1	/*************************************************
2	* Perl-Compatible Regular Expressions *
3	*************************************************/
4
/* PCRE is a library of functions to support regular expressions whose syntax
6	and semantics are as close as possible to those of the Perl 5 language (but see
7	below for why this module is different).
8
9	Written by Philip Hazel
10	Copyright (c) 1997-2014 University of Cambridge
11
12	-----------------------------------------------------------------------------
13	Redistribution and use in source and binary forms, with or without
14	modification, are permitted provided that the following conditions are met:
15
16	* Redistributions of source code must retain the above copyright notice,
17	this list of conditions and the following disclaimer.
18
19	* Redistributions in binary form must reproduce the above copyright
20	notice, this list of conditions and the following disclaimer in the
21	documentation and/or other materials provided with the distribution.
22
23	* Neither the name of the University of Cambridge nor the names of its
24	contributors may be used to endorse or promote products derived from
25	this software without specific prior written permission.
26
27	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37	POSSIBILITY OF SUCH DAMAGE.
38	-----------------------------------------------------------------------------
39	*/
40
/* This module contains the external function pcre_dfa_exec(), which is an
alternative matching function that uses a sort of DFA algorithm (not a true
FSM). This is NOT Perl-compatible, but it has advantages in certain
applications. */
45
46
/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
the performance of his patterns greatly. I could not use it as it stood, as it
was not thread safe, and made assumptions about pattern sizes. Also, it caused
test 7 to loop, and test 9 to crash with a segfault.

The issue is the check for duplicate states, which is done by a simple linear
search up the state list. (Grep for "duplicate" below to find the code.) For
many patterns, there will never be many states active at one time, so a simple
linear search is fine. In patterns that have many active states, it might be a
bottleneck. The suggested code used an indexing scheme to remember which states
had previously been used for each character, and avoided the linear search when
it knew there was no chance of a duplicate. This was implemented when adding
states to the state lists.

I wrote some thread-safe, not-limited code to try something similar at the time
of checking for duplicates (instead of when adding states), using index vectors
on the stack. It did give a 13% improvement with one specially constructed
pattern for certain subject strings, but on other strings and on many of the
simpler patterns in the test suite it did worse. The major problem, I think,
was the extra time to initialize the index. This had to be done for each call
of internal_dfa_exec(). (The supplied patch used a static vector, initialized
only once - I suspect this was the cause of the problems with the tests.)

Overall, I concluded that the gains in some cases did not outweigh the losses
in others, so I abandoned this code. */
72
73	#pragma warning( disable : 4244) // conversion from 'int' to 'unsigned short', possible loss of data
74	#pragma warning( disable : 4146) // unary minus operator applied to unsigned type, result still unsigned
75
76	#include "pcre_config.h"
77
78	#define NLBLOCK md /* Block containing newline information */
79	#define PSSTART start_subject /* Field containing processed string start */
80	#define PSEND end_subject /* Field containing processed string end */
81
82	#include "pcre_internal.h"
83
84
/* For use to indent debugging output. SP is passed as the string argument of
"%.*s" conversions in the debug printing, with the recursion level supplying
the precision, so the precision caps how much of SP is printed. */

/* NOTE(review): the literal below is a single space as found in this copy of
the file. It looks like a longer run of spaces that was collapsed when the
file was extracted - confirm against the upstream PCRE source. */

#define SP " "
88
89
90	/*************************************************
91	* Code parameters and static tables *
92	*************************************************/
93
/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
into others, under special conditions. A gap of 20 between the blocks should be
enough. The resulting opcodes don't have to be less than 256 because they are
never stored, so we push them well clear of the normal opcodes. */

#define OP_PROP_EXTRA       300
#define OP_EXTUNI_EXTRA     320
#define OP_ANYNL_EXTRA      340
#define OP_HSPACE_EXTRA     360
#define OP_VSPACE_EXTRA     380
104
105
106	/ This table identifies those opcodes that are followed immediately by a*
107	character that is to be tested in some way. This makes it possible to
108	centralize the loading of these characters. In the case of Type etc, the*
109	"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
110	small value. Non-zero values in the table are the offsets from the opcode where
111	the character is to be found. NOTE* If the start of this table is*
112	modified, the three tables that follow must also be modified. /*
113
114	static const pcre_uint8 coptable[] = {
115	`0`, / End /
116	`0`, `0`, `0`, `0`, `0`, / \A, \G, \K, \B, \b /
117	`0`, `0`, `0`, `0`, `0`, `0`, / \D, \d, \S, \s, \W, \w /
118	`0`, `0`, `0`, / Any, AllAny, Anybyte /
119	`0`, `0`, / \P, \p /
120	`0`, `0`, `0`, `0`, `0`, / \R, \H, \h, \V, \v /
121	`0`, / \X /
122	`0`, `0`, `0`, `0`, `0`, `0`, / \Z, \z, $, $M, ^, ^M /
123	`1`, / Char /
124	`1`, / Chari /
125	`1`, / not /
126	`1`, / noti /
127	/ Positive single-char repeats /
128	`1`, `1`, `1`, `1`, `1`, `1`, / , ?, +, +?, ?, ?? /
129	`1`+IMM2_SIZE, `1`+IMM2_SIZE, / upto, minupto /
130	`1`+IMM2_SIZE, / exact /
131	`1`, `1`, `1`, `1`+IMM2_SIZE, / +, ++, ?+, upto+ /*
132	`1`, `1`, `1`, `1`, `1`, `1`, / I, ?I, +I, +?I, ?I, ??I /
133	`1`+IMM2_SIZE, `1`+IMM2_SIZE, / upto I, minupto I /
134	`1`+IMM2_SIZE, / exact I /
135	`1`, `1`, `1`, `1`+IMM2_SIZE, / +I, ++I, ?+I, upto+I /*
136	/ Negative single-char repeats - only for chars < 256 /
137	`1`, `1`, `1`, `1`, `1`, `1`, / NOT , ?, +, +?, ?, ?? /
138	`1`+IMM2_SIZE, `1`+IMM2_SIZE, / NOT upto, minupto /
139	`1`+IMM2_SIZE, / NOT exact /
140	`1`, `1`, `1`, `1`+IMM2_SIZE, / NOT +, ++, ?+, upto+ /*
141	`1`, `1`, `1`, `1`, `1`, `1`, / NOT I, ?I, +I, +?I, ?I, ??I /
142	`1`+IMM2_SIZE, `1`+IMM2_SIZE, / NOT upto I, minupto I /
143	`1`+IMM2_SIZE, / NOT exact I /
144	`1`, `1`, `1`, `1`+IMM2_SIZE, / NOT +I, ++I, ?+I, upto+I /*
145	/ Positive type repeats /
146	`1`, `1`, `1`, `1`, `1`, `1`, / Type , ?, +, +?, ?, ?? /
147	`1`+IMM2_SIZE, `1`+IMM2_SIZE, / Type upto, minupto /
148	`1`+IMM2_SIZE, / Type exact /
149	`1`, `1`, `1`, `1`+IMM2_SIZE, / Type +, ++, ?+, upto+ /*
150	/ Character class & ref repeats /
151	`0`, `0`, `0`, `0`, `0`, `0`, / , ?, +, +?, ?, ?? /
152	`0`, `0`, / CRRANGE, CRMINRANGE /
153	`0`, `0`, `0`, `0`, / Possessive +, ++, ?+, CRPOSRANGE /*
154	`0`, / CLASS /
155	`0`, / NCLASS /
156	`0`, / XCLASS - variable length /
157	`0`, / REF /
158	`0`, / REFI /
159	`0`, / DNREF /
160	`0`, / DNREFI /
161	`0`, / RECURSE /
162	`0`, / CALLOUT /
163	`0`, / Alt /
164	`0`, / Ket /
165	`0`, / KetRmax /
166	`0`, / KetRmin /
167	`0`, / KetRpos /
168	`0`, / Reverse /
169	`0`, / Assert /
170	`0`, / Assert not /
171	`0`, / Assert behind /
172	`0`, / Assert behind not /
173	`0`, `0`, / ONCE, ONCE_NC /
174	`0`, `0`, `0`, `0`, `0`, / BRA, BRAPOS, CBRA, CBRAPOS, COND /
175	`0`, `0`, `0`, `0`, `0`, / SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND /
176	`0`, `0`, / CREF, DNCREF /
177	`0`, `0`, / RREF, DNRREF /
178	`0`, / DEF /
179	`0`, `0`, `0`, / BRAZERO, BRAMINZERO, BRAPOSZERO /
180	`0`, `0`, `0`, / MARK, PRUNE, PRUNE_ARG /
181	`0`, `0`, `0`, `0`, / SKIP, SKIP_ARG, THEN, THEN_ARG /
182	`0`, `0`, `0`, `0`, / COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT /
183	`0`, `0` / CLOSE, SKIPZERO /
184	};
185
186	/ This table identifies those opcodes that inspect a character. It is used to*
187	remember the fact that a character could have been inspected when the end of
188	the subject is reached. NOTE* If the start of this table is modified, the*
189	two tables that follow must also be modified. /*
190
191	static const pcre_uint8 poptable[] = {
192	`0`, / End /
193	`0`, `0`, `0`, `1`, `1`, / \A, \G, \K, \B, \b /
194	`1`, `1`, `1`, `1`, `1`, `1`, / \D, \d, \S, \s, \W, \w /
195	`1`, `1`, `1`, / Any, AllAny, Anybyte /
196	`1`, `1`, / \P, \p /
197	`1`, `1`, `1`, `1`, `1`, / \R, \H, \h, \V, \v /
198	`1`, / \X /
199	`0`, `0`, `0`, `0`, `0`, `0`, / \Z, \z, $, $M, ^, ^M /
200	`1`, / Char /
201	`1`, / Chari /
202	`1`, / not /
203	`1`, / noti /
204	/ Positive single-char repeats /
205	`1`, `1`, `1`, `1`, `1`, `1`, / , ?, +, +?, ?, ?? /
206	`1`, `1`, `1`, / upto, minupto, exact /
207	`1`, `1`, `1`, `1`, / +, ++, ?+, upto+ /*
208	`1`, `1`, `1`, `1`, `1`, `1`, / I, ?I, +I, +?I, ?I, ??I /
209	`1`, `1`, `1`, / upto I, minupto I, exact I /
210	`1`, `1`, `1`, `1`, / +I, ++I, ?+I, upto+I /*
211	/ Negative single-char repeats - only for chars < 256 /
212	`1`, `1`, `1`, `1`, `1`, `1`, / NOT , ?, +, +?, ?, ?? /
213	`1`, `1`, `1`, / NOT upto, minupto, exact /
214	`1`, `1`, `1`, `1`, / NOT +, ++, ?+, upto+ /*
215	`1`, `1`, `1`, `1`, `1`, `1`, / NOT I, ?I, +I, +?I, ?I, ??I /
216	`1`, `1`, `1`, / NOT upto I, minupto I, exact I /
217	`1`, `1`, `1`, `1`, / NOT +I, ++I, ?+I, upto+I /*
218	/ Positive type repeats /
219	`1`, `1`, `1`, `1`, `1`, `1`, / Type , ?, +, +?, ?, ?? /
220	`1`, `1`, `1`, / Type upto, minupto, exact /
221	`1`, `1`, `1`, `1`, / Type +, ++, ?+, upto+ /*
222	/ Character class & ref repeats /
223	`1`, `1`, `1`, `1`, `1`, `1`, / , ?, +, +?, ?, ?? /
224	`1`, `1`, / CRRANGE, CRMINRANGE /
225	`1`, `1`, `1`, `1`, / Possessive +, ++, ?+, CRPOSRANGE /*
226	`1`, / CLASS /
227	`1`, / NCLASS /
228	`1`, / XCLASS - variable length /
229	`0`, / REF /
230	`0`, / REFI /
231	`0`, / DNREF /
232	`0`, / DNREFI /
233	`0`, / RECURSE /
234	`0`, / CALLOUT /
235	`0`, / Alt /
236	`0`, / Ket /
237	`0`, / KetRmax /
238	`0`, / KetRmin /
239	`0`, / KetRpos /
240	`0`, / Reverse /
241	`0`, / Assert /
242	`0`, / Assert not /
243	`0`, / Assert behind /
244	`0`, / Assert behind not /
245	`0`, `0`, / ONCE, ONCE_NC /
246	`0`, `0`, `0`, `0`, `0`, / BRA, BRAPOS, CBRA, CBRAPOS, COND /
247	`0`, `0`, `0`, `0`, `0`, / SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND /
248	`0`, `0`, / CREF, DNCREF /
249	`0`, `0`, / RREF, DNRREF /
250	`0`, / DEF /
251	`0`, `0`, `0`, / BRAZERO, BRAMINZERO, BRAPOSZERO /
252	`0`, `0`, `0`, / MARK, PRUNE, PRUNE_ARG /
253	`0`, `0`, `0`, `0`, / SKIP, SKIP_ARG, THEN, THEN_ARG /
254	`0`, `0`, `0`, `0`, / COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT /
255	`0`, `0` / CLOSE, SKIPZERO /
256	};
257
258	/ These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,*
259	and \w /*
260
261	static const pcre_uint8 toptable1[] = {
262	`0`, `0`, `0`, `0`, `0`, `0`,
263	ctype_digit, ctype_digit,
264	ctype_space, ctype_space,
265	ctype_word, ctype_word,
266	`0`, `0` / OP_ANY, OP_ALLANY /
267	};
268
269	static const pcre_uint8 toptable2[] = {
270	`0`, `0`, `0`, `0`, `0`, `0`,
271	ctype_digit, `0`,
272	ctype_space, `0`,
273	ctype_word, `0`,
274	`1`, `1` / OP_ANY, OP_ALLANY /
275	};
276
277
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode */
  int count;                      /* Count for repeats */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints occupied by one stateblock. The expansion is fully
parenthesised so that it behaves as a single operand in any expression. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
290
291
292	#ifdef PCRE_DEBUG
293	/*************************************************
294	* Print character string *
295	*************************************************/
296
297	/ Character string printing function for debugging.*
298
299	Arguments:
300	p points to string
301	length number of bytes
302	f where to print
303
304	Returns: nothing
305	*/
306
307	static void
308	pchars(const pcre_uchar p, int* length, FILE *f)
309	{
310	pcre_uint32 c;
311	while (length-- > `0`)
312	{
313	if (isprint(c = *(p++)))
314	fprintf(f, "%c", c);
315	else
316	fprintf(f, "\\x{%02x}", c);
317	}
318	}
319	#endif
320
321
322
323	/*************************************************
324	* Execute a Regular Expression - DFA engine *
325	*************************************************/
326
/* This internal function applies a compiled pattern to a subject string,
starting at a given point, using a DFA engine. This function is called from the
external one, possibly multiple times if the pattern is not anchored. The
function calls itself recursively for some kinds of subpattern.

Arguments:
  md                the match_data block with fixed information
  this_start_code   the opening bracket of this subexpression's code
  current_subject   where we currently are in the subject string
  start_offset      start offset in the subject string
  offsets           vector to contain the matching string offsets
  offsetcount       size of same
  workspace         vector of workspace
  wscount           size of same
  rlevel            function call recursion level

Returns:            > 0 => number of match offset pairs placed in offsets
                    = 0 => offsets overflowed; longest matches are present
                     -1 => failed to match
                   < -1 => some kind of unexpected problem

The following macros are used for adding states to the two state vectors (one
for the current character, one for the following character). */
350
/* Append a state with offset x and count y to the active state list. The data
field of the slot is not written. If active_count has already reached wscount
the enclosing function returns PCRE_ERROR_DFA_WSSIZE instead; active_count is
incremented in either case. The expansion is an if/else that ends in a bare
return statement, so a use inside another if/else must be wrapped in braces. */

#define ADD_ACTIVE(x,y) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
360
/* Append a state with offset x, count y and data z to the active state list.
If active_count has already reached wscount the enclosing function returns
PCRE_ERROR_DFA_WSSIZE instead; active_count is incremented in either case. The
expansion is an if/else that ends in a bare return statement, so a use inside
another if/else must be wrapped in braces. */

#define ADD_ACTIVE_DATA(x,y,z) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->data   = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
371
/* Append a state with offset x and count y to the new state list (the states
for the following character). The data field of the slot is not written. If
new_count has already reached wscount the enclosing function returns
PCRE_ERROR_DFA_WSSIZE instead; new_count is incremented in either case. The
expansion is an if/else that ends in a bare return statement, so a use inside
another if/else must be wrapped in braces. */

#define ADD_NEW(x,y) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
381
/* Append a state with offset x, count y and data z to the new state list (the
states for the following character). If new_count has already reached wscount
the enclosing function returns PCRE_ERROR_DFA_WSSIZE instead; new_count is
incremented in either case. The debug output also reports the source line of
the use. The expansion is an if/else that ends in a bare return statement, so
a use inside another if/else must be wrapped in braces. */

#define ADD_NEW_DATA(x,y,z) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->data   = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
      (x), (y), (z), __LINE__)); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
393
/* And now, here is the code */
395
396	static int
397	internal_dfa_exec(
398	dfa_match_data *md,
399	const pcre_uchar *this_start_code,
400	const pcre_uchar *current_subject,
401	int start_offset,
402	int *offsets,
403	int offsetcount,
404	int *workspace,
405	int wscount,
406	int rlevel)
407	{
408	stateblock active_states, new_states, *temp_states;
409	stateblock next_active_state, next_new_state;
410
411	const pcre_uint8 ctypes, lcc, *fcc;
412	const pcre_uchar *ptr;
413	const pcre_uchar end_code, first_op;
414
415	dfa_recursion_info new_recursive;
416
417	int active_count, new_count, match_count;
418
419	/ Some fields in the md block are frequently referenced, so we load them into*
420	independent variables in the hope that this will perform better. /*
421
422	const pcre_uchar *start_subject = md->start_subject;
423	const pcre_uchar *end_subject = md->end_subject;
424	const pcre_uchar *start_code = md->start_code;
425
426	#ifdef SUPPORT_UTF
427	BOOL utf = (md->poptions & PCRE_UTF8) != `0`;
428	#else
429	BOOL utf = FALSE;
430	#endif
431
432	BOOL reset_could_continue = FALSE;
433
434	rlevel++;
435	offsetcount &= (-`2`);
436
437	wscount -= `2`;
438	wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * `2`))) /
439	(`2` * INTS_PER_STATEBLOCK);
440
441	DPRINTF(("\n%.*s---------------------\n"
442	"%.*sCall to internal_dfa_exec f=%d\n",
443	rlevel`2`-`2`, SP, rlevel`2`-`2`, SP, rlevel));
444
445	ctypes = md->tables + ctypes_offset;
446	lcc = md->tables + lcc_offset;
447	fcc = md->tables + fcc_offset;
448
449	match_count = PCRE_ERROR_NOMATCH; / A negative number /
450
451	active_states = (stateblock *)(workspace + `2`);
452	next_new_state = new_states = active_states + wscount;
453	new_count = `0`;
454
455	first_op = this_start_code + `1` + LINK_SIZE +
456	((this_start_code == OP_CBRA \|\| this_start_code == OP_SCBRA \|\|
457	this_start_code == OP_CBRAPOS \|\| this_start_code == OP_SCBRAPOS)
458	? IMM2_SIZE:`0`);
459
460	/ The first thing in any (sub) pattern is a bracket of some sort. Push all*
461	the alternative states onto the list, and find out where the end is. This
462	makes is possible to use this function recursively, when we want to stop at a
463	matching internal ket rather than at the end.
464
465	If the first opcode in the first alternative is OP_REVERSE, we are dealing with
466	a backward assertion. In that case, we have to find out the maximum amount to
467	move back, and set up each alternative appropriately. /*
468
469	if (*first_op == OP_REVERSE)
470	{
471	int max_back = `0`;
472	int gone_back;
473
474	end_code = this_start_code;
475	do
476	{
477	int back = GET(end_code, `2`+LINK_SIZE);
478	if (back > max_back) max_back = back;
479	end_code += GET(end_code, `1`);
480	}
481	while (*end_code == OP_ALT);
482
483	/ If we can't go back the amount required for the longest lookbehind*
484	pattern, go back as far as we can; some alternatives may still be viable. /*
485
486	#ifdef SUPPORT_UTF
487	/ In character mode we have to step back character by character /
488
489	if (utf)
490	{
491	for (gone_back = `0`; gone_back < max_back; gone_back++)
492	{
493	if (current_subject <= start_subject) break;
494	current_subject--;
495	ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
496	}
497	}
498	else
499	#endif
500
501	/ In byte-mode we can do this quickly. /
502
503	{
504	gone_back = (current_subject - max_back < start_subject)?
505	(int)(current_subject - start_subject) : max_back;
506	current_subject -= gone_back;
507	}
508
509	/ Save the earliest consulted character /
510
511	if (current_subject < md->start_used_ptr)
512	md->start_used_ptr = current_subject;
513
514	/ Now we can process the individual branches. /
515
516	end_code = this_start_code;
517	do
518	{
519	int back = GET(end_code, `2`+LINK_SIZE);
520	if (back <= gone_back)
521	{
522	int bstate = (int)(end_code - start_code + `2` + `2`*LINK_SIZE);
523	ADD_NEW_DATA(-bstate, `0`, gone_back - back);
524	}
525	end_code += GET(end_code, `1`);
526	}
527	while (*end_code == OP_ALT);
528	}
529
530	/ This is the code for a "normal" subpattern (not a backward assertion). The*
531	start of a whole pattern is always one of these. If we are at the top level,
532	we may be asked to restart matching from the same point that we reached for a
533	previous partial match. We still have to scan through the top-level branches to
534	find the end state. /*
535
536	else
537	{
538	end_code = this_start_code;
539
540	/ Restarting /
541
542	if (rlevel == `1` && (md->moptions & PCRE_DFA_RESTART) != `0`)
543	{
544	do { end_code += GET(end_code, `1`); } while (*end_code == OP_ALT);
545	new_count = workspace[`1`];
546	if (!workspace[`0`])
547	memcpy(new_states, active_states, new_count * sizeof(stateblock));
548	}
549
550	/ Not restarting /
551
552	else
553	{
554	int length = `1` + LINK_SIZE +
555	((this_start_code == OP_CBRA \|\| this_start_code == OP_SCBRA \|\|
556	this_start_code == OP_CBRAPOS \|\| this_start_code == OP_SCBRAPOS)
557	? IMM2_SIZE:`0`);
558	do
559	{
560	ADD_NEW((int)(end_code - start_code + length), `0`);
561	end_code += GET(end_code, `1`);
562	length = `1` + LINK_SIZE;
563	}
564	while (*end_code == OP_ALT);
565	}
566	}
567
568	workspace[`0`] = `0`; / Bit indicating which vector is current /
569
570	DPRINTF(("%.sEnd state = %d\n", rlevel`2`-`2`, SP, (int)(end_code - start_code)));
571
572	/ Loop for scanning the subject /
573
574	ptr = current_subject;
575	for (;;)
576	{
577	int i, j;
578	int clen, dlen;
579	pcre_uint32 c, d;
580	int forced_fail = `0`;
581	BOOL partial_newline = FALSE;
582	BOOL could_continue = reset_could_continue;
583	reset_could_continue = FALSE;
584
585	/ Make the new state list into the active state list and empty the*
586	new state list. /*
587
588	temp_states = active_states;
589	active_states = new_states;
590	new_states = temp_states;
591	active_count = new_count;
592	new_count = `0`;
593
594	workspace[`0`] ^= `1`; / Remember for the restarting feature /
595	workspace[`1`] = active_count;
596
597	#ifdef PCRE_DEBUG
598	printf("%.sNext character: rest of subject = \"", rlevel`2`-`2`, SP);
599	pchars(ptr, STRLEN_UC(ptr), stdout);
600	printf("\"\n");
601
602	printf("%.sActive states: ", rlevel`2`-`2`, SP);
603	for (i = `0`; i < active_count; i++)
604	printf("%d/%d ", active_states[i].offset, active_states[i].count);
605	printf("\n");
606	#endif
607
608	/ Set the pointers for adding new states /
609
610	next_active_state = active_states + active_count;
611	next_new_state = new_states;
612
613	/ Load the current character from the subject outside the loop, as many*
614	different states may want to look at it, and we assume that at least one
615	will. /*
616
617	if (ptr < end_subject)
618	{
619	clen = `1`; / Number of data items in the character /
620	#ifdef SUPPORT_UTF
621	GETCHARLENTEST(c, ptr, clen);
622	#else
623	c = *ptr;
624	#endif /* SUPPORT_UTF */
625	}
626	else
627	{
628	clen = `0`; / This indicates the end of the subject /
629	c = NOTACHAR; / This value should never actually be used /
630	}
631
632	/ Scan up the active states and act on each one. The result of an action*
633	may be to add more states to the currently active list (e.g. on hitting a
634	parenthesis) or it may be to put states on the new list, for considering
635	when we move the character pointer on. /*
636
637	for (i = `0`; i < active_count; i++)
638	{
639	stateblock *current_state = active_states + i;
640	BOOL caseless = FALSE;
641	const pcre_uchar *code;
642	int state_offset = current_state->offset;
643	int codevalue, rrc;
644	int count;
645
646	#ifdef PCRE_DEBUG
647	printf ("%.sProcessing state %d c=", rlevel`2`-`2`, SP, state_offset);
648	if (clen == `0`) printf("EOL\n");
649	else if (c > `32` && c < `127`) printf("'%c'\n", c);
650	else printf("0x%02x\n", c);
651	#endif
652
653	/ A negative offset is a special case meaning "hold off going to this*
654	(negated) state until the number of characters in the data field have
655	been skipped". If the could_continue flag was passed over from a previous
656	state, arrange for it to passed on. /*
657
658	if (state_offset < `0`)
659	{
660	if (current_state->data > `0`)
661	{
662	DPRINTF(("%.sSkipping this character\n", rlevel`2`-`2`, SP));
663	ADD_NEW_DATA(state_offset, current_state->count,
664	current_state->data - `1`);
665	if (could_continue) reset_could_continue = TRUE;
666	continue;
667	}
668	else
669	{
670	current_state->offset = state_offset = -state_offset;
671	}
672	}
673
674	/ Check for a duplicate state with the same count, and skip if found.*
675	See the note at the head of this module about the possibility of improving
676	performance here. /*
677
678	for (j = `0`; j < i; j++)
679	{
680	if (active_states[j].offset == state_offset &&
681	active_states[j].count == current_state->count)
682	{
683	DPRINTF(("%.sDuplicate state: skipped\n", rlevel`2`-`2`, SP));
684	goto NEXT_ACTIVE_STATE;
685	}
686	}
687
688	/ The state offset is the offset to the opcode /
689
690	code = start_code + state_offset;
691	codevalue = *code;
692
693	/ If this opcode inspects a character, but we are at the end of the*
694	subject, remember the fact for use when testing for a partial match. /*
695
696	if (clen == `0` && poptable[codevalue] != `0`)
697	could_continue = TRUE;
698
699	/ If this opcode is followed by an inline character, load it. It is*
700	tempting to test for the presence of a subject character here, but that
701	is wrong, because sometimes zero repetitions of the subject are
702	permitted.
703
704	We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
705	argument that is not a data character - but is always one byte long because
706	the values are small. We have to take special action to deal with \P, \p,
707	\H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
708	these ones to new opcodes. /*
709
710	if (coptable[codevalue] > `0`)
711	{
712	dlen = `1`;
713	#ifdef SUPPORT_UTF
714	if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
715	#endif /* SUPPORT_UTF */
716	d = code[coptable[codevalue]];
717	if (codevalue >= OP_TYPESTAR)
718	{
719	switch(d)
720	{
721	case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
722	case OP_NOTPROP:
723	case OP_PROP: codevalue += OP_PROP_EXTRA; break;
724	case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
725	case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
726	case OP_NOT_HSPACE:
727	case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
728	case OP_NOT_VSPACE:
729	case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
730	default: break;
731	}
732	}
733	}
734	else
735	{
736	dlen = `0`; / Not strictly necessary, but compilers moan /
737	d = NOTACHAR; / if these variables are not set. /
738	}
739
740
741	/ Now process the individual opcodes /
742
743	switch (codevalue)
744	{
745	/ ========================================================================== /
746	/ These cases are never obeyed. This is a fudge that causes a compile-*
747	time error if the vectors coptable or poptable, which are indexed by
748	opcode, are not the correct length. It seems to be the only way to do
749	such a check at compile time, as the sizeof() operator does not work
750	in the C preprocessor. /*
751
752	case OP_TABLE_LENGTH:
753	case OP_TABLE_LENGTH +
754	((sizeof(coptable) == OP_TABLE_LENGTH) &&
755	(sizeof(poptable) == OP_TABLE_LENGTH)):
756	break;
757
758	/ ========================================================================== /
759	/ Reached a closing bracket. If not at the end of the pattern, carry*
760	on with the next opcode. For repeating opcodes, also add the repeat
761	state. Note that KETRPOS will always be encountered at the end of the
762	subpattern, because the possessive subpattern repeats are always handled
763	using recursive calls. Thus, it never adds any new states.
764
765	At the end of the (sub)pattern, unless we have an empty string and
766	PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
767	start of the subject, save the match data, shifting up all previous
768	matches so we always have the longest first. /*
769
770	case OP_KET:
771	case OP_KETRMIN:
772	case OP_KETRMAX:
773	case OP_KETRPOS:
774	if (code != end_code)
775	{
776	ADD_ACTIVE(state_offset + `1` + LINK_SIZE, `0`);
777	if (codevalue != OP_KET)
778	{
779	ADD_ACTIVE(state_offset - GET(code, `1`), `0`);
780	}
781	}
782	else
783	{
784	if (ptr > current_subject \|\|
785	((md->moptions & PCRE_NOTEMPTY) == `0` &&
786	((md->moptions & PCRE_NOTEMPTY_ATSTART) == `0` \|\|
787	current_subject > start_subject + md->start_offset)))
788	{
789	if (match_count < `0`) match_count = (offsetcount >= `2`)? `1` : `0`;
790	else if (match_count > `0` && ++match_count * `2` > offsetcount)
791	match_count = `0`;
792	count = ((match_count == `0`)? offsetcount : match_count * `2`) - `2`;
793	if (count > `0`) memmove(offsets + `2`, offsets, count * sizeof(int));
794	if (offsetcount >= `2`)
795	{
796	offsets[`0`] = (int)(current_subject - start_subject);
797	offsets[`1`] = (int)(ptr - start_subject);
798	DPRINTF(("%.sSet matched string = \"%.s\"\n", rlevel*`2`-`2`, SP,
799	offsets[`1`] - offsets[`0`], (char *)current_subject));
800	}
801	if ((md->moptions & PCRE_DFA_SHORTEST) != `0`)
802	{
803	DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
804	"%.s---------------------\n\n", rlevel`2`-`2`, SP, rlevel,
805	match_count, rlevel*`2`-`2`, SP));
806	return match_count;
807	}
808	}
809	}
810	break;
811
812	/ ========================================================================== /
813	/ These opcodes add to the current list of states without looking*
814	at the current character. /*
815
816	/-----------------------------------------------------------------/
817	case OP_ALT:
818	do { code += GET(code, `1`); } while (*code == OP_ALT);
819	ADD_ACTIVE((int)(code - start_code), `0`);
820	break;
821
822	/-----------------------------------------------------------------/
823	case OP_BRA:
824	case OP_SBRA:
825	do
826	{
827	ADD_ACTIVE((int)(code - start_code + `1` + LINK_SIZE), `0`);
828	code += GET(code, `1`);
829	}
830	while (*code == OP_ALT);
831	break;
832
833	/-----------------------------------------------------------------/
834	case OP_CBRA:
835	case OP_SCBRA:
836	ADD_ACTIVE((int)(code - start_code + `1` + LINK_SIZE + IMM2_SIZE), `0`);
837	code += GET(code, `1`);
838	while (*code == OP_ALT)
839	{
840	ADD_ACTIVE((int)(code - start_code + `1` + LINK_SIZE), `0`);
841	code += GET(code, `1`);
842	}
843	break;
844
845	/-----------------------------------------------------------------/
846	case OP_BRAZERO:
847	case OP_BRAMINZERO:
848	ADD_ACTIVE(state_offset + `1`, `0`);
849	code += `1` + GET(code, `2`);
850	while (*code == OP_ALT) code += GET(code, `1`);
851	ADD_ACTIVE((int)(code - start_code + `1` + LINK_SIZE), `0`);
852	break;
853
854	/-----------------------------------------------------------------/
855	case OP_SKIPZERO:
856	code += `1` + GET(code, `2`);
857	while (*code == OP_ALT) code += GET(code, `1`);
858	ADD_ACTIVE((int)(code - start_code + `1` + LINK_SIZE), `0`);
859	break;
860
861	/-----------------------------------------------------------------/
862	case OP_CIRC:
863	if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == `0`)
864	{ ADD_ACTIVE(state_offset + `1`, `0`); }
865	break;
866
867	/-----------------------------------------------------------------/
868	case OP_CIRCM:
869	if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == `0`) \|\|
870	(ptr != end_subject && WAS_NEWLINE(ptr)))
871	{ ADD_ACTIVE(state_offset + `1`, `0`); }
872	break;
873
874	/-----------------------------------------------------------------/
875	case OP_EOD:
876	if (ptr >= end_subject)
877	{
878	if ((md->moptions & PCRE_PARTIAL_HARD) != `0`)
879	could_continue = TRUE;
880	else { ADD_ACTIVE(state_offset + `1`, `0`); }
881	}
882	break;
883
884	/-----------------------------------------------------------------/
885	case OP_SOD:
886	if (ptr == start_subject) { ADD_ACTIVE(state_offset + `1`, `0`); }
887	break;
888
889	/-----------------------------------------------------------------/
890	case OP_SOM:
891	if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + `1`, `0`); }
892	break;
893
894
895	/* ========================================================================== */
896	/* These opcodes inspect the next subject character, and sometimes
897	the previous one as well, but do not have an argument. The variable
898	clen contains the length of the current character and is zero if we are
899	at the end of the subject. */
900
901	/-----------------------------------------------------------------/
902	case OP_ANY:
903	if (clen > `0` && !IS_NEWLINE(ptr))
904	{
905	if (ptr + `1` >= md->end_subject &&
906	(md->moptions & (PCRE_PARTIAL_HARD)) != `0` &&
907	NLBLOCK->nltype == NLTYPE_FIXED &&
908	NLBLOCK->nllen == `2` &&
909	c == NLBLOCK->nl[`0`])
910	{
911	could_continue = partial_newline = TRUE;
912	}
913	else
914	{
915	ADD_NEW(state_offset + `1`, `0`);
916	}
917	}
918	break;
919
920	/-----------------------------------------------------------------/
921	case OP_ALLANY:
922	if (clen > `0`)
923	{ ADD_NEW(state_offset + `1`, `0`); }
924	break;
925
926	/-----------------------------------------------------------------/
927	case OP_EODN:
928	if (clen == `0` && (md->moptions & PCRE_PARTIAL_HARD) != `0`)
929	could_continue = TRUE;
930	else if (clen == `0` \|\| (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
931	{ ADD_ACTIVE(state_offset + `1`, `0`); }
932	break;
933
934	/-----------------------------------------------------------------/
935	case OP_DOLL:
936	if ((md->moptions & PCRE_NOTEOL) == `0`)
937	{
938	if (clen == `0` && (md->moptions & PCRE_PARTIAL_HARD) != `0`)
939	could_continue = TRUE;
940	else if (clen == `0` \|\|
941	((md->poptions & PCRE_DOLLAR_ENDONLY) == `0` && IS_NEWLINE(ptr) &&
942	(ptr == end_subject - md->nllen)
943	))
944	{ ADD_ACTIVE(state_offset + `1`, `0`); }
945	else if (ptr + `1` >= md->end_subject &&
946	(md->moptions & (PCRE_PARTIAL_HARD\|PCRE_PARTIAL_SOFT)) != `0` &&
947	NLBLOCK->nltype == NLTYPE_FIXED &&
948	NLBLOCK->nllen == `2` &&
949	c == NLBLOCK->nl[`0`])
950	{
951	if ((md->moptions & PCRE_PARTIAL_HARD) != `0`)
952	{
953	reset_could_continue = TRUE;
954	ADD_NEW_DATA(-(state_offset + `1`), `0`, `1`);
955	}
956	else could_continue = partial_newline = TRUE;
957	}
958	}
959	break;
960
961	/-----------------------------------------------------------------/
962	case OP_DOLLM:
963	if ((md->moptions & PCRE_NOTEOL) == `0`)
964	{
965	if (clen == `0` && (md->moptions & PCRE_PARTIAL_HARD) != `0`)
966	could_continue = TRUE;
967	else if (clen == `0` \|\|
968	((md->poptions & PCRE_DOLLAR_ENDONLY) == `0` && IS_NEWLINE(ptr)))
969	{ ADD_ACTIVE(state_offset + `1`, `0`); }
970	else if (ptr + `1` >= md->end_subject &&
971	(md->moptions & (PCRE_PARTIAL_HARD\|PCRE_PARTIAL_SOFT)) != `0` &&
972	NLBLOCK->nltype == NLTYPE_FIXED &&
973	NLBLOCK->nllen == `2` &&
974	c == NLBLOCK->nl[`0`])
975	{
976	if ((md->moptions & PCRE_PARTIAL_HARD) != `0`)
977	{
978	reset_could_continue = TRUE;
979	ADD_NEW_DATA(-(state_offset + `1`), `0`, `1`);
980	}
981	else could_continue = partial_newline = TRUE;
982	}
983	}
984	else if (IS_NEWLINE(ptr))
985	{ ADD_ACTIVE(state_offset + `1`, `0`); }
986	break;
987
988	/-----------------------------------------------------------------/
989
990	case OP_DIGIT:
991	case OP_WHITESPACE:
992	case OP_WORDCHAR:
993	if (clen > `0` && c < `256` &&
994	((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != `0`)
995	{ ADD_NEW(state_offset + `1`, `0`); }
996	break;
997
998	/-----------------------------------------------------------------/
999	case OP_NOT_DIGIT:
1000	case OP_NOT_WHITESPACE:
1001	case OP_NOT_WORDCHAR:
1002	if (clen > `0` && (c >= `256` \|\|
1003	((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != `0`))
1004	{ ADD_NEW(state_offset + `1`, `0`); }
1005	break;
1006
1007	/-----------------------------------------------------------------/
1008	case OP_WORD_BOUNDARY:
1009	case OP_NOT_WORD_BOUNDARY:
1010	{
1011	int left_word, right_word;
1012
1013	if (ptr > start_subject)
1014	{
1015	const pcre_uchar *temp = ptr - `1`;
1016	if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1017	#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1018	if (utf) { BACKCHAR(temp); }
1019	#endif
1020	GETCHARTEST(d, temp);
1021	#ifdef SUPPORT_UCP
1022	if ((md->poptions & PCRE_UCP) != `0`)
1023	{
1024	if (d == `'_'`) left_word = TRUE; else
1025	{
1026	int cat = UCD_CATEGORY(d);
1027	left_word = (cat == ucp_L \|\| cat == ucp_N);
1028	}
1029	}
1030	else
1031	#endif
1032	left_word = d < `256` && (ctypes[d] & ctype_word) != `0`;
1033	}
1034	else left_word = FALSE;
1035
1036	if (clen > `0`)
1037	{
1038	#ifdef SUPPORT_UCP
1039	if ((md->poptions & PCRE_UCP) != `0`)
1040	{
1041	if (c == `'_'`) right_word = TRUE; else
1042	{
1043	int cat = UCD_CATEGORY(c);
1044	right_word = (cat == ucp_L \|\| cat == ucp_N);
1045	}
1046	}
1047	else
1048	#endif
1049	right_word = c < `256` && (ctypes[c] & ctype_word) != `0`;
1050	}
1051	else right_word = FALSE;
1052
1053	if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1054	{ ADD_ACTIVE(state_offset + `1`, `0`); }
1055	}
1056	break;
1057
1058
1059	/-----------------------------------------------------------------/
1060	/* Check the next character by Unicode property. We will get here only
1061	if the support is in the binary; otherwise a compile-time error occurs.
1062	*/
1063
1064	#ifdef SUPPORT_UCP
1065	case OP_PROP:
1066	case OP_NOTPROP:
1067	if (clen > `0`)
1068	{
1069	BOOL OK;
1070	const pcre_uint32 *cp;
1071	const ucd_record * prop = GET_UCD(c);
1072	switch(code[`1`])
1073	{
1074	case PT_ANY:
1075	OK = TRUE;
1076	break;
1077
1078	case PT_LAMP:
1079	OK = prop->chartype == ucp_Lu \|\| prop->chartype == ucp_Ll \|\|
1080	prop->chartype == ucp_Lt;
1081	break;
1082
1083	case PT_GC:
1084	OK = PRIV(ucp_gentype)[prop->chartype] == code[`2`];
1085	break;
1086
1087	case PT_PC:
1088	OK = prop->chartype == code[`2`];
1089	break;
1090
1091	case PT_SC:
1092	OK = prop->script == code[`2`];
1093	break;
1094
1095	/ These are specials for combination cases. /
1096
1097	case PT_ALNUM:
1098	OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L \|\|
1099	PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1100	break;
1101
1102	/* Perl space used to exclude VT, but from Perl 5.18 it is included,
1103	which means that Perl space and POSIX space are now identical. PCRE
1104	was changed at release 8.34. */
1105
1106	case PT_SPACE: / Perl space /
1107	case PT_PXSPACE: / POSIX space /
1108	switch(c)
1109	{
1110	HSPACE_CASES:
1111	VSPACE_CASES:
1112	OK = TRUE;
1113	break;
1114
1115	default:
1116	OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1117	break;
1118	}
1119	break;
1120
1121	case PT_WORD:
1122	OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L \|\|
1123	PRIV(ucp_gentype)[prop->chartype] == ucp_N \|\|
1124	c == CHAR_UNDERSCORE;
1125	break;
1126
1127	case PT_CLIST:
1128	cp = PRIV(ucd_caseless_sets) + code[`2`];
1129	for (;;)
1130	{
1131	if (c < cp) { OK = FALSE; break*; }
1132	if (c == cp++) { OK = TRUE; break*; }
1133	}
1134	break;
1135
1136	case PT_UCNC:
1137	OK = c == CHAR_DOLLAR_SIGN \|\| c == CHAR_COMMERCIAL_AT \|\|
1138	c == CHAR_GRAVE_ACCENT \|\| (c >= `0xa0` && c <= `0xd7ff`) \|\|
1139	c >= `0xe000`;
1140	break;
1141
1142	/ Should never occur, but keep compilers from grumbling. /
1143
1144	default:
1145	OK = codevalue != OP_PROP;
1146	break;
1147	}
1148
1149	if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + `3`, `0`); }
1150	}
1151	break;
1152	#endif
1153
1154
1155
1156	/* ========================================================================== */
1157	/* These opcodes likewise inspect the subject character, but have an
1158	argument that is not a data character. It is one of these opcodes:
1159	OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1160	OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1161
1162	case OP_TYPEPLUS:
1163	case OP_TYPEMINPLUS:
1164	case OP_TYPEPOSPLUS:
1165	count = current_state->count; / Already matched /
1166	if (count > `0`) { ADD_ACTIVE(state_offset + `2`, `0`); }
1167	if (clen > `0`)
1168	{
1169	if (d == OP_ANY && ptr + `1` >= md->end_subject &&
1170	(md->moptions & (PCRE_PARTIAL_HARD)) != `0` &&
1171	NLBLOCK->nltype == NLTYPE_FIXED &&
1172	NLBLOCK->nllen == `2` &&
1173	c == NLBLOCK->nl[`0`])
1174	{
1175	could_continue = partial_newline = TRUE;
1176	}
1177	else if ((c >= `256` && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) \|\|
1178	(c < `256` &&
1179	(d != OP_ANY \|\| !IS_NEWLINE(ptr)) &&
1180	((ctypes[c] & toptable1[d]) ^ toptable2[d]) != `0`))
1181	{
1182	if (count > `0` && codevalue == OP_TYPEPOSPLUS)
1183	{
1184	active_count--; / Remove non-match possibility /
1185	next_active_state--;
1186	}
1187	count++;
1188	ADD_NEW(state_offset, count);
1189	}
1190	}
1191	break;
1192
1193	/-----------------------------------------------------------------/
1194	case OP_TYPEQUERY:
1195	case OP_TYPEMINQUERY:
1196	case OP_TYPEPOSQUERY:
1197	ADD_ACTIVE(state_offset + `2`, `0`);
1198	if (clen > `0`)
1199	{
1200	if (d == OP_ANY && ptr + `1` >= md->end_subject &&
1201	(md->moptions & (PCRE_PARTIAL_HARD)) != `0` &&
1202	NLBLOCK->nltype == NLTYPE_FIXED &&
1203	NLBLOCK->nllen == `2` &&
1204	c == NLBLOCK->nl[`0`])
1205	{
1206	could_continue = partial_newline = TRUE;
1207	}
1208	else if ((c >= `256` && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) \|\|
1209	(c < `256` &&
1210	(d != OP_ANY \|\| !IS_NEWLINE(ptr)) &&
1211	((ctypes[c] & toptable1[d]) ^ toptable2[d]) != `0`))
1212	{
1213	if (codevalue == OP_TYPEPOSQUERY)
1214	{
1215	active_count--; / Remove non-match possibility /
1216	next_active_state--;
1217	}
1218	ADD_NEW(state_offset + `2`, `0`);
1219	}
1220	}
1221	break;
1222
1223	/-----------------------------------------------------------------/
1224	case OP_TYPESTAR:
1225	case OP_TYPEMINSTAR:
1226	case OP_TYPEPOSSTAR:
1227	ADD_ACTIVE(state_offset + `2`, `0`);
1228	if (clen > `0`)
1229	{
1230	if (d == OP_ANY && ptr + `1` >= md->end_subject &&
1231	(md->moptions & (PCRE_PARTIAL_HARD)) != `0` &&
1232	NLBLOCK->nltype == NLTYPE_FIXED &&
1233	NLBLOCK->nllen == `2` &&
1234	c == NLBLOCK->nl[`0`])
1235	{
1236	could_continue = partial_newline = TRUE;
1237	}
1238	else if ((c >= `256` && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) \|\|
1239	(c < `256` &&
1240	(d != OP_ANY \|\| !IS_NEWLINE(ptr)) &&
1241	((ctypes[c] & toptable1[d]) ^ toptable2[d]) != `0`))
1242	{
1243	if (codevalue == OP_TYPEPOSSTAR)
1244	{
1245	active_count--; / Remove non-match possibility /
1246	next_active_state--;
1247	}
1248	ADD_NEW(state_offset, `0`);
1249	}
1250	}
1251	break;
1252
1253	/-----------------------------------------------------------------/
1254	case OP_TYPEEXACT:
1255	count = current_state->count; / Number already matched /
1256	if (clen > `0`)
1257	{
1258	if (d == OP_ANY && ptr + `1` >= md->end_subject &&
1259	(md->moptions & (PCRE_PARTIAL_HARD)) != `0` &&
1260	NLBLOCK->nltype == NLTYPE_FIXED &&
1261	NLBLOCK->nllen == `2` &&
1262	c == NLBLOCK->nl[`0`])
1263	{
1264	could_continue = partial_newline = TRUE;
1265	}
1266	else if ((c >= `256` && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) \|\|
1267	(c < `256` &&
1268	(d != OP_ANY \|\| !IS_NEWLINE(ptr)) &&
1269	((ctypes[c] & toptable1[d]) ^ toptable2[d]) != `0`))
1270	{
1271	if (++count >= (int)GET2(code, `1`))
1272	{ ADD_NEW(state_offset + `1` + IMM2_SIZE + `1`, `0`); }
1273	else
1274	{ ADD_NEW(state_offset, count); }
1275	}
1276	}
1277	break;
1278
1279	/-----------------------------------------------------------------/
1280	case OP_TYPEUPTO:
1281	case OP_TYPEMINUPTO:
1282	case OP_TYPEPOSUPTO:
1283	ADD_ACTIVE(state_offset + `2` + IMM2_SIZE, `0`);
1284	count = current_state->count; / Number already matched /
1285	if (clen > `0`)
1286	{
1287	if (d == OP_ANY && ptr + `1` >= md->end_subject &&
1288	(md->moptions & (PCRE_PARTIAL_HARD)) != `0` &&
1289	NLBLOCK->nltype == NLTYPE_FIXED &&
1290	NLBLOCK->nllen == `2` &&
1291	c == NLBLOCK->nl[`0`])
1292	{
1293	could_continue = partial_newline = TRUE;
1294	}
1295	else if ((c >= `256` && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) \|\|
1296	(c < `256` &&
1297	(d != OP_ANY \|\| !IS_NEWLINE(ptr)) &&
1298	((ctypes[c] & toptable1[d]) ^ toptable2[d]) != `0`))
1299	{
1300	if (codevalue == OP_TYPEPOSUPTO)
1301	{
1302	active_count--; / Remove non-match possibility /
1303	next_active_state--;
1304	}
1305	if (++count >= (int)GET2(code, `1`))
1306	{ ADD_NEW(state_offset + `2` + IMM2_SIZE, `0`); }
1307	else
1308	{ ADD_NEW(state_offset, count); }
1309	}
1310	}
1311	break;
1312
1313	/* ========================================================================== */
1314	/* These are virtual opcodes that are used when something like
1315	OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_VSPACE, or
1316	OP_HSPACE as its argument. It keeps the code above fast for the other
1317	cases. The argument is in the d variable. */
1318
1319	#ifdef SUPPORT_UCP
1320	case OP_PROP_EXTRA + OP_TYPEPLUS:
1321	case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1322	case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1323	count = current_state->count; / Already matched /
1324	if (count > `0`) { ADD_ACTIVE(state_offset + `4`, `0`); }
1325	if (clen > `0`)
1326	{
1327	BOOL OK;
1328	const pcre_uint32 *cp;
1329	const ucd_record * prop = GET_UCD(c);
1330	switch(code[`2`])
1331	{
1332	case PT_ANY:
1333	OK = TRUE;
1334	break;
1335
1336	case PT_LAMP:
1337	OK = prop->chartype == ucp_Lu \|\| prop->chartype == ucp_Ll \|\|
1338	prop->chartype == ucp_Lt;
1339	break;
1340
1341	case PT_GC:
1342	OK = PRIV(ucp_gentype)[prop->chartype] == code[`3`];
1343	break;
1344
1345	case PT_PC:
1346	OK = prop->chartype == code[`3`];
1347	break;
1348
1349	case PT_SC:
1350	OK = prop->script == code[`3`];
1351	break;
1352
1353	/ These are specials for combination cases. /
1354
1355	case PT_ALNUM:
1356	OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L \|\|
1357	PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1358	break;
1359
1360	/* Perl space used to exclude VT, but from Perl 5.18 it is included,
1361	which means that Perl space and POSIX space are now identical. PCRE
1362	was changed at release 8.34. */
1363
1364	case PT_SPACE: / Perl space /
1365	case PT_PXSPACE: / POSIX space /
1366	switch(c)
1367	{
1368	HSPACE_CASES:
1369	VSPACE_CASES:
1370	OK = TRUE;
1371	break;
1372
1373	default:
1374	OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1375	break;
1376	}
1377	break;
1378
1379	case PT_WORD:
1380	OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L \|\|
1381	PRIV(ucp_gentype)[prop->chartype] == ucp_N \|\|
1382	c == CHAR_UNDERSCORE;
1383	break;
1384
1385	case PT_CLIST:
1386	cp = PRIV(ucd_caseless_sets) + code[`3`];
1387	for (;;)
1388	{
1389	if (c < cp) { OK = FALSE; break*; }
1390	if (c == cp++) { OK = TRUE; break*; }
1391	}
1392	break;
1393
1394	case PT_UCNC:
1395	OK = c == CHAR_DOLLAR_SIGN \|\| c == CHAR_COMMERCIAL_AT \|\|
1396	c == CHAR_GRAVE_ACCENT \|\| (c >= `0xa0` && c <= `0xd7ff`) \|\|
1397	c >= `0xe000`;
1398	break;
1399
1400	/ Should never occur, but keep compilers from grumbling. /
1401
1402	default:
1403	OK = codevalue != OP_PROP;
1404	break;
1405	}
1406
1407	if (OK == (d == OP_PROP))
1408	{
1409	if (count > `0` && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1410	{
1411	active_count--; / Remove non-match possibility /
1412	next_active_state--;
1413	}
1414	count++;
1415	ADD_NEW(state_offset, count);
1416	}
1417	}
1418	break;
1419
1420	/-----------------------------------------------------------------/
1421	case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1422	case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1423	case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1424	count = current_state->count; / Already matched /
1425	if (count > `0`) { ADD_ACTIVE(state_offset + `2`, `0`); }
1426	if (clen > `0`)
1427	{
1428	int lgb, rgb;
1429	const pcre_uchar *nptr = ptr + clen;
1430	int ncount = `0`;
1431	if (count > `0` && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1432	{
1433	active_count--; / Remove non-match possibility /
1434	next_active_state--;
1435	}
1436	lgb = UCD_GRAPHBREAK(c);
1437	while (nptr < end_subject)
1438	{
1439	dlen = `1`;
1440	if (!utf) d = nptr; else* { GETCHARLEN(d, nptr, dlen); }
1441	rgb = UCD_GRAPHBREAK(d);
1442	if ((PRIV(ucp_gbtable)[lgb] & (`1` << rgb)) == `0`) break;
1443	ncount++;
1444	lgb = rgb;
1445	nptr += dlen;
1446	}
1447	count++;
1448	ADD_NEW_DATA(-state_offset, count, ncount);
1449	}
1450	break;
1451	#endif
1452
1453	/-----------------------------------------------------------------/
1454	case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1455	case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1456	case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1457	count = current_state->count; / Already matched /
1458	if (count > `0`) { ADD_ACTIVE(state_offset + `2`, `0`); }
1459	if (clen > `0`)
1460	{
1461	int ncount = `0`;
1462	switch (c)
1463	{
1464	case CHAR_VT:
1465	case CHAR_FF:
1466	case CHAR_NEL:
1467	#ifndef EBCDIC
1468	case `0x2028`:
1469	case `0x2029`:
1470	#endif /* Not EBCDIC */
1471	if ((md->moptions & PCRE_BSR_ANYCRLF) != `0`) break;
1472	goto ANYNL01;
1473
1474	case CHAR_CR:
1475	if (ptr + `1` < end_subject && UCHAR21TEST(ptr + `1`) == CHAR_LF) ncount = `1`;
1476	/ Fall through /
1477
1478	ANYNL01:
1479	case CHAR_LF:
1480	if (count > `0` && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1481	{
1482	active_count--; / Remove non-match possibility /
1483	next_active_state--;
1484	}
1485	count++;
1486	ADD_NEW_DATA(-state_offset, count, ncount);
1487	break;
1488
1489	default:
1490	break;
1491	}
1492	}
1493	break;
1494
1495	/-----------------------------------------------------------------/
1496	case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1497	case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1498	case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1499	count = current_state->count; / Already matched /
1500	if (count > `0`) { ADD_ACTIVE(state_offset + `2`, `0`); }
1501	if (clen > `0`)
1502	{
1503	BOOL OK;
1504	switch (c)
1505	{
1506	VSPACE_CASES:
1507	OK = TRUE;
1508	break;
1509
1510	default:
1511	OK = FALSE;
1512	break;
1513	}
1514
1515	if (OK == (d == OP_VSPACE))
1516	{
1517	if (count > `0` && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1518	{
1519	active_count--; / Remove non-match possibility /
1520	next_active_state--;
1521	}
1522	count++;
1523	ADD_NEW_DATA(-state_offset, count, `0`);
1524	}
1525	}
1526	break;
1527
1528	/-----------------------------------------------------------------/
1529	case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1530	case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1531	case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1532	count = current_state->count; / Already matched /
1533	if (count > `0`) { ADD_ACTIVE(state_offset + `2`, `0`); }
1534	if (clen > `0`)
1535	{
1536	BOOL OK;
1537	switch (c)
1538	{
1539	HSPACE_CASES:
1540	OK = TRUE;
1541	break;
1542
1543	default:
1544	OK = FALSE;
1545	break;
1546	}
1547
1548	if (OK == (d == OP_HSPACE))
1549	{
1550	if (count > `0` && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1551	{
1552	active_count--; / Remove non-match possibility /
1553	next_active_state--;
1554	}
1555	count++;
1556	ADD_NEW_DATA(-state_offset, count, `0`);
1557	}
1558	}
1559	break;
1560
1561	/-----------------------------------------------------------------/
1562	#ifdef SUPPORT_UCP
1563	case OP_PROP_EXTRA + OP_TYPEQUERY:
1564	case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1565	case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1566	count = `4`;
1567	goto QS1;
1568
1569	case OP_PROP_EXTRA + OP_TYPESTAR:
1570	case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1571	case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1572	count = `0`;
1573
1574	QS1:
1575
1576	ADD_ACTIVE(state_offset + `4`, `0`);
1577	if (clen > `0`)
1578	{
1579	BOOL OK;
1580	const pcre_uint32 *cp;
1581	const ucd_record * prop = GET_UCD(c);
1582	switch(code[`2`])
1583	{
1584	case PT_ANY:
1585	OK = TRUE;
1586	break;
1587
1588	case PT_LAMP:
1589	OK = prop->chartype == ucp_Lu \|\| prop->chartype == ucp_Ll \|\|
1590	prop->chartype == ucp_Lt;
1591	break;
1592
1593	case PT_GC:
1594	OK = PRIV(ucp_gentype)[prop->chartype] == code[`3`];
1595	break;
1596
1597	case PT_PC:
1598	OK = prop->chartype == code[`3`];
1599	break;
1600
1601	case PT_SC:
1602	OK = prop->script == code[`3`];
1603	break;
1604
1605	/ These are specials for combination cases. /
1606
1607	case PT_ALNUM:
1608	OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L \|\|
1609	PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1610	break;
1611
1612	/* Perl space used to exclude VT, but from Perl 5.18 it is included,
1613	which means that Perl space and POSIX space are now identical. PCRE
1614	was changed at release 8.34. */
1615
1616	case PT_SPACE: / Perl space /
1617	case PT_PXSPACE: / POSIX space /
1618	switch(c)
1619	{
1620	HSPACE_CASES:
1621	VSPACE_CASES:
1622	OK = TRUE;
1623	break;
1624
1625	default:
1626	OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1627	break;
1628	}
1629	break;
1630
1631	case PT_WORD:
1632	OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L \|\|
1633	PRIV(ucp_gentype)[prop->chartype] == ucp_N \|\|
1634	c == CHAR_UNDERSCORE;
1635	break;
1636
1637	case PT_CLIST:
1638	cp = PRIV(ucd_caseless_sets) + code[`3`];
1639	for (;;)
1640	{
1641	if (c < cp) { OK = FALSE; break*; }
1642	if (c == cp++) { OK = TRUE; break*; }
1643	}
1644	break;
1645
1646	case PT_UCNC:
1647	OK = c == CHAR_DOLLAR_SIGN \|\| c == CHAR_COMMERCIAL_AT \|\|
1648	c == CHAR_GRAVE_ACCENT \|\| (c >= `0xa0` && c <= `0xd7ff`) \|\|
1649	c >= `0xe000`;
1650	break;
1651
1652	/ Should never occur, but keep compilers from grumbling. /
1653
1654	default:
1655	OK = codevalue != OP_PROP;
1656	break;
1657	}
1658
1659	if (OK == (d == OP_PROP))
1660	{
1661	if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR \|\|
1662	codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1663	{
1664	active_count--; / Remove non-match possibility /
1665	next_active_state--;
1666	}
1667	ADD_NEW(state_offset + count, `0`);
1668	}
1669	}
1670	break;
1671
1672	/-----------------------------------------------------------------/
1673	case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1674	case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1675	case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1676	count = `2`;
1677	goto QS2;
1678
1679	case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1680	case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1681	case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1682	count = `0`;
1683
1684	QS2:
1685
1686	ADD_ACTIVE(state_offset + `2`, `0`);
1687	if (clen > `0`)
1688	{
1689	int lgb, rgb;
1690	const pcre_uchar *nptr = ptr + clen;
1691	int ncount = `0`;
1692	if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR \|\|
1693	codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1694	{
1695	active_count--; / Remove non-match possibility /
1696	next_active_state--;
1697	}
1698	lgb = UCD_GRAPHBREAK(c);
1699	while (nptr < end_subject)
1700	{
1701	dlen = `1`;
1702	if (!utf) d = nptr; else* { GETCHARLEN(d, nptr, dlen); }
1703	rgb = UCD_GRAPHBREAK(d);
1704	if ((PRIV(ucp_gbtable)[lgb] & (`1` << rgb)) == `0`) break;
1705	ncount++;
1706	lgb = rgb;
1707	nptr += dlen;
1708	}
1709	ADD_NEW_DATA(-(state_offset + count), `0`, ncount);
1710	}
1711	break;
1712	#endif
1713
1714	/-----------------------------------------------------------------/
1715	case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1716	case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1717	case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1718	count = `2`;
1719	goto QS3;
1720
1721	case OP_ANYNL_EXTRA + OP_TYPESTAR:
1722	case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1723	case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1724	count = `0`;
1725
1726	QS3:
1727	ADD_ACTIVE(state_offset + `2`, `0`);
1728	if (clen > `0`)
1729	{
1730	int ncount = `0`;
1731	switch (c)
1732	{
1733	case CHAR_VT:
1734	case CHAR_FF:
1735	case CHAR_NEL:
1736	#ifndef EBCDIC
1737	case `0x2028`:
1738	case `0x2029`:
1739	#endif /* Not EBCDIC */
1740	if ((md->moptions & PCRE_BSR_ANYCRLF) != `0`) break;
1741	goto ANYNL02;
1742
1743	case CHAR_CR:
1744	if (ptr + `1` < end_subject && UCHAR21TEST(ptr + `1`) == CHAR_LF) ncount = `1`;
1745	/ Fall through /
1746
1747	ANYNL02:
1748	case CHAR_LF:
1749	if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR \|\|
1750	codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1751	{
1752	active_count--; / Remove non-match possibility /
1753	next_active_state--;
1754	}
1755	ADD_NEW_DATA(-(state_offset + (int)count), `0`, ncount);
1756	break;
1757
1758	default:
1759	break;
1760	}
1761	}
1762	break;
1763
1764	/-----------------------------------------------------------------/
1765	case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1766	case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1767	case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1768	count = `2`;
1769	goto QS4;
1770
1771	case OP_VSPACE_EXTRA + OP_TYPESTAR:
1772	case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1773	case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1774	count = `0`;
1775
1776	QS4:
1777	ADD_ACTIVE(state_offset + `2`, `0`);
1778	if (clen > `0`)
1779	{
1780	BOOL OK;
1781	switch (c)
1782	{
1783	VSPACE_CASES:
1784	OK = TRUE;
1785	break;
1786
1787	default:
1788	OK = FALSE;
1789	break;
1790	}
1791	if (OK == (d == OP_VSPACE))
1792	{
1793	if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR \|\|
1794	codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1795	{
1796	active_count--; / Remove non-match possibility /
1797	next_active_state--;
1798	}
1799	ADD_NEW_DATA(-(state_offset + (int)count), `0`, `0`);
1800	}
1801	}
1802	break;
1803
1804	/-----------------------------------------------------------------/
1805	case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1806	case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1807	case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1808	count = `2`;
1809	goto QS5;
1810
1811	case OP_HSPACE_EXTRA + OP_TYPESTAR:
1812	case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1813	case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1814	count = `0`;
1815
1816	QS5:
1817	ADD_ACTIVE(state_offset + `2`, `0`);
1818	if (clen > `0`)
1819	{
1820	BOOL OK;
1821	switch (c)
1822	{
1823	HSPACE_CASES:
1824	OK = TRUE;
1825	break;
1826
1827	default:
1828	OK = FALSE;
1829	break;
1830	}
1831
1832	if (OK == (d == OP_HSPACE))
1833	{
1834	if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR \|\|
1835	codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1836	{
1837	active_count--; / Remove non-match possibility /
1838	next_active_state--;
1839	}
1840	ADD_NEW_DATA(-(state_offset + (int)count), `0`, `0`);
1841	}
1842	}
1843	break;
1844
1845	/-----------------------------------------------------------------/
1846	#ifdef SUPPORT_UCP
1847	case OP_PROP_EXTRA + OP_TYPEEXACT:
1848	case OP_PROP_EXTRA + OP_TYPEUPTO:
1849	case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1850	case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1851	if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1852	{ ADD_ACTIVE(state_offset + `1` + IMM2_SIZE + `3`, `0`); }
1853	count = current_state->count; / Number already matched /
1854	if (clen > `0`)
1855	{
1856	BOOL OK;
1857	const pcre_uint32 *cp;
1858	const ucd_record * prop = GET_UCD(c);
1859	switch(code[`1` + IMM2_SIZE + `1`])
1860	{
1861	case PT_ANY:
1862	OK = TRUE;
1863	break;
1864
1865	case PT_LAMP:
1866	OK = prop->chartype == ucp_Lu \|\| prop->chartype == ucp_Ll \|\|
1867	prop->chartype == ucp_Lt;
1868	break;
1869
1870	case PT_GC:
1871	OK = PRIV(ucp_gentype)[prop->chartype] == code[`1` + IMM2_SIZE + `2`];
1872	break;
1873
1874	case PT_PC:
1875	OK = prop->chartype == code[`1` + IMM2_SIZE + `2`];
1876	break;
1877
1878	case PT_SC:
1879	OK = prop->script == code[`1` + IMM2_SIZE + `2`];
1880	break;
1881
1882	/ These are specials for combination cases. /
1883
1884	case PT_ALNUM:
1885	OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L \|\|
1886	PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1887	break;
1888
1889	/* Perl space used to exclude VT, but from Perl 5.18 it is included,
1890	which means that Perl space and POSIX space are now identical. PCRE
1891	was changed at release 8.34. */
1892
1893	case PT_SPACE: / Perl space /
1894	case PT_PXSPACE: / POSIX space /
1895	switch(c)
1896	{
1897	HSPACE_CASES:
1898	VSPACE_CASES:
1899	OK = TRUE;
1900	break;
1901
1902	default:
1903	OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1904	break;
1905	}
1906	break;
1907
1908	case PT_WORD:
1909	OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L \|\|
1910	PRIV(ucp_gentype)[prop->chartype] == ucp_N \|\|
1911	c == CHAR_UNDERSCORE;
1912	break;
1913
1914	case PT_CLIST:
1915	cp = PRIV(ucd_caseless_sets) + code[`1` + IMM2_SIZE + `2`];
1916	for (;;)
1917	{
1918	if (c < cp) { OK = FALSE; break*; }
1919	if (c == cp++) { OK = TRUE; break*; }
1920	}
1921	break;
1922
1923	case PT_UCNC:
1924	OK = c == CHAR_DOLLAR_SIGN \|\| c == CHAR_COMMERCIAL_AT \|\|
1925	c == CHAR_GRAVE_ACCENT \|\| (c >= `0xa0` && c <= `0xd7ff`) \|\|
1926	c >= `0xe000`;
1927	break;
1928
1929	/ Should never occur, but keep compilers from grumbling. /
1930
1931	default:
1932	OK = codevalue != OP_PROP;
1933	break;
1934	}
1935
1936	if (OK == (d == OP_PROP))
1937	{
1938	if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1939	{
1940	active_count--; / Remove non-match possibility /
1941	next_active_state--;
1942	}
1943	if (++count >= (int)GET2(code, `1`))
1944	{ ADD_NEW(state_offset + `1` + IMM2_SIZE + `3`, `0`); }
1945	else
1946	{ ADD_NEW(state_offset, count); }
1947	}
1948	}
1949	break;
1950
1951	/-----------------------------------------------------------------/
1952	case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1953	case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1954	case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1955	case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1956	if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1957	{ ADD_ACTIVE(state_offset + `2` + IMM2_SIZE, `0`); }
1958	count = current_state->count; / Number already matched /
1959	if (clen > `0`)
1960	{
1961	int lgb, rgb;
1962	const pcre_uchar *nptr = ptr + clen;
1963	int ncount = `0`;
1964	if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1965	{
1966	active_count--; / Remove non-match possibility /
1967	next_active_state--;
1968	}
1969	lgb = UCD_GRAPHBREAK(c);
1970	while (nptr < end_subject)
1971	{
1972	dlen = `1`;
1973	if (!utf) d = nptr; else* { GETCHARLEN(d, nptr, dlen); }
1974	rgb = UCD_GRAPHBREAK(d);
1975	if ((PRIV(ucp_gbtable)[lgb] & (`1` << rgb)) == `0`) break;
1976	ncount++;
1977	lgb = rgb;
1978	nptr += dlen;
1979	}
1980	if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != `0`)
1981	reset_could_continue = TRUE;
1982	if (++count >= (int)GET2(code, `1`))
1983	{ ADD_NEW_DATA(-(state_offset + `2` + IMM2_SIZE), `0`, ncount); }
1984	else
1985	{ ADD_NEW_DATA(-state_offset, count, ncount); }
1986	}
1987	break;
1988	#endif
1989
1990	/-----------------------------------------------------------------/
1991	case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1992	case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1993	case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1994	case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1995	if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1996	{ ADD_ACTIVE(state_offset + `2` + IMM2_SIZE, `0`); }
1997	count = current_state->count; / Number already matched /
1998	if (clen > `0`)
1999	{
2000	int ncount = `0`;
2001	switch (c)
2002	{
2003	case CHAR_VT:
2004	case CHAR_FF:
2005	case CHAR_NEL:
2006	#ifndef EBCDIC
2007	case `0x2028`:
2008	case `0x2029`:
2009	#endif /* Not EBCDIC */
2010	if ((md->moptions & PCRE_BSR_ANYCRLF) != `0`) break;
2011	goto ANYNL03;
2012
2013	case CHAR_CR:
2014	if (ptr + `1` < end_subject && UCHAR21TEST(ptr + `1`) == CHAR_LF) ncount = `1`;
2015	/ Fall through /
2016
2017	ANYNL03:
2018	case CHAR_LF:
2019	if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2020	{
2021	active_count--; / Remove non-match possibility /
2022	next_active_state--;
2023	}
2024	if (++count >= (int)GET2(code, `1`))
2025	{ ADD_NEW_DATA(-(state_offset + `2` + IMM2_SIZE), `0`, ncount); }
2026	else
2027	{ ADD_NEW_DATA(-state_offset, count, ncount); }
2028	break;
2029
2030	default:
2031	break;
2032	}
2033	}
2034	break;
2035
2036	/-----------------------------------------------------------------/
2037	case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2038	case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2039	case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2040	case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2041	if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2042	{ ADD_ACTIVE(state_offset + `2` + IMM2_SIZE, `0`); }
2043	count = current_state->count; / Number already matched /
2044	if (clen > `0`)
2045	{
2046	BOOL OK;
2047	switch (c)
2048	{
2049	VSPACE_CASES:
2050	OK = TRUE;
2051	break;
2052
2053	default:
2054	OK = FALSE;
2055	}
2056
2057	if (OK == (d == OP_VSPACE))
2058	{
2059	if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2060	{
2061	active_count--; / Remove non-match possibility /
2062	next_active_state--;
2063	}
2064	if (++count >= (int)GET2(code, `1`))
2065	{ ADD_NEW_DATA(-(state_offset + `2` + IMM2_SIZE), `0`, `0`); }
2066	else
2067	{ ADD_NEW_DATA(-state_offset, count, `0`); }
2068	}
2069	}
2070	break;
2071
2072	/-----------------------------------------------------------------/
2073	case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2074	case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2075	case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2076	case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2077	if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2078	{ ADD_ACTIVE(state_offset + `2` + IMM2_SIZE, `0`); }
2079	count = current_state->count; / Number already matched /
2080	if (clen > `0`)
2081	{
2082	BOOL OK;
2083	switch (c)
2084	{
2085	HSPACE_CASES:
2086	OK = TRUE;
2087	break;
2088
2089	default:
2090	OK = FALSE;
2091	break;
2092	}
2093
2094	if (OK == (d == OP_HSPACE))
2095	{
2096	if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2097	{
2098	active_count--; / Remove non-match possibility /
2099	next_active_state--;
2100	}
2101	if (++count >= (int)GET2(code, `1`))
2102	{ ADD_NEW_DATA(-(state_offset + `2` + IMM2_SIZE), `0`, `0`); }
2103	else
2104	{ ADD_NEW_DATA(-state_offset, count, `0`); }
2105	}
2106	}
2107	break;
2108
2109	/ ========================================================================== /
2110	/ These opcodes are followed by a character that is usually compared*
2111	to the current subject character; it is loaded into d. We still get
2112	here even if there is no subject character, because in some cases zero
2113	repetitions are permitted. /*
2114
2115	/-----------------------------------------------------------------/
2116	case OP_CHAR:
2117	if (clen > `0` && c == d) { ADD_NEW(state_offset + dlen + `1`, `0`); }
2118	break;
2119
2120	/-----------------------------------------------------------------/
2121	case OP_CHARI:
2122	if (clen == `0`) break;
2123
2124	#ifdef SUPPORT_UTF
2125	if (utf)
2126	{
2127	if (c == d) { ADD_NEW(state_offset + dlen + `1`, `0`); } else
2128	{
2129	unsigned int othercase;
2130	if (c < `128`)
2131	othercase = fcc[c];
2132	else
2133	/ If we have Unicode property support, we can use it to test the*
2134	other case of the character. /*
2135	#ifdef SUPPORT_UCP
2136	othercase = UCD_OTHERCASE(c);
2137	#else
2138	othercase = NOTACHAR;
2139	#endif
2140
2141	if (d == othercase) { ADD_NEW(state_offset + dlen + `1`, `0`); }
2142	}
2143	}
2144	else
2145	#endif /* SUPPORT_UTF */
2146	/ Not UTF mode /
2147	{
2148	if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2149	{ ADD_NEW(state_offset + `2`, `0`); }
2150	}
2151	break;
2152
2153
2154	#ifdef SUPPORT_UCP
2155	/-----------------------------------------------------------------/
2156	/ This is a tricky one because it can match more than one character.*
2157	Find out how many characters to skip, and then set up a negative state
2158	to wait for them to pass before continuing. /*
2159
2160	case OP_EXTUNI:
2161	if (clen > `0`)
2162	{
2163	int lgb, rgb;
2164	const pcre_uchar *nptr = ptr + clen;
2165	int ncount = `0`;
2166	lgb = UCD_GRAPHBREAK(c);
2167	while (nptr < end_subject)
2168	{
2169	dlen = `1`;
2170	if (!utf) d = nptr; else* { GETCHARLEN(d, nptr, dlen); }
2171	rgb = UCD_GRAPHBREAK(d);
2172	if ((PRIV(ucp_gbtable)[lgb] & (`1` << rgb)) == `0`) break;
2173	ncount++;
2174	lgb = rgb;
2175	nptr += dlen;
2176	}
2177	if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != `0`)
2178	reset_could_continue = TRUE;
2179	ADD_NEW_DATA(-(state_offset + `1`), `0`, ncount);
2180	}
2181	break;
2182	#endif
2183
2184	/-----------------------------------------------------------------/
2185	/ This is a tricky like EXTUNI because it too can match more than one*
2186	character (when CR is followed by LF). In this case, set up a negative
2187	state to wait for one character to pass before continuing. /*
2188
2189	case OP_ANYNL:
2190	if (clen > `0`) switch(c)
2191	{
2192	case CHAR_VT:
2193	case CHAR_FF:
2194	case CHAR_NEL:
2195	#ifndef EBCDIC
2196	case `0x2028`:
2197	case `0x2029`:
2198	#endif /* Not EBCDIC */
2199	if ((md->moptions & PCRE_BSR_ANYCRLF) != `0`) break;
2200
2201	case CHAR_LF:
2202	ADD_NEW(state_offset + `1`, `0`);
2203	break;
2204
2205	case CHAR_CR:
2206	if (ptr + `1` >= end_subject)
2207	{
2208	ADD_NEW(state_offset + `1`, `0`);
2209	if ((md->moptions & PCRE_PARTIAL_HARD) != `0`)
2210	reset_could_continue = TRUE;
2211	}
2212	else if (UCHAR21TEST(ptr + `1`) == CHAR_LF)
2213	{
2214	ADD_NEW_DATA(-(state_offset + `1`), `0`, `1`);
2215	}
2216	else
2217	{
2218	ADD_NEW(state_offset + `1`, `0`);
2219	}
2220	break;
2221	}
2222	break;
2223
2224	/-----------------------------------------------------------------/
2225	case OP_NOT_VSPACE:
2226	if (clen > `0`) switch(c)
2227	{
2228	VSPACE_CASES:
2229	break;
2230
2231	default:
2232	ADD_NEW(state_offset + `1`, `0`);
2233	break;
2234	}
2235	break;
2236
2237	/-----------------------------------------------------------------/
2238	case OP_VSPACE:
2239	if (clen > `0`) switch(c)
2240	{
2241	VSPACE_CASES:
2242	ADD_NEW(state_offset + `1`, `0`);
2243	break;
2244
2245	default:
2246	break;
2247	}
2248	break;
2249
2250	/-----------------------------------------------------------------/
2251	case OP_NOT_HSPACE:
2252	if (clen > `0`) switch(c)
2253	{
2254	HSPACE_CASES:
2255	break;
2256
2257	default:
2258	ADD_NEW(state_offset + `1`, `0`);
2259	break;
2260	}
2261	break;
2262
2263	/-----------------------------------------------------------------/
2264	case OP_HSPACE:
2265	if (clen > `0`) switch(c)
2266	{
2267	HSPACE_CASES:
2268	ADD_NEW(state_offset + `1`, `0`);
2269	break;
2270
2271	default:
2272	break;
2273	}
2274	break;
2275
2276	/-----------------------------------------------------------------/
2277	/ Match a negated single character casefully. /
2278
2279	case OP_NOT:
2280	if (clen > `0` && c != d) { ADD_NEW(state_offset + dlen + `1`, `0`); }
2281	break;
2282
2283	/-----------------------------------------------------------------/
2284	/ Match a negated single character caselessly. /
2285
2286	case OP_NOTI:
2287	if (clen > `0`)
2288	{
2289	unsigned int otherd;
2290	#ifdef SUPPORT_UTF
2291	if (utf && d >= `128`)
2292	{
2293	#ifdef SUPPORT_UCP
2294	otherd = UCD_OTHERCASE(d);
2295	#endif /* SUPPORT_UCP */
2296	}
2297	else
2298	#endif /* SUPPORT_UTF */
2299	otherd = TABLE_GET(d, fcc, d);
2300	if (c != d && c != otherd)
2301	{ ADD_NEW(state_offset + dlen + `1`, `0`); }
2302	}
2303	break;
2304
2305	/-----------------------------------------------------------------/
2306	case OP_PLUSI:
2307	case OP_MINPLUSI:
2308	case OP_POSPLUSI:
2309	case OP_NOTPLUSI:
2310	case OP_NOTMINPLUSI:
2311	case OP_NOTPOSPLUSI:
2312	caseless = TRUE;
2313	codevalue -= OP_STARI - OP_STAR;
2314
2315	/ Fall through /
2316	case OP_PLUS:
2317	case OP_MINPLUS:
2318	case OP_POSPLUS:
2319	case OP_NOTPLUS:
2320	case OP_NOTMINPLUS:
2321	case OP_NOTPOSPLUS:
2322	count = current_state->count; / Already matched /
2323	if (count > `0`) { ADD_ACTIVE(state_offset + dlen + `1`, `0`); }
2324	if (clen > `0`)
2325	{
2326	pcre_uint32 otherd = NOTACHAR;
2327	if (caseless)
2328	{
2329	#ifdef SUPPORT_UTF
2330	if (utf && d >= `128`)
2331	{
2332	#ifdef SUPPORT_UCP
2333	otherd = UCD_OTHERCASE(d);
2334	#endif /* SUPPORT_UCP */
2335	}
2336	else
2337	#endif /* SUPPORT_UTF */
2338	otherd = TABLE_GET(d, fcc, d);
2339	}
2340	if ((c == d \|\| c == otherd) == (codevalue < OP_NOTSTAR))
2341	{
2342	if (count > `0` &&
2343	(codevalue == OP_POSPLUS \|\| codevalue == OP_NOTPOSPLUS))
2344	{
2345	active_count--; / Remove non-match possibility /
2346	next_active_state--;
2347	}
2348	count++;
2349	ADD_NEW(state_offset, count);
2350	}
2351	}
2352	break;
2353
2354	/-----------------------------------------------------------------/
2355	case OP_QUERYI:
2356	case OP_MINQUERYI:
2357	case OP_POSQUERYI:
2358	case OP_NOTQUERYI:
2359	case OP_NOTMINQUERYI:
2360	case OP_NOTPOSQUERYI:
2361	caseless = TRUE;
2362	codevalue -= OP_STARI - OP_STAR;
2363	/ Fall through /
2364	case OP_QUERY:
2365	case OP_MINQUERY:
2366	case OP_POSQUERY:
2367	case OP_NOTQUERY:
2368	case OP_NOTMINQUERY:
2369	case OP_NOTPOSQUERY:
2370	ADD_ACTIVE(state_offset + dlen + `1`, `0`);
2371	if (clen > `0`)
2372	{
2373	pcre_uint32 otherd = NOTACHAR;
2374	if (caseless)
2375	{
2376	#ifdef SUPPORT_UTF
2377	if (utf && d >= `128`)
2378	{
2379	#ifdef SUPPORT_UCP
2380	otherd = UCD_OTHERCASE(d);
2381	#endif /* SUPPORT_UCP */
2382	}
2383	else
2384	#endif /* SUPPORT_UTF */
2385	otherd = TABLE_GET(d, fcc, d);
2386	}
2387	if ((c == d \|\| c == otherd) == (codevalue < OP_NOTSTAR))
2388	{
2389	if (codevalue == OP_POSQUERY \|\| codevalue == OP_NOTPOSQUERY)
2390	{
2391	active_count--; / Remove non-match possibility /
2392	next_active_state--;
2393	}
2394	ADD_NEW(state_offset + dlen + `1`, `0`);
2395	}
2396	}
2397	break;
2398
2399	/-----------------------------------------------------------------/
2400	case OP_STARI:
2401	case OP_MINSTARI:
2402	case OP_POSSTARI:
2403	case OP_NOTSTARI:
2404	case OP_NOTMINSTARI:
2405	case OP_NOTPOSSTARI:
2406	caseless = TRUE;
2407	codevalue -= OP_STARI - OP_STAR;
2408	/ Fall through /
2409	case OP_STAR:
2410	case OP_MINSTAR:
2411	case OP_POSSTAR:
2412	case OP_NOTSTAR:
2413	case OP_NOTMINSTAR:
2414	case OP_NOTPOSSTAR:
2415	ADD_ACTIVE(state_offset + dlen + `1`, `0`);
2416	if (clen > `0`)
2417	{
2418	pcre_uint32 otherd = NOTACHAR;
2419	if (caseless)
2420	{
2421	#ifdef SUPPORT_UTF
2422	if (utf && d >= `128`)
2423	{
2424	#ifdef SUPPORT_UCP
2425	otherd = UCD_OTHERCASE(d);
2426	#endif /* SUPPORT_UCP */
2427	}
2428	else
2429	#endif /* SUPPORT_UTF */
2430	otherd = TABLE_GET(d, fcc, d);
2431	}
2432	if ((c == d \|\| c == otherd) == (codevalue < OP_NOTSTAR))
2433	{
2434	if (codevalue == OP_POSSTAR \|\| codevalue == OP_NOTPOSSTAR)
2435	{
2436	active_count--; / Remove non-match possibility /
2437	next_active_state--;
2438	}
2439	ADD_NEW(state_offset, `0`);
2440	}
2441	}
2442	break;
2443
2444	/-----------------------------------------------------------------/
2445	case OP_EXACTI:
2446	case OP_NOTEXACTI:
2447	caseless = TRUE;
2448	codevalue -= OP_STARI - OP_STAR;
2449	/ Fall through /
2450	case OP_EXACT:
2451	case OP_NOTEXACT:
2452	count = current_state->count; / Number already matched /
2453	if (clen > `0`)
2454	{
2455	pcre_uint32 otherd = NOTACHAR;
2456	if (caseless)
2457	{
2458	#ifdef SUPPORT_UTF
2459	if (utf && d >= `128`)
2460	{
2461	#ifdef SUPPORT_UCP
2462	otherd = UCD_OTHERCASE(d);
2463	#endif /* SUPPORT_UCP */
2464	}
2465	else
2466	#endif /* SUPPORT_UTF */
2467	otherd = TABLE_GET(d, fcc, d);
2468	}
2469	if ((c == d \|\| c == otherd) == (codevalue < OP_NOTSTAR))
2470	{
2471	if (++count >= (int)GET2(code, `1`))
2472	{ ADD_NEW(state_offset + dlen + `1` + IMM2_SIZE, `0`); }
2473	else
2474	{ ADD_NEW(state_offset, count); }
2475	}
2476	}
2477	break;
2478
2479	/-----------------------------------------------------------------/
2480	case OP_UPTOI:
2481	case OP_MINUPTOI:
2482	case OP_POSUPTOI:
2483	case OP_NOTUPTOI:
2484	case OP_NOTMINUPTOI:
2485	case OP_NOTPOSUPTOI:
2486	caseless = TRUE;
2487	codevalue -= OP_STARI - OP_STAR;
2488	/ Fall through /
2489	case OP_UPTO:
2490	case OP_MINUPTO:
2491	case OP_POSUPTO:
2492	case OP_NOTUPTO:
2493	case OP_NOTMINUPTO:
2494	case OP_NOTPOSUPTO:
2495	ADD_ACTIVE(state_offset + dlen + `1` + IMM2_SIZE, `0`);
2496	count = current_state->count; / Number already matched /
2497	if (clen > `0`)
2498	{
2499	pcre_uint32 otherd = NOTACHAR;
2500	if (caseless)
2501	{
2502	#ifdef SUPPORT_UTF
2503	if (utf && d >= `128`)
2504	{
2505	#ifdef SUPPORT_UCP
2506	otherd = UCD_OTHERCASE(d);
2507	#endif /* SUPPORT_UCP */
2508	}
2509	else
2510	#endif /* SUPPORT_UTF */
2511	otherd = TABLE_GET(d, fcc, d);
2512	}
2513	if ((c == d \|\| c == otherd) == (codevalue < OP_NOTSTAR))
2514	{
2515	if (codevalue == OP_POSUPTO \|\| codevalue == OP_NOTPOSUPTO)
2516	{
2517	active_count--; / Remove non-match possibility /
2518	next_active_state--;
2519	}
2520	if (++count >= (int)GET2(code, `1`))
2521	{ ADD_NEW(state_offset + dlen + `1` + IMM2_SIZE, `0`); }
2522	else
2523	{ ADD_NEW(state_offset, count); }
2524	}
2525	}
2526	break;
2527
2528
2529	/ ========================================================================== /
2530	/ These are the class-handling opcodes /
2531
2532	case OP_CLASS:
2533	case OP_NCLASS:
2534	case OP_XCLASS:
2535	{
2536	BOOL isinclass = FALSE;
2537	int next_state_offset;
2538	const pcre_uchar *ecode;
2539
2540	/ For a simple class, there is always just a 32-byte table, and we*
2541	can set isinclass from it. /*
2542
2543	if (codevalue != OP_XCLASS)
2544	{
2545	ecode = code + `1` + (`32` / sizeof(pcre_uchar));
2546	if (clen > `0`)
2547	{
2548	isinclass = (c > `255`)? (codevalue == OP_NCLASS) :
2549	((((pcre_uint8 *)(code + `1`))[c/`8`] & (`1` << (c&`7`))) != `0`);
2550	}
2551	}
2552
2553	/ An extended class may have a table or a list of single characters,*
2554	ranges, or both, and it may be positive or negative. There's a
2555	function that sorts all this out. /*
2556
2557	else
2558	{
2559	ecode = code + GET(code, `1`);
2560	if (clen > `0`) isinclass = PRIV(xclass)(c, code + `1` + LINK_SIZE, utf);
2561	}
2562
2563	/ At this point, isinclass is set for all kinds of class, and ecode*
2564	points to the byte after the end of the class. If there is a
2565	quantifier, this is where it will be. /*
2566
2567	next_state_offset = (int)(ecode - start_code);
2568
2569	switch (*ecode)
2570	{
2571	case OP_CRSTAR:
2572	case OP_CRMINSTAR:
2573	case OP_CRPOSSTAR:
2574	ADD_ACTIVE(next_state_offset + `1`, `0`);
2575	if (isinclass)
2576	{
2577	if (*ecode == OP_CRPOSSTAR)
2578	{
2579	active_count--; / Remove non-match possibility /
2580	next_active_state--;
2581	}
2582	ADD_NEW(state_offset, `0`);
2583	}
2584	break;
2585
2586	case OP_CRPLUS:
2587	case OP_CRMINPLUS:
2588	case OP_CRPOSPLUS:
2589	count = current_state->count; / Already matched /
2590	if (count > `0`) { ADD_ACTIVE(next_state_offset + `1`, `0`); }
2591	if (isinclass)
2592	{
2593	if (count > `0` && *ecode == OP_CRPOSPLUS)
2594	{
2595	active_count--; / Remove non-match possibility /
2596	next_active_state--;
2597	}
2598	count++;
2599	ADD_NEW(state_offset, count);
2600	}
2601	break;
2602
2603	case OP_CRQUERY:
2604	case OP_CRMINQUERY:
2605	case OP_CRPOSQUERY:
2606	ADD_ACTIVE(next_state_offset + `1`, `0`);
2607	if (isinclass)
2608	{
2609	if (*ecode == OP_CRPOSQUERY)
2610	{
2611	active_count--; / Remove non-match possibility /
2612	next_active_state--;
2613	}
2614	ADD_NEW(next_state_offset + `1`, `0`);
2615	}
2616	break;
2617
2618	case OP_CRRANGE:
2619	case OP_CRMINRANGE:
2620	case OP_CRPOSRANGE:
2621	count = current_state->count; / Already matched /
2622	if (count >= (int)GET2(ecode, `1`))
2623	{ ADD_ACTIVE(next_state_offset + `1` + `2` * IMM2_SIZE, `0`); }
2624	if (isinclass)
2625	{
2626	int max = (int)GET2(ecode, `1` + IMM2_SIZE);
2627	if (*ecode == OP_CRPOSRANGE)
2628	{
2629	active_count--; / Remove non-match possibility /
2630	next_active_state--;
2631	}
2632	if (++count >= max && max != `0`) / Max 0 => no limit /
2633	{ ADD_NEW(next_state_offset + `1` + `2` * IMM2_SIZE, `0`); }
2634	else
2635	{ ADD_NEW(state_offset, count); }
2636	}
2637	break;
2638
2639	default:
2640	if (isinclass) { ADD_NEW(next_state_offset, `0`); }
2641	break;
2642	}
2643	}
2644	break;
2645
2646	/ ========================================================================== /
2647	/ These are the opcodes for fancy brackets of various kinds. We have*
2648	to use recursion in order to handle them. The "always failing" assertion
2649	(?!) is optimised to OP_FAIL when compiling, so we have to support that,
2650	though the other "backtracking verbs" are not supported. /*
2651
2652	case OP_FAIL:
2653	forced_fail++; / Count FAILs for multiple states /
2654	break;
2655
2656	case OP_ASSERT:
2657	case OP_ASSERT_NOT:
2658	case OP_ASSERTBACK:
2659	case OP_ASSERTBACK_NOT:
2660	{
2661	int rc;
2662	int local_offsets[`2`];
2663	int local_workspace[`1000`];
2664	const pcre_uchar *endasscode = code + GET(code, `1`);
2665
2666	while (*endasscode == OP_ALT) endasscode += GET(endasscode, `1`);
2667
2668	rc = internal_dfa_exec(
2669	md, / static match data /
2670	code, / this subexpression's code /
2671	ptr, / where we currently are /
2672	(int)(ptr - start_subject), / start offset /
2673	local_offsets, / offset vector /
2674	sizeof(local_offsets)/sizeof(int), / size of same /
2675	local_workspace, / workspace vector /
2676	sizeof(local_workspace)/sizeof(int), / size of same /
2677	rlevel); / function recursion level /
2678
2679	if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2680	if ((rc >= `0`) == (codevalue == OP_ASSERT \|\| codevalue == OP_ASSERTBACK))
2681	{ ADD_ACTIVE((int)(endasscode + LINK_SIZE + `1` - start_code), `0`); }
2682	}
2683	break;
2684
2685	/-----------------------------------------------------------------/
2686	case OP_COND:
2687	case OP_SCOND:
2688	{
2689	int local_offsets[`1000`];
2690	int local_workspace[`1000`];
2691	int codelink = GET(code, `1`);
2692	int condcode;
2693
2694	/ Because of the way auto-callout works during compile, a callout item*
2695	is inserted between OP_COND and an assertion condition. This does not
2696	happen for the other conditions. /*
2697
2698	if (code[LINK_SIZE+`1`] == OP_CALLOUT)
2699	{
2700	rrc = `0`;
2701	if (PUBL(callout) != NULL)
2702	{
2703	PUBL(callout_block) cb;
2704	cb.version = `1`; / Version 1 of the callout block /
2705	cb.callout_number = code[LINK_SIZE+`2`];
2706	cb.offset_vector = offsets;
2707	#if defined COMPILE_PCRE8
2708	cb.subject = (PCRE_SPTR)start_subject;
2709	#elif defined COMPILE_PCRE16
2710	cb.subject = (PCRE_SPTR16)start_subject;
2711	#elif defined COMPILE_PCRE32
2712	cb.subject = (PCRE_SPTR32)start_subject;
2713	#endif
2714	cb.subject_length = (int)(end_subject - start_subject);
2715	cb.start_match = (int)(current_subject - start_subject);
2716	cb.current_position = (int)(ptr - start_subject);
2717	cb.pattern_position = GET(code, LINK_SIZE + `3`);
2718	cb.next_item_length = GET(code, `3` + `2`*LINK_SIZE);
2719	cb.capture_top = `1`;
2720	cb.capture_last = -`1`;
2721	cb.callout_data = md->callout_data;
2722	cb.mark = NULL; / No (MARK) support /*
2723	if ((rrc = (PUBL(callout))(&cb)) < `0`) return* rrc; / Abandon /
2724	}
2725	if (rrc > `0`) break; / Fail this thread /
2726	code += PRIV(OP_lengths)[OP_CALLOUT]; / Skip callout data /
2727	}
2728
2729	condcode = code[LINK_SIZE+`1`];
2730
2731	/ Back reference conditions and duplicate named recursion conditions*
2732	are not supported /*
2733
2734	if (condcode == OP_CREF \|\| condcode == OP_DNCREF \|\|
2735	condcode == OP_DNRREF)
2736	return PCRE_ERROR_DFA_UCOND;
2737
2738	/ The DEFINE condition is always false, and the assertion (?!) is*
2739	converted to OP_FAIL. /*
2740
2741	if (condcode == OP_DEF \|\| condcode == OP_FAIL)
2742	{ ADD_ACTIVE(state_offset + codelink + LINK_SIZE + `1`, `0`); }
2743
2744	/ The only supported version of OP_RREF is for the value RREF_ANY,*
2745	which means "test if in any recursion". We can't test for specifically
2746	recursed groups. /*
2747
2748	else if (condcode == OP_RREF)
2749	{
2750	int value = GET2(code, LINK_SIZE + `2`);
2751	if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2752	if (md->recursive != NULL)
2753	{ ADD_ACTIVE(state_offset + LINK_SIZE + `2` + IMM2_SIZE, `0`); }
2754	else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + `1`, `0`); }
2755	}
2756
2757	/ Otherwise, the condition is an assertion /
2758
2759	else
2760	{
2761	int rc;
2762	const pcre_uchar *asscode = code + LINK_SIZE + `1`;
2763	const pcre_uchar *endasscode = asscode + GET(asscode, `1`);
2764
2765	while (*endasscode == OP_ALT) endasscode += GET(endasscode, `1`);
2766
2767	rc = internal_dfa_exec(
2768	md, / fixed match data /
2769	asscode, / this subexpression's code /
2770	ptr, / where we currently are /
2771	(int)(ptr - start_subject), / start offset /
2772	local_offsets, / offset vector /
2773	sizeof(local_offsets)/sizeof(int), / size of same /
2774	local_workspace, / workspace vector /
2775	sizeof(local_workspace)/sizeof(int), / size of same /
2776	rlevel); / function recursion level /
2777
2778	if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2779	if ((rc >= `0`) ==
2780	(condcode == OP_ASSERT \|\| condcode == OP_ASSERTBACK))
2781	{ ADD_ACTIVE((int)(endasscode + LINK_SIZE + `1` - start_code), `0`); }
2782	else
2783	{ ADD_ACTIVE(state_offset + codelink + LINK_SIZE + `1`, `0`); }
2784	}
2785	}
2786	break;
2787
2788	/-----------------------------------------------------------------/
2789	case OP_RECURSE:
2790	{
2791	dfa_recursion_info *ri;
2792	int local_offsets[`1000`];
2793	int local_workspace[`1000`];
2794	const pcre_uchar *callpat = start_code + GET(code, `1`);
2795	int recno = (callpat == md->start_code)? `0` :
2796	GET2(callpat, `1` + LINK_SIZE);
2797	int rc;
2798
2799	DPRINTF(("%.sStarting regex recursion\n", rlevel`2`-`2`, SP));
2800
2801	/ Check for repeating a recursion without advancing the subject*
2802	pointer. This should catch convoluted mutual recursions. (Some simple
2803	cases are caught at compile time.) /*
2804
2805	for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2806	if (recno == ri->group_num && ptr == ri->subject_position)
2807	return PCRE_ERROR_RECURSELOOP;
2808
2809	/ Remember this recursion and where we started it so as to*
2810	catch infinite loops. /*
2811
2812	new_recursive.group_num = recno;
2813	new_recursive.subject_position = ptr;
2814	new_recursive.prevrec = md->recursive;
2815	md->recursive = &new_recursive;
2816
2817	rc = internal_dfa_exec(
2818	md, / fixed match data /
2819	callpat, / this subexpression's code /
2820	ptr, / where we currently are /
2821	(int)(ptr - start_subject), / start offset /
2822	local_offsets, / offset vector /
2823	sizeof(local_offsets)/sizeof(int), / size of same /
2824	local_workspace, / workspace vector /
2825	sizeof(local_workspace)/sizeof(int), / size of same /
2826	rlevel); / function recursion level /
2827
2828	md->recursive = new_recursive.prevrec; / Done this recursion /
2829
2830	DPRINTF(("%.sReturn from regex recursion: rc=%d\n", rlevel`2`-`2`, SP,
2831	rc));
2832
2833	/ Ran out of internal offsets /
2834
2835	if (rc == `0`) return PCRE_ERROR_DFA_RECURSE;
2836
2837	/ For each successful matched substring, set up the next state with a*
2838	count of characters to skip before trying it. Note that the count is in
2839	characters, not bytes. /*
2840
2841	if (rc > `0`)
2842	{
2843	for (rc = rc*`2` - `2`; rc >= `0`; rc -= `2`)
2844	{
2845	int charcount = local_offsets[rc+`1`] - local_offsets[rc];
2846	#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2847	if (utf)
2848	{
2849	const pcre_uchar *p = start_subject + local_offsets[rc];
2850	const pcre_uchar *pp = start_subject + local_offsets[rc+`1`];
2851	while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2852	}
2853	#endif
2854	if (charcount > `0`)
2855	{
2856	ADD_NEW_DATA(-(state_offset + LINK_SIZE + `1`), `0`, (charcount - `1`));
2857	}
2858	else
2859	{
2860	ADD_ACTIVE(state_offset + LINK_SIZE + `1`, `0`);
2861	}
2862	}
2863	}
2864	else if (rc != PCRE_ERROR_NOMATCH) return rc;
2865	}
2866	break;
2867
2868	/-----------------------------------------------------------------/
2869	case OP_BRAPOS:
2870	case OP_SBRAPOS:
2871	case OP_CBRAPOS:
2872	case OP_SCBRAPOS:
2873	case OP_BRAPOSZERO:
2874	{
2875	int charcount, matched_count;
2876	const pcre_uchar *local_ptr = ptr;
2877	BOOL allow_zero;
2878
2879	if (codevalue == OP_BRAPOSZERO)
2880	{
2881	allow_zero = TRUE;
2882	codevalue = (++code); /* Codevalue will be one of above BRAs /
2883	}
2884	else allow_zero = FALSE;
2885
2886	/ Loop to match the subpattern as many times as possible as if it were*
2887	a complete pattern. /*
2888
2889	for (matched_count = `0`;; matched_count++)
2890	{
2891	int local_offsets[`2`];
2892	int local_workspace[`1000`];
2893
2894	int rc = internal_dfa_exec(
2895	md, / fixed match data /
2896	code, / this subexpression's code /
2897	local_ptr, / where we currently are /
2898	(int)(ptr - start_subject), / start offset /
2899	local_offsets, / offset vector /
2900	sizeof(local_offsets)/sizeof(int), / size of same /
2901	local_workspace, / workspace vector /
2902	sizeof(local_workspace)/sizeof(int), / size of same /
2903	rlevel); / function recursion level /
2904
2905	/ Failed to match /
2906
2907	if (rc < `0`)
2908	{
2909	if (rc != PCRE_ERROR_NOMATCH) return rc;
2910	break;
2911	}
2912
2913	/ Matched: break the loop if zero characters matched. /
2914
2915	charcount = local_offsets[`1`] - local_offsets[`0`];
2916	if (charcount == `0`) break;
2917	local_ptr += charcount; / Advance temporary position ptr /
2918	}
2919
2920	/ At this point we have matched the subpattern matched_count*
2921	times, and local_ptr is pointing to the character after the end of the
2922	last match. /*
2923
2924	if (matched_count > `0` \|\| allow_zero)
2925	{
2926	const pcre_uchar *end_subpattern = code;
2927	int next_state_offset;
2928
2929	do { end_subpattern += GET(end_subpattern, `1`); }
2930	while (*end_subpattern == OP_ALT);
2931	next_state_offset =
2932	(int)(end_subpattern - start_code + LINK_SIZE + `1`);
2933
2934	/ Optimization: if there are no more active states, and there*
2935	are no new states yet set up, then skip over the subject string
2936	right here, to save looping. Otherwise, set up the new state to swing
2937	into action when the end of the matched substring is reached. /*
2938
2939	if (i + `1` >= active_count && new_count == `0`)
2940	{
2941	ptr = local_ptr;
2942	clen = `0`;
2943	ADD_NEW(next_state_offset, `0`);
2944	}
2945	else
2946	{
2947	const pcre_uchar *p = ptr;
2948	const pcre_uchar *pp = local_ptr;
2949	charcount = (int)(pp - p);
2950	#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2951	if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2952	#endif
2953	ADD_NEW_DATA(-next_state_offset, `0`, (charcount - `1`));
2954	}
2955	}
2956	}
2957	break;
2958
2959	/-----------------------------------------------------------------/
2960	case OP_ONCE:
2961	case OP_ONCE_NC:
2962	{
2963	int local_offsets[`2`];
2964	int local_workspace[`1000`];
2965
2966	int rc = internal_dfa_exec(
2967	md, / fixed match data /
2968	code, / this subexpression's code /
2969	ptr, / where we currently are /
2970	(int)(ptr - start_subject), / start offset /
2971	local_offsets, / offset vector /
2972	sizeof(local_offsets)/sizeof(int), / size of same /
2973	local_workspace, / workspace vector /
2974	sizeof(local_workspace)/sizeof(int), / size of same /
2975	rlevel); / function recursion level /
2976
2977	if (rc >= `0`)
2978	{
2979	const pcre_uchar *end_subpattern = code;
2980	int charcount = local_offsets[`1`] - local_offsets[`0`];
2981	int next_state_offset, repeat_state_offset;
2982
2983	do { end_subpattern += GET(end_subpattern, `1`); }
2984	while (*end_subpattern == OP_ALT);
2985	next_state_offset =
2986	(int)(end_subpattern - start_code + LINK_SIZE + `1`);
2987
2988	/ If the end of this subpattern is KETRMAX or KETRMIN, we must*
2989	arrange for the repeat state also to be added to the relevant list.
2990	Calculate the offset, or set -1 for no repeat. /*
2991
2992	repeat_state_offset = (*end_subpattern == OP_KETRMAX \|\|
2993	*end_subpattern == OP_KETRMIN)?
2994	(int)(end_subpattern - start_code - GET(end_subpattern, `1`)) : -`1`;
2995
2996	/ If we have matched an empty string, add the next state at the*
2997	current character pointer. This is important so that the duplicate
2998	checking kicks in, which is what breaks infinite loops that match an
2999	empty string. /*
3000
3001	if (charcount == `0`)
3002	{
3003	ADD_ACTIVE(next_state_offset, `0`);
3004	}
3005
3006	/ Optimization: if there are no more active states, and there*
3007	are no new states yet set up, then skip over the subject string
3008	right here, to save looping. Otherwise, set up the new state to swing
3009	into action when the end of the matched substring is reached. /*
3010
3011	else if (i + `1` >= active_count && new_count == `0`)
3012	{
3013	ptr += charcount;
3014	clen = `0`;
3015	ADD_NEW(next_state_offset, `0`);
3016
3017	/ If we are adding a repeat state at the new character position,*
3018	we must fudge things so that it is the only current state.
3019	Otherwise, it might be a duplicate of one we processed before, and
3020	that would cause it to be skipped. /*
3021
3022	if (repeat_state_offset >= `0`)
3023	{
3024	next_active_state = active_states;
3025	active_count = `0`;
3026	i = -`1`;
3027	ADD_ACTIVE(repeat_state_offset, `0`);
3028	}
3029	}
3030	else
3031	{
3032	#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3033	if (utf)
3034	{
3035	const pcre_uchar *p = start_subject + local_offsets[`0`];
3036	const pcre_uchar *pp = start_subject + local_offsets[`1`];
3037	while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
3038	}
3039	#endif
3040	ADD_NEW_DATA(-next_state_offset, `0`, (charcount - `1`));
3041	if (repeat_state_offset >= `0`)
3042	{ ADD_NEW_DATA(-repeat_state_offset, `0`, (charcount - `1`)); }
3043	}
3044	}
3045	else if (rc != PCRE_ERROR_NOMATCH) return rc;
3046	}
3047	break;
3048
3049
3050	/ ========================================================================== /
3051	/ Handle callouts /
3052
3053	case OP_CALLOUT:
3054	rrc = `0`;
3055	if (PUBL(callout) != NULL)
3056	{
3057	PUBL(callout_block) cb;
3058	cb.version = `1`; / Version 1 of the callout block /
3059	cb.callout_number = code[`1`];
3060	cb.offset_vector = offsets;
3061	#if defined COMPILE_PCRE8
3062	cb.subject = (PCRE_SPTR)start_subject;
3063	#elif defined COMPILE_PCRE16
3064	cb.subject = (PCRE_SPTR16)start_subject;
3065	#elif defined COMPILE_PCRE32
3066	cb.subject = (PCRE_SPTR32)start_subject;
3067	#endif
3068	cb.subject_length = (int)(end_subject - start_subject);
3069	cb.start_match = (int)(current_subject - start_subject);
3070	cb.current_position = (int)(ptr - start_subject);
3071	cb.pattern_position = GET(code, `2`);
3072	cb.next_item_length = GET(code, `2` + LINK_SIZE);
3073	cb.capture_top = `1`;
3074	cb.capture_last = -`1`;
3075	cb.callout_data = md->callout_data;
3076	cb.mark = NULL; / No (MARK) support /*
3077	if ((rrc = (PUBL(callout))(&cb)) < `0`) return* rrc; / Abandon /
3078	}
3079	if (rrc == `0`)
3080	{ ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], `0`); }
3081	break;
3082
3083
3084	/ ========================================================================== /
3085	default: / Unsupported opcode /
3086	return PCRE_ERROR_DFA_UITEM;
3087	}
3088
3089	NEXT_ACTIVE_STATE: continue;
3090
3091	} / End of loop scanning active states /
3092
3093	/ We have finished the processing at the current subject character. If no*
3094	new states have been set for the next character, we have found all the
3095	matches that we are going to find. If we are at the top level and partial
3096	matching has been requested, check for appropriate conditions.
3097
3098	The "forced_ fail" variable counts the number of (F) encountered for the*
3099	character. If it is equal to the original active_count (saved in
3100	workspace[1]) it means that (F) was found on every active state. In this*
3101	case we don't want to give a partial match.
3102
3103	The "could_continue" variable is true if a state could have continued but
3104	for the fact that the end of the subject was reached. /*
3105
3106	if (new_count <= `0`)
3107	{
3108	if (rlevel == `1` && / Top level, and /
3109	could_continue && / Some could go on, and /
3110	forced_fail != workspace[`1`] && / Not all forced fail & /
3111	( / either... /
3112	(md->moptions & PCRE_PARTIAL_HARD) != `0` / Hard partial /
3113	\|\| / or... /
3114	((md->moptions & PCRE_PARTIAL_SOFT) != `0` && / Soft partial and /
3115	match_count < `0`) / no matches /
3116	) && / And... /
3117	(
3118	partial_newline \|\| / Either partial NL /
3119	( / or ... /
3120	ptr >= end_subject && / End of subject and /
3121	ptr > md->start_used_ptr) / Inspected non-empty string /
3122	)
3123	)
3124	match_count = PCRE_ERROR_PARTIAL;
3125	DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3126	"%.s---------------------\n\n", rlevel`2`-`2`, SP, rlevel, match_count,
3127	rlevel*`2`-`2`, SP));
3128	break; / In effect, "return", but see the comment below /
3129	}
3130
3131	/ One or more states are active for the next character. /
3132
3133	ptr += clen; / Advance to next subject character /
3134	} / Loop to move along the subject string /
3135
/* Control gets here from "break" a few lines above. We do it this way because
if we use "return" above, we have compiler trouble. Some compilers warn if
there's nothing here because they think the function doesn't return a value. On
the other hand, if we put a dummy statement here, some more clever compilers
complain that it can't be reached. Sigh. */
3141
3142	return match_count;
3143	}
3144
3145
3146
3147
3148	/*************************************************
3149	* Execute a Regular Expression - DFA engine *
3150	*************************************************/
3151
3152	/ This external function applies a compiled re to a subject string using a DFA*
3153	engine. This function calls the internal function multiple times if the pattern
3154	is not anchored.
3155
3156	Arguments:
3157	argument_re points to the compiled expression
3158	extra_data points to extra data or is NULL
3159	subject points to the subject string
3160	length length of subject string (may contain binary zeros)
3161	start_offset where to start in the subject string
3162	options option bits
3163	offsets vector of match offsets
3164	offsetcount size of same
3165	workspace workspace vector
3166	wscount size of same
3167
3168	Returns: > 0 => number of match offset pairs placed in offsets
3169	= 0 => offsets overflowed; longest matches are present
3170	-1 => failed to match
3171	< -1 => some kind of unexpected problem
3172	*/
3173
3174	#if defined COMPILE_PCRE8
3175	PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3176	pcre_dfa_exec(const pcre argument_re, const* pcre_extra *extra_data,
3177	const char subject, int* length, int start_offset, int options, int *offsets,
3178	int offsetcount, int workspace, int* wscount)
3179	#elif defined COMPILE_PCRE16
3180	PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3181	pcre16_dfa_exec(const pcre16 argument_re, const* pcre16_extra *extra_data,
3182	PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3183	int offsetcount, int workspace, int* wscount)
3184	#elif defined COMPILE_PCRE32
3185	PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3186	pcre32_dfa_exec(const pcre32 argument_re, const* pcre32_extra *extra_data,
3187	PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
3188	int offsetcount, int workspace, int* wscount)
3189	#endif
3190	{
3191	REAL_PCRE re = (REAL_PCRE )argument_re;
3192	dfa_match_data match_block;
3193	dfa_match_data *md = &match_block;
3194	BOOL utf, anchored, startline, firstline;
3195	const pcre_uchar current_subject, end_subject;
3196	const pcre_study_data *study = NULL;
3197
3198	const pcre_uchar *req_char_ptr;
3199	const pcre_uint8 *start_bits = NULL;
3200	BOOL has_first_char = FALSE;
3201	BOOL has_req_char = FALSE;
3202	pcre_uchar first_char = `0`;
3203	pcre_uchar first_char2 = `0`;
3204	pcre_uchar req_char = `0`;
3205	pcre_uchar req_char2 = `0`;
3206	int newline;
3207
3208	/ Plausibility checks /
3209
3210	if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != `0`) return PCRE_ERROR_BADOPTION;
3211	if (re == NULL \|\| subject == NULL \|\| workspace == NULL \|\|
3212	(offsets == NULL && offsetcount > `0`)) return PCRE_ERROR_NULL;
3213	if (offsetcount < `0`) return PCRE_ERROR_BADCOUNT;
3214	if (wscount < `20`) return PCRE_ERROR_DFA_WSSIZE;
3215	if (length < `0`) return PCRE_ERROR_BADLENGTH;
3216	if (start_offset < `0` \|\| start_offset > length) return PCRE_ERROR_BADOFFSET;
3217
3218	/ Check that the first field in the block is the magic number. If it is not,*
3219	return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3220	REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3221	means that the pattern is likely compiled with different endianness. /*
3222
3223	if (re->magic_number != MAGIC_NUMBER)
3224	return re->magic_number == REVERSED_MAGIC_NUMBER?
3225	PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3226	if ((re->flags & PCRE_MODE) == `0`) return PCRE_ERROR_BADMODE;
3227
3228	/ If restarting after a partial match, do some sanity checks on the contents*
3229	of the workspace. /*
3230
3231	if ((options & PCRE_DFA_RESTART) != `0`)
3232	{
3233	if ((workspace[`0`] & (-`2`)) != `0` \|\| workspace[`1`] < `1` \|\|
3234	workspace[`1`] > (wscount - `2`)/INTS_PER_STATEBLOCK)
3235	return PCRE_ERROR_DFA_BADRESTART;
3236	}
3237
3238	/ Set up study, callout, and table data /
3239
3240	md->tables = re->tables;
3241	md->callout_data = NULL;
3242
3243	if (extra_data != NULL)
3244	{
3245	unsigned long int flags = extra_data->flags;
3246	if ((flags & PCRE_EXTRA_STUDY_DATA) != `0`)
3247	study = (const pcre_study_data *)extra_data->study_data;
3248	if ((flags & PCRE_EXTRA_MATCH_LIMIT) != `0`) return PCRE_ERROR_DFA_UMLIMIT;
3249	if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != `0`)
3250	return PCRE_ERROR_DFA_UMLIMIT;
3251	if ((flags & PCRE_EXTRA_CALLOUT_DATA) != `0`)
3252	md->callout_data = extra_data->callout_data;
3253	if ((flags & PCRE_EXTRA_TABLES) != `0`)
3254	md->tables = extra_data->tables;
3255	}
3256
3257	/ Set some local values /
3258
3259	current_subject = (const pcre_uchar *)subject + start_offset;
3260	end_subject = (const pcre_uchar *)subject + length;
3261	req_char_ptr = current_subject - `1`;
3262
3263	#ifdef SUPPORT_UTF
3264	/ PCRE_UTF(16\|32) have the same value as PCRE_UTF8. /
3265	utf = (re->options & PCRE_UTF8) != `0`;
3266	#else
3267	utf = FALSE;
3268	#endif
3269
3270	anchored = (options & (PCRE_ANCHORED\|PCRE_DFA_RESTART)) != `0` \|\|
3271	(re->options & PCRE_ANCHORED) != `0`;
3272
3273	/ The remaining fixed data for passing around. /
3274
3275	md->start_code = (const pcre_uchar *)argument_re +
3276	re->name_table_offset + re->name_count * re->name_entry_size;
3277	md->start_subject = (const pcre_uchar *)subject;
3278	md->end_subject = end_subject;
3279	md->start_offset = start_offset;
3280	md->moptions = options;
3281	md->poptions = re->options;
3282
3283	/ If the BSR option is not set at match time, copy what was set*
3284	at compile time. /*
3285
3286	if ((md->moptions & (PCRE_BSR_ANYCRLF\|PCRE_BSR_UNICODE)) == `0`)
3287	{
3288	if ((re->options & (PCRE_BSR_ANYCRLF\|PCRE_BSR_UNICODE)) != `0`)
3289	md->moptions \|= re->options & (PCRE_BSR_ANYCRLF\|PCRE_BSR_UNICODE);
3290	#ifdef BSR_ANYCRLF
3291	else md->moptions \|= PCRE_BSR_ANYCRLF;
3292	#endif
3293	}
3294
3295	/ Handle different types of newline. The three bits give eight cases. If*
3296	nothing is set at run time, whatever was used at compile time applies. /*
3297
3298	switch ((((options & PCRE_NEWLINE_BITS) == `0`)? re->options : (pcre_uint32)options) &
3299	PCRE_NEWLINE_BITS)
3300	{
3301	case `0`: newline = NEWLINE; break; / Compile-time default /
3302	case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3303	case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3304	case PCRE_NEWLINE_CR+
3305	PCRE_NEWLINE_LF: newline = (CHAR_CR << `8`) \| CHAR_NL; break;
3306	case PCRE_NEWLINE_ANY: newline = -`1`; break;
3307	case PCRE_NEWLINE_ANYCRLF: newline = -`2`; break;
3308	default: return PCRE_ERROR_BADNEWLINE;
3309	}
3310
3311	if (newline == -`2`)
3312	{
3313	md->nltype = NLTYPE_ANYCRLF;
3314	}
3315	else if (newline < `0`)
3316	{
3317	md->nltype = NLTYPE_ANY;
3318	}
3319	else
3320	{
3321	md->nltype = NLTYPE_FIXED;
3322	if (newline > `255`)
3323	{
3324	md->nllen = `2`;
3325	md->nl[`0`] = (newline >> `8`) & `255`;
3326	md->nl[`1`] = newline & `255`;
3327	}
3328	else
3329	{
3330	md->nllen = `1`;
3331	md->nl[`0`] = newline;
3332	}
3333	}
3334
3335	/ Check a UTF-8 string if required. Unfortunately there's no way of passing*
3336	back the character offset. /*
3337
3338	#ifdef SUPPORT_UTF
3339	if (utf && (options & PCRE_NO_UTF8_CHECK) == `0`)
3340	{
3341	int erroroffset;
3342	int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3343	if (errorcode != `0`)
3344	{
3345	if (offsetcount >= `2`)
3346	{
3347	offsets[`0`] = erroroffset;
3348	offsets[`1`] = errorcode;
3349	}
3350	#if defined COMPILE_PCRE8
3351	return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != `0`) ?
3352	PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3353	#elif defined COMPILE_PCRE16
3354	return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != `0`) ?
3355	PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
3356	#elif defined COMPILE_PCRE32
3357	return PCRE_ERROR_BADUTF32;
3358	#endif
3359	}
3360	#if defined COMPILE_PCRE8 \|\| defined COMPILE_PCRE16
3361	if (start_offset > `0` && start_offset < length &&
3362	NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3363	return PCRE_ERROR_BADUTF8_OFFSET;
3364	#endif
3365	}
3366	#endif
3367
3368	/ If the exec call supplied NULL for tables, use the inbuilt ones. This*
3369	is a feature that makes it possible to save compiled regex and re-use them
3370	in other programs later. /*
3371
3372	if (md->tables == NULL) md->tables = PRIV(default_tables);
3373
3374	/ The "must be at the start of a line" flags are used in a loop when finding*
3375	where to start. /*
3376
3377	startline = (re->flags & PCRE_STARTLINE) != `0`;
3378	firstline = (re->options & PCRE_FIRSTLINE) != `0`;
3379
3380	/ Set up the first character to match, if available. The first_byte value is*
3381	never set for an anchored regular expression, but the anchoring may be forced
3382	at run time, so we have to test for anchoring. The first char may be unset for
3383	an unanchored pattern, of course. If there's no first char and the pattern was
3384	studied, there may be a bitmap of possible first characters. /*
3385
3386	if (!anchored)
3387	{
3388	if ((re->flags & PCRE_FIRSTSET) != `0`)
3389	{
3390	has_first_char = TRUE;
3391	first_char = first_char2 = (pcre_uchar)(re->first_char);
3392	if ((re->flags & PCRE_FCH_CASELESS) != `0`)
3393	{
3394	first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3395	#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3396	if (utf && first_char > `127`)
3397	first_char2 = UCD_OTHERCASE(first_char);
3398	#endif
3399	}
3400	}
3401	else
3402	{
3403	if (!startline && study != NULL &&
3404	(study->flags & PCRE_STUDY_MAPPED) != `0`)
3405	start_bits = study->start_bits;
3406	}
3407	}
3408
3409	/ For anchored or unanchored matches, there may be a "last known required*
3410	character" set. /*
3411
3412	if ((re->flags & PCRE_REQCHSET) != `0`)
3413	{
3414	has_req_char = TRUE;
3415	req_char = req_char2 = (pcre_uchar)(re->req_char);
3416	if ((re->flags & PCRE_RCH_CASELESS) != `0`)
3417	{
3418	req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3419	#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3420	if (utf && req_char > `127`)
3421	req_char2 = UCD_OTHERCASE(req_char);
3422	#endif
3423	}
3424	}
3425
3426	/ Call the main matching function, looping for a non-anchored regex after a*
3427	failed match. If not restarting, perform certain optimizations at the start of
3428	a match. /*
3429
3430	for (;;)
3431	{
3432	int rc;
3433
3434	if ((options & PCRE_DFA_RESTART) == `0`)
3435	{
3436	const pcre_uchar *save_end_subject = end_subject;
3437
3438	/ If firstline is TRUE, the start of the match is constrained to the first*
3439	line of a multiline string. Implement this by temporarily adjusting
3440	end_subject so that we stop scanning at a newline. If the match fails at
3441	the newline, later code breaks this loop. /*
3442
3443	if (firstline)
3444	{
3445	PCRE_PUCHAR t = current_subject;
3446	#ifdef SUPPORT_UTF
3447	if (utf)
3448	{
3449	while (t < md->end_subject && !IS_NEWLINE(t))
3450	{
3451	t++;
3452	ACROSSCHAR(t < end_subject, *t, t++);
3453	}
3454	}
3455	else
3456	#endif
3457	while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3458	end_subject = t;
3459	}
3460
3461	/ There are some optimizations that avoid running the match if a known*
3462	starting point is not found. However, there is an option that disables
3463	these, for testing and for ensuring that all callouts do actually occur.
3464	The option can be set in the regex by (NO_START_OPT) or passed in*
3465	match-time options. /*
3466
3467	if (((options \| re->options) & PCRE_NO_START_OPTIMIZE) == `0`)
3468	{
3469	/ Advance to a known first pcre_uchar (i.e. data item) /
3470
3471	if (has_first_char)
3472	{
3473	if (first_char != first_char2)
3474	{
3475	pcre_uchar csc;
3476	while (current_subject < end_subject &&
3477	(csc = UCHAR21TEST(current_subject)) != first_char && csc != first_char2)
3478	current_subject++;
3479	}
3480	else
3481	while (current_subject < end_subject &&
3482	UCHAR21TEST(current_subject) != first_char)
3483	current_subject++;
3484	}
3485
3486	/ Or to just after a linebreak for a multiline match if possible /
3487
3488	else if (startline)
3489	{
3490	if (current_subject > md->start_subject + start_offset)
3491	{
3492	#ifdef SUPPORT_UTF
3493	if (utf)
3494	{
3495	while (current_subject < end_subject &&
3496	!WAS_NEWLINE(current_subject))
3497	{
3498	current_subject++;
3499	ACROSSCHAR(current_subject < end_subject, *current_subject,
3500	current_subject++);
3501	}
3502	}
3503	else
3504	#endif
3505	while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3506	current_subject++;
3507
3508	/ If we have just passed a CR and the newline option is ANY or*
3509	ANYCRLF, and we are now at a LF, advance the match position by one
3510	more character. /*
3511
3512	if (UCHAR21TEST(current_subject - `1`) == CHAR_CR &&
3513	(md->nltype == NLTYPE_ANY \|\| md->nltype == NLTYPE_ANYCRLF) &&
3514	current_subject < end_subject &&
3515	UCHAR21TEST(current_subject) == CHAR_NL)
3516	current_subject++;
3517	}
3518	}
3519
3520	/ Advance to a non-unique first pcre_uchar after study /
3521
3522	else if (start_bits != NULL)
3523	{
3524	while (current_subject < end_subject)
3525	{
3526	register pcre_uint32 c = UCHAR21TEST(current_subject);
3527	#ifndef COMPILE_PCRE8
3528	if (c > `255`) c = `255`;
3529	#endif
3530	if ((start_bits[c/`8`] & (`1` << (c&`7`))) != `0`) break;
3531	current_subject++;
3532	}
3533	}
3534	}
3535
3536	/ Restore fudged end_subject /
3537
3538	end_subject = save_end_subject;
3539
3540	/ The following two optimizations are disabled for partial matching or if*
3541	disabling is explicitly requested (and of course, by the test above, this
3542	code is not obeyed when restarting after a partial match). /*
3543
3544	if (((options \| re->options) & PCRE_NO_START_OPTIMIZE) == `0` &&
3545	(options & (PCRE_PARTIAL_HARD\|PCRE_PARTIAL_SOFT)) == `0`)
3546	{
3547	/ If the pattern was studied, a minimum subject length may be set. This*
3548	is a lower bound; no actual string of that length may actually match the
3549	pattern. Although the value is, strictly, in characters, we treat it as
3550	in pcre_uchar units to avoid spending too much time in this optimization.
3551	*/
3552
3553	if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != `0` &&
3554	(pcre_uint32)(end_subject - current_subject) < study->minlength)
3555	return PCRE_ERROR_NOMATCH;
3556
3557	/ If req_char is set, we know that that pcre_uchar must appear in the*
3558	subject for the match to succeed. If the first pcre_uchar is set,
3559	req_char must be later in the subject; otherwise the test starts at the
3560	match point. This optimization can save a huge amount of work in patterns
3561	with nested unlimited repeats that aren't going to match. Writing
3562	separate code for cased/caseless versions makes it go faster, as does
3563	using an autoincrement and backing off on a match.
3564
3565	HOWEVER: when the subject string is very, very long, searching to its end
3566	can take a long time, and give bad performance on quite ordinary
3567	patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3568	string... so we don't do this when the string is sufficiently long. /*
3569
3570	if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3571	{
3572	register PCRE_PUCHAR p = current_subject + (has_first_char? `1`:`0`);
3573
3574	/ We don't need to repeat the search if we haven't yet reached the*
3575	place we found it at last time. /*
3576
3577	if (p > req_char_ptr)
3578	{
3579	if (req_char != req_char2)
3580	{
3581	while (p < end_subject)
3582	{
3583	register pcre_uint32 pp = UCHAR21INCTEST(p);
3584	if (pp == req_char \|\| pp == req_char2) { p--; break; }
3585	}
3586	}
3587	else
3588	{
3589	while (p < end_subject)
3590	{
3591	if (UCHAR21INCTEST(p) == req_char) { p--; break; }
3592	}
3593	}
3594
3595	/ If we can't find the required pcre_uchar, break the matching loop,*
3596	which will cause a return or PCRE_ERROR_NOMATCH. /*
3597
3598	if (p >= end_subject) break;
3599
3600	/ If we have found the required pcre_uchar, save the point where we*
3601	found it, so that we don't search again next time round the loop if
3602	the start hasn't passed this point yet. /*
3603
3604	req_char_ptr = p;
3605	}
3606	}
3607	}
3608	} / End of optimizations that are done when not restarting /
3609
3610	/ OK, now we can do the business /
3611
3612	md->start_used_ptr = current_subject;
3613	md->recursive = NULL;
3614
3615	rc = internal_dfa_exec(
3616	md, / fixed match data /
3617	md->start_code, / this subexpression's code /
3618	current_subject, / where we currently are /
3619	start_offset, / start offset in subject /
3620	offsets, / offset vector /
3621	offsetcount, / size of same /
3622	workspace, / workspace vector /
3623	wscount, / size of same /
3624	`0`); / function recurse level /
3625
3626	/ Anything other than "no match" means we are done, always; otherwise, carry*
3627	on only if not anchored. /*
3628
3629	if (rc != PCRE_ERROR_NOMATCH \|\| anchored)
3630	{
3631	if (rc == PCRE_ERROR_PARTIAL && offsetcount >= `2`)
3632	{
3633	offsets[`0`] = (int)(md->start_used_ptr - (PCRE_PUCHAR)subject);
3634	offsets[`1`] = (int)(end_subject - (PCRE_PUCHAR)subject);
3635	if (offsetcount > `2`)
3636	offsets[`2`] = (int)(current_subject - (PCRE_PUCHAR)subject);
3637	}
3638	return rc;
3639	}
3640
3641	/ Advance to the next subject character unless we are at the end of a line*
3642	and firstline is set. /*
3643
3644	if (firstline && IS_NEWLINE(current_subject)) break;
3645	current_subject++;
3646	#ifdef SUPPORT_UTF
3647	if (utf)
3648	{
3649	ACROSSCHAR(current_subject < end_subject, *current_subject,
3650	current_subject++);
3651	}
3652	#endif
3653	if (current_subject > end_subject) break;
3654
3655	/ If we have just passed a CR and we are now at a LF, and the pattern does*
3656	not contain any explicit matches for \r or \n, and the newline option is CRLF
3657	or ANY or ANYCRLF, advance the match position by one more character. /*
3658
3659	if (UCHAR21TEST(current_subject - `1`) == CHAR_CR &&
3660	current_subject < end_subject &&
3661	UCHAR21TEST(current_subject) == CHAR_NL &&
3662	(re->flags & PCRE_HASCRORLF) == `0` &&
3663	(md->nltype == NLTYPE_ANY \|\|
3664	md->nltype == NLTYPE_ANYCRLF \|\|
3665	md->nllen == `2`))
3666	current_subject++;
3667
3668	} / "Bumpalong" loop /
3669
3670	return PCRE_ERROR_NOMATCH;
3671	}
3672
/* End of pcre_dfa_exec.c */
3674

Browse the source code of ClickHouse/contrib/poco/Foundation/src/pcre_dfa_exec.c