1/*************************************************
2* Perl-Compatible Regular Expressions *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
10
11-----------------------------------------------------------------------------
12Redistribution and use in source and binary forms, with or without
13modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36POSSIBILITY OF SUCH DAMAGE.
37-----------------------------------------------------------------------------
38*/
39
40#pragma warning( disable : 4244) // conversion from 'int' to 'unsigned short', possible loss of data
41
42/* This module contains the external function pcre_study(), along with local
43supporting functions. */
44
45#include "pcre_config.h"
46#include "pcre_internal.h"
47
48#define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
49
50/* Returns from set_start_bits() */
51
52enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
53
54
55
56/*************************************************
57* Find the minimum subject length for a group *
58*************************************************/
59
60/* Scan a parenthesized group and compute the minimum length of subject that
61is needed to match it. This is a lower bound; it does not mean there is a
62string of that length that matches. In UTF8 mode, the result is in characters
63rather than bytes.
64
65Arguments:
66 re compiled pattern block
67 code pointer to start of group (the bracket)
68 startcode pointer to start of the whole pattern's code
69 options the compiling options
70 recurses chain of recurse_check to catch mutual recursion
71 countptr pointer to call count (to catch over complexity)
72
73Returns: the minimum length
74 -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
75 -2 internal error (missing capturing bracket)
76 -3 internal error (opcode not listed)
77*/
78
79static int
80find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
81 const pcre_uchar *startcode, int options, recurse_check *recurses,
82 int *countptr)
83{
84int length = -1;
85/* PCRE_UTF16 has the same value as PCRE_UTF8. */
86BOOL utf = (options & PCRE_UTF8) != 0;
87BOOL had_recurse = FALSE;
88recurse_check this_recurse;
89register int branchlength = 0;
90register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
91
92if ((*countptr)++ > 1000) return -1; /* too complex */
93
94if (*code == OP_CBRA || *code == OP_SCBRA ||
95 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
96
97/* Scan along the opcodes for this branch. If we get to the end of the
98branch, check the length against that of the other branches. */
99
100for (;;)
101 {
102 int d, min;
103 pcre_uchar *cs, *ce;
104 register pcre_uchar op = *cc;
105
106 switch (op)
107 {
108 case OP_COND:
109 case OP_SCOND:
110
111 /* If there is only one branch in a condition, the implied branch has zero
112 length, so we don't add anything. This covers the DEFINE "condition"
113 automatically. */
114
115 cs = cc + GET(cc, 1);
116 if (*cs != OP_ALT)
117 {
118 cc = cs + 1 + LINK_SIZE;
119 break;
120 }
121
122 /* Otherwise we can fall through and treat it the same as any other
123 subpattern. */
124
125 case OP_CBRA:
126 case OP_SCBRA:
127 case OP_BRA:
128 case OP_SBRA:
129 case OP_CBRAPOS:
130 case OP_SCBRAPOS:
131 case OP_BRAPOS:
132 case OP_SBRAPOS:
133 case OP_ONCE:
134 case OP_ONCE_NC:
135 d = find_minlength(re, cc, startcode, options, recurses, countptr);
136 if (d < 0) return d;
137 branchlength += d;
138 do cc += GET(cc, 1); while (*cc == OP_ALT);
139 cc += 1 + LINK_SIZE;
140 break;
141
142 /* ACCEPT makes things far too complicated; we have to give up. */
143
144 case OP_ACCEPT:
145 case OP_ASSERT_ACCEPT:
146 return -1;
147
148 /* Reached end of a branch; if it's a ket it is the end of a nested
149 call. If it's ALT it is an alternation in a nested call. If it is END it's
150 the end of the outer call. All can be handled by the same code. If an
151 ACCEPT was previously encountered, use the length that was in force at that
152 time, and pass back the shortest ACCEPT length. */
153
154 case OP_ALT:
155 case OP_KET:
156 case OP_KETRMAX:
157 case OP_KETRMIN:
158 case OP_KETRPOS:
159 case OP_END:
160 if (length < 0 || (!had_recurse && branchlength < length))
161 length = branchlength;
162 if (op != OP_ALT) return length;
163 cc += 1 + LINK_SIZE;
164 branchlength = 0;
165 had_recurse = FALSE;
166 break;
167
168 /* Skip over assertive subpatterns */
169
170 case OP_ASSERT:
171 case OP_ASSERT_NOT:
172 case OP_ASSERTBACK:
173 case OP_ASSERTBACK_NOT:
174 do cc += GET(cc, 1); while (*cc == OP_ALT);
175 /* Fall through */
176
177 /* Skip over things that don't match chars */
178
179 case OP_REVERSE:
180 case OP_CREF:
181 case OP_DNCREF:
182 case OP_RREF:
183 case OP_DNRREF:
184 case OP_DEF:
185 case OP_CALLOUT:
186 case OP_SOD:
187 case OP_SOM:
188 case OP_EOD:
189 case OP_EODN:
190 case OP_CIRC:
191 case OP_CIRCM:
192 case OP_DOLL:
193 case OP_DOLLM:
194 case OP_NOT_WORD_BOUNDARY:
195 case OP_WORD_BOUNDARY:
196 cc += PRIV(OP_lengths)[*cc];
197 break;
198
199 /* Skip over a subpattern that has a {0} or {0,x} quantifier */
200
201 case OP_BRAZERO:
202 case OP_BRAMINZERO:
203 case OP_BRAPOSZERO:
204 case OP_SKIPZERO:
205 cc += PRIV(OP_lengths)[*cc];
206 do cc += GET(cc, 1); while (*cc == OP_ALT);
207 cc += 1 + LINK_SIZE;
208 break;
209
210 /* Handle literal characters and + repetitions */
211
212 case OP_CHAR:
213 case OP_CHARI:
214 case OP_NOT:
215 case OP_NOTI:
216 case OP_PLUS:
217 case OP_PLUSI:
218 case OP_MINPLUS:
219 case OP_MINPLUSI:
220 case OP_POSPLUS:
221 case OP_POSPLUSI:
222 case OP_NOTPLUS:
223 case OP_NOTPLUSI:
224 case OP_NOTMINPLUS:
225 case OP_NOTMINPLUSI:
226 case OP_NOTPOSPLUS:
227 case OP_NOTPOSPLUSI:
228 branchlength++;
229 cc += 2;
230#ifdef SUPPORT_UTF
231 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
232#endif
233 break;
234
235 case OP_TYPEPLUS:
236 case OP_TYPEMINPLUS:
237 case OP_TYPEPOSPLUS:
238 branchlength++;
239 cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
240 break;
241
242 /* Handle exact repetitions. The count is already in characters, but we
243 need to skip over a multibyte character in UTF8 mode. */
244
245 case OP_EXACT:
246 case OP_EXACTI:
247 case OP_NOTEXACT:
248 case OP_NOTEXACTI:
249 branchlength += GET2(cc,1);
250 cc += 2 + IMM2_SIZE;
251#ifdef SUPPORT_UTF
252 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
253#endif
254 break;
255
256 case OP_TYPEEXACT:
257 branchlength += GET2(cc,1);
258 cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
259 || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
260 break;
261
262 /* Handle single-char non-literal matchers */
263
264 case OP_PROP:
265 case OP_NOTPROP:
266 cc += 2;
267 /* Fall through */
268
269 case OP_NOT_DIGIT:
270 case OP_DIGIT:
271 case OP_NOT_WHITESPACE:
272 case OP_WHITESPACE:
273 case OP_NOT_WORDCHAR:
274 case OP_WORDCHAR:
275 case OP_ANY:
276 case OP_ALLANY:
277 case OP_EXTUNI:
278 case OP_HSPACE:
279 case OP_NOT_HSPACE:
280 case OP_VSPACE:
281 case OP_NOT_VSPACE:
282 branchlength++;
283 cc++;
284 break;
285
286 /* "Any newline" might match two characters, but it also might match just
287 one. */
288
289 case OP_ANYNL:
290 branchlength += 1;
291 cc++;
292 break;
293
294 /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
295 non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
296 appear, but leave the code, just in case.) */
297
298 case OP_ANYBYTE:
299#ifdef SUPPORT_UTF
300 if (utf) return -1;
301#endif
302 branchlength++;
303 cc++;
304 break;
305
306 /* For repeated character types, we have to test for \p and \P, which have
307 an extra two bytes of parameters. */
308
309 case OP_TYPESTAR:
310 case OP_TYPEMINSTAR:
311 case OP_TYPEQUERY:
312 case OP_TYPEMINQUERY:
313 case OP_TYPEPOSSTAR:
314 case OP_TYPEPOSQUERY:
315 if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
316 cc += PRIV(OP_lengths)[op];
317 break;
318
319 case OP_TYPEUPTO:
320 case OP_TYPEMINUPTO:
321 case OP_TYPEPOSUPTO:
322 if (cc[1 + IMM2_SIZE] == OP_PROP
323 || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
324 cc += PRIV(OP_lengths)[op];
325 break;
326
327 /* Check a class for variable quantification */
328
329 case OP_CLASS:
330 case OP_NCLASS:
331#if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
332 case OP_XCLASS:
333 /* The original code caused an unsigned overflow in 64 bit systems,
334 so now we use a conditional statement. */
335 if (op == OP_XCLASS)
336 cc += GET(cc, 1);
337 else
338 cc += PRIV(OP_lengths)[OP_CLASS];
339#else
340 cc += PRIV(OP_lengths)[OP_CLASS];
341#endif
342
343 switch (*cc)
344 {
345 case OP_CRPLUS:
346 case OP_CRMINPLUS:
347 case OP_CRPOSPLUS:
348 branchlength++;
349 /* Fall through */
350
351 case OP_CRSTAR:
352 case OP_CRMINSTAR:
353 case OP_CRQUERY:
354 case OP_CRMINQUERY:
355 case OP_CRPOSSTAR:
356 case OP_CRPOSQUERY:
357 cc++;
358 break;
359
360 case OP_CRRANGE:
361 case OP_CRMINRANGE:
362 case OP_CRPOSRANGE:
363 branchlength += GET2(cc,1);
364 cc += 1 + 2 * IMM2_SIZE;
365 break;
366
367 default:
368 branchlength++;
369 break;
370 }
371 break;
372
373 /* Backreferences and subroutine calls are treated in the same way: we find
374 the minimum length for the subpattern. A recursion, however, causes an
375 a flag to be set that causes the length of this branch to be ignored. The
376 logic is that a recursion can only make sense if there is another
377 alternation that stops the recursing. That will provide the minimum length
378 (when no recursion happens). A backreference within the group that it is
379 referencing behaves in the same way.
380
381 If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
382 matches an empty string (by default it causes a matching failure), so in
383 that case we must set the minimum length to zero. */
384
385 case OP_DNREF: /* Duplicate named pattern back reference */
386 case OP_DNREFI:
387 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
388 {
389 int count = GET2(cc, 1+IMM2_SIZE);
390 pcre_uchar *slot = (pcre_uchar *)re +
391 re->name_table_offset + GET2(cc, 1) * re->name_entry_size;
392 d = INT_MAX;
393 while (count-- > 0)
394 {
395 ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
396 if (cs == NULL) return -2;
397 do ce += GET(ce, 1); while (*ce == OP_ALT);
398 if (cc > cs && cc < ce) /* Simple recursion */
399 {
400 d = 0;
401 had_recurse = TRUE;
402 break;
403 }
404 else
405 {
406 recurse_check *r = recurses;
407 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
408 if (r != NULL) /* Mutual recursion */
409 {
410 d = 0;
411 had_recurse = TRUE;
412 break;
413 }
414 else
415 {
416 int dd;
417 this_recurse.prev = recurses;
418 this_recurse.group = cs;
419 dd = find_minlength(re, cs, startcode, options, &this_recurse,
420 countptr);
421 if (dd < d) d = dd;
422 }
423 }
424 slot += re->name_entry_size;
425 }
426 }
427 else d = 0;
428 cc += 1 + 2*IMM2_SIZE;
429 goto REPEAT_BACK_REFERENCE;
430
431 case OP_REF: /* Single back reference */
432 case OP_REFI:
433 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
434 {
435 ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
436 if (cs == NULL) return -2;
437 do ce += GET(ce, 1); while (*ce == OP_ALT);
438 if (cc > cs && cc < ce) /* Simple recursion */
439 {
440 d = 0;
441 had_recurse = TRUE;
442 }
443 else
444 {
445 recurse_check *r = recurses;
446 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
447 if (r != NULL) /* Mutual recursion */
448 {
449 d = 0;
450 had_recurse = TRUE;
451 }
452 else
453 {
454 this_recurse.prev = recurses;
455 this_recurse.group = cs;
456 d = find_minlength(re, cs, startcode, options, &this_recurse,
457 countptr);
458 }
459 }
460 }
461 else d = 0;
462 cc += 1 + IMM2_SIZE;
463
464 /* Handle repeated back references */
465
466 REPEAT_BACK_REFERENCE:
467 switch (*cc)
468 {
469 case OP_CRSTAR:
470 case OP_CRMINSTAR:
471 case OP_CRQUERY:
472 case OP_CRMINQUERY:
473 case OP_CRPOSSTAR:
474 case OP_CRPOSQUERY:
475 min = 0;
476 cc++;
477 break;
478
479 case OP_CRPLUS:
480 case OP_CRMINPLUS:
481 case OP_CRPOSPLUS:
482 min = 1;
483 cc++;
484 break;
485
486 case OP_CRRANGE:
487 case OP_CRMINRANGE:
488 case OP_CRPOSRANGE:
489 min = GET2(cc, 1);
490 cc += 1 + 2 * IMM2_SIZE;
491 break;
492
493 default:
494 min = 1;
495 break;
496 }
497
498 branchlength += min * d;
499 break;
500
501 /* We can easily detect direct recursion, but not mutual recursion. This is
502 caught by a recursion depth count. */
503
504 case OP_RECURSE:
505 cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
506 do ce += GET(ce, 1); while (*ce == OP_ALT);
507 if (cc > cs && cc < ce) /* Simple recursion */
508 had_recurse = TRUE;
509 else
510 {
511 recurse_check *r = recurses;
512 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
513 if (r != NULL) /* Mutual recursion */
514 had_recurse = TRUE;
515 else
516 {
517 this_recurse.prev = recurses;
518 this_recurse.group = cs;
519 branchlength += find_minlength(re, cs, startcode, options,
520 &this_recurse, countptr);
521 }
522 }
523 cc += 1 + LINK_SIZE;
524 break;
525
526 /* Anything else does not or need not match a character. We can get the
527 item's length from the table, but for those that can match zero occurrences
528 of a character, we must take special action for UTF-8 characters. As it
529 happens, the "NOT" versions of these opcodes are used at present only for
530 ASCII characters, so they could be omitted from this list. However, in
531 future that may change, so we include them here so as not to leave a
532 gotcha for a future maintainer. */
533
534 case OP_UPTO:
535 case OP_UPTOI:
536 case OP_NOTUPTO:
537 case OP_NOTUPTOI:
538 case OP_MINUPTO:
539 case OP_MINUPTOI:
540 case OP_NOTMINUPTO:
541 case OP_NOTMINUPTOI:
542 case OP_POSUPTO:
543 case OP_POSUPTOI:
544 case OP_NOTPOSUPTO:
545 case OP_NOTPOSUPTOI:
546
547 case OP_STAR:
548 case OP_STARI:
549 case OP_NOTSTAR:
550 case OP_NOTSTARI:
551 case OP_MINSTAR:
552 case OP_MINSTARI:
553 case OP_NOTMINSTAR:
554 case OP_NOTMINSTARI:
555 case OP_POSSTAR:
556 case OP_POSSTARI:
557 case OP_NOTPOSSTAR:
558 case OP_NOTPOSSTARI:
559
560 case OP_QUERY:
561 case OP_QUERYI:
562 case OP_NOTQUERY:
563 case OP_NOTQUERYI:
564 case OP_MINQUERY:
565 case OP_MINQUERYI:
566 case OP_NOTMINQUERY:
567 case OP_NOTMINQUERYI:
568 case OP_POSQUERY:
569 case OP_POSQUERYI:
570 case OP_NOTPOSQUERY:
571 case OP_NOTPOSQUERYI:
572
573 cc += PRIV(OP_lengths)[op];
574#ifdef SUPPORT_UTF
575 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
576#endif
577 break;
578
579 /* Skip these, but we need to add in the name length. */
580
581 case OP_MARK:
582 case OP_PRUNE_ARG:
583 case OP_SKIP_ARG:
584 case OP_THEN_ARG:
585 cc += PRIV(OP_lengths)[op] + cc[1];
586 break;
587
588 /* The remaining opcodes are just skipped over. */
589
590 case OP_CLOSE:
591 case OP_COMMIT:
592 case OP_FAIL:
593 case OP_PRUNE:
594 case OP_SET_SOM:
595 case OP_SKIP:
596 case OP_THEN:
597 cc += PRIV(OP_lengths)[op];
598 break;
599
600 /* This should not occur: we list all opcodes explicitly so that when
601 new ones get added they are properly considered. */
602
603 default:
604 return -3;
605 }
606 }
607/* Control never gets here */
608}
609
610
611
612/*************************************************
613* Set a bit and maybe its alternate case *
614*************************************************/
615
616/* Given a character, set its first byte's bit in the table, and also the
617corresponding bit for the other version of a letter if we are caseless. In
618UTF-8 mode, for characters greater than 127, we can only do the caseless thing
619when Unicode property support is available.
620
621Arguments:
622 start_bits points to the bit map
623 p points to the character
624 caseless the caseless flag
625 cd the block with char table pointers
626 utf TRUE for UTF-8 / UTF-16 / UTF-32 mode
627
628Returns: pointer after the character
629*/
630
631static const pcre_uchar *
632set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
633 compile_data *cd, BOOL utf)
634{
635pcre_uint32 c = *p;
636
637#ifdef COMPILE_PCRE8
638SET_BIT(c);
639
640#ifdef SUPPORT_UTF
641if (utf && c > 127)
642 {
643 GETCHARINC(c, p);
644#ifdef SUPPORT_UCP
645 if (caseless)
646 {
647 pcre_uchar buff[6];
648 c = UCD_OTHERCASE(c);
649 (void)PRIV(ord2utf)(c, buff);
650 SET_BIT(buff[0]);
651 }
652#endif /* Not SUPPORT_UCP */
653 return p;
654 }
655#else /* Not SUPPORT_UTF */
656(void)(utf); /* Stops warning for unused parameter */
657#endif /* SUPPORT_UTF */
658
659/* Not UTF-8 mode, or character is less than 127. */
660
661if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
662return p + 1;
663#endif /* COMPILE_PCRE8 */
664
665#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
666if (c > 0xff)
667 {
668 c = 0xff;
669 caseless = FALSE;
670 }
671SET_BIT(c);
672
673#ifdef SUPPORT_UTF
674if (utf && c > 127)
675 {
676 GETCHARINC(c, p);
677#ifdef SUPPORT_UCP
678 if (caseless)
679 {
680 c = UCD_OTHERCASE(c);
681 if (c > 0xff)
682 c = 0xff;
683 SET_BIT(c);
684 }
685#endif /* SUPPORT_UCP */
686 return p;
687 }
688#else /* Not SUPPORT_UTF */
689(void)(utf); /* Stops warning for unused parameter */
690#endif /* SUPPORT_UTF */
691
692if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
693return p + 1;
694#endif
695}
696
697
698
699/*************************************************
700* Set bits for a positive character type *
701*************************************************/
702
703/* This function sets starting bits for a character type. In UTF-8 mode, we can
704only do a direct setting for bytes less than 128, as otherwise there can be
705confusion with bytes in the middle of UTF-8 characters. In a "traditional"
706environment, the tables will only recognize ASCII characters anyway, but in at
707least one Windows environment, some higher bytes bits were set in the tables.
708So we deal with that case by considering the UTF-8 encoding.
709
710Arguments:
711 start_bits the starting bitmap
712 cbit type the type of character wanted
713 table_limit 32 for non-UTF-8; 16 for UTF-8
714 cd the block with char table pointers
715
716Returns: nothing
717*/
718
719static void
720set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
721 compile_data *cd)
722{
723register pcre_uint32 c;
724for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
725#if defined SUPPORT_UTF && defined COMPILE_PCRE8
726if (table_limit == 32) return;
727for (c = 128; c < 256; c++)
728 {
729 if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
730 {
731 pcre_uchar buff[6];
732 (void)PRIV(ord2utf)(c, buff);
733 SET_BIT(buff[0]);
734 }
735 }
736#endif
737}
738
739
740/*************************************************
741* Set bits for a negative character type *
742*************************************************/
743
744/* This function sets starting bits for a negative character type such as \D.
745In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
746otherwise there can be confusion with bytes in the middle of UTF-8 characters.
747Unlike in the positive case, where we can set appropriate starting bits for
748specific high-valued UTF-8 characters, in this case we have to set the bits for
749all high-valued characters. The lowest is 0xc2, but we overkill by starting at
7500xc0 (192) for simplicity.
751
752Arguments:
753 start_bits the starting bitmap
754 cbit type the type of character wanted
755 table_limit 32 for non-UTF-8; 16 for UTF-8
756 cd the block with char table pointers
757
758Returns: nothing
759*/
760
761static void
762set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
763 compile_data *cd)
764{
765register pcre_uint32 c;
766for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
767#if defined SUPPORT_UTF && defined COMPILE_PCRE8
768if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
769#endif
770}
771
772
773
774/*************************************************
775* Create bitmap of starting bytes *
776*************************************************/
777
778/* This function scans a compiled unanchored expression recursively and
779attempts to build a bitmap of the set of possible starting bytes. As time goes
780by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
781useful for parenthesized groups in patterns such as (a*)b where the group
782provides some optional starting bytes but scanning must continue at the outer
783level to find at least one mandatory byte. At the outermost level, this
784function fails unless the result is SSB_DONE.
785
786Arguments:
787 code points to an expression
788 start_bits points to a 32-byte table, initialized to 0
789 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
790 cd the block with char table pointers
791
792Returns: SSB_FAIL => Failed to find any starting bytes
793 SSB_DONE => Found mandatory starting bytes
794 SSB_CONTINUE => Found optional starting bytes
795 SSB_UNKNOWN => Hit an unrecognized opcode
796*/
797
798static int
799set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
800 compile_data *cd)
801{
802register pcre_uint32 c;
803int yield = SSB_DONE;
804#if defined SUPPORT_UTF && defined COMPILE_PCRE8
805int table_limit = utf? 16:32;
806#else
807int table_limit = 32;
808#endif
809
810#if 0
811/* ========================================================================= */
812/* The following comment and code was inserted in January 1999. In May 2006,
813when it was observed to cause compiler warnings about unused values, I took it
814out again. If anybody is still using OS/2, they will have to put it back
815manually. */
816
817/* This next statement and the later reference to dummy are here in order to
818trick the optimizer of the IBM C compiler for OS/2 into generating correct
819code. Apparently IBM isn't going to fix the problem, and we would rather not
820disable optimization (in this module it actually makes a big difference, and
821the pcre module can use all the optimization it can get). */
822
823volatile int dummy;
824/* ========================================================================= */
825#endif
826
827do
828 {
829 BOOL try_next = TRUE;
830 const pcre_uchar *tcode = code + 1 + LINK_SIZE;
831
832 if (*code == OP_CBRA || *code == OP_SCBRA ||
833 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
834
835 while (try_next) /* Loop for items in this branch */
836 {
837 int rc;
838
839 switch(*tcode)
840 {
841 /* If we reach something we don't understand, it means a new opcode has
842 been created that hasn't been added to this code. Hopefully this problem
843 will be discovered during testing. */
844
845 default:
846 return SSB_UNKNOWN;
847
848 /* Fail for a valid opcode that implies no starting bits. */
849
850 case OP_ACCEPT:
851 case OP_ASSERT_ACCEPT:
852 case OP_ALLANY:
853 case OP_ANY:
854 case OP_ANYBYTE:
855 case OP_CIRC:
856 case OP_CIRCM:
857 case OP_CLOSE:
858 case OP_COMMIT:
859 case OP_COND:
860 case OP_CREF:
861 case OP_DEF:
862 case OP_DNCREF:
863 case OP_DNREF:
864 case OP_DNREFI:
865 case OP_DNRREF:
866 case OP_DOLL:
867 case OP_DOLLM:
868 case OP_END:
869 case OP_EOD:
870 case OP_EODN:
871 case OP_EXTUNI:
872 case OP_FAIL:
873 case OP_MARK:
874 case OP_NOT:
875 case OP_NOTEXACT:
876 case OP_NOTEXACTI:
877 case OP_NOTI:
878 case OP_NOTMINPLUS:
879 case OP_NOTMINPLUSI:
880 case OP_NOTMINQUERY:
881 case OP_NOTMINQUERYI:
882 case OP_NOTMINSTAR:
883 case OP_NOTMINSTARI:
884 case OP_NOTMINUPTO:
885 case OP_NOTMINUPTOI:
886 case OP_NOTPLUS:
887 case OP_NOTPLUSI:
888 case OP_NOTPOSPLUS:
889 case OP_NOTPOSPLUSI:
890 case OP_NOTPOSQUERY:
891 case OP_NOTPOSQUERYI:
892 case OP_NOTPOSSTAR:
893 case OP_NOTPOSSTARI:
894 case OP_NOTPOSUPTO:
895 case OP_NOTPOSUPTOI:
896 case OP_NOTPROP:
897 case OP_NOTQUERY:
898 case OP_NOTQUERYI:
899 case OP_NOTSTAR:
900 case OP_NOTSTARI:
901 case OP_NOTUPTO:
902 case OP_NOTUPTOI:
903 case OP_NOT_HSPACE:
904 case OP_NOT_VSPACE:
905 case OP_PRUNE:
906 case OP_PRUNE_ARG:
907 case OP_RECURSE:
908 case OP_REF:
909 case OP_REFI:
910 case OP_REVERSE:
911 case OP_RREF:
912 case OP_SCOND:
913 case OP_SET_SOM:
914 case OP_SKIP:
915 case OP_SKIP_ARG:
916 case OP_SOD:
917 case OP_SOM:
918 case OP_THEN:
919 case OP_THEN_ARG:
920 return SSB_FAIL;
921
922 /* A "real" property test implies no starting bits, but the fake property
923 PT_CLIST identifies a list of characters. These lists are short, as they
924 are used for characters with more than one "other case", so there is no
925 point in recognizing them for OP_NOTPROP. */
926
927 case OP_PROP:
928 if (tcode[1] != PT_CLIST) return SSB_FAIL;
929 {
930 const pcre_uint32 *p = PRIV(ucd_caseless_sets) + tcode[2];
931 while ((c = *p++) < NOTACHAR)
932 {
933#if defined SUPPORT_UTF && defined COMPILE_PCRE8
934 if (utf)
935 {
936 pcre_uchar buff[6];
937 (void)PRIV(ord2utf)(c, buff);
938 c = buff[0];
939 }
940#endif
941 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
942 }
943 }
944 try_next = FALSE;
945 break;
946
947 /* We can ignore word boundary tests. */
948
949 case OP_WORD_BOUNDARY:
950 case OP_NOT_WORD_BOUNDARY:
951 tcode++;
952 break;
953
954 /* If we hit a bracket or a positive lookahead assertion, recurse to set
955 bits from within the subpattern. If it can't find anything, we have to
956 give up. If it finds some mandatory character(s), we are done for this
957 branch. Otherwise, carry on scanning after the subpattern. */
958
959 case OP_BRA:
960 case OP_SBRA:
961 case OP_CBRA:
962 case OP_SCBRA:
963 case OP_BRAPOS:
964 case OP_SBRAPOS:
965 case OP_CBRAPOS:
966 case OP_SCBRAPOS:
967 case OP_ONCE:
968 case OP_ONCE_NC:
969 case OP_ASSERT:
970 rc = set_start_bits(tcode, start_bits, utf, cd);
971 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
972 if (rc == SSB_DONE) try_next = FALSE; else
973 {
974 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
975 tcode += 1 + LINK_SIZE;
976 }
977 break;
978
979 /* If we hit ALT or KET, it means we haven't found anything mandatory in
980 this branch, though we might have found something optional. For ALT, we
981 continue with the next alternative, but we have to arrange that the final
982 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
983 return SSB_CONTINUE: if this is the top level, that indicates failure,
984 but after a nested subpattern, it causes scanning to continue. */
985
986 case OP_ALT:
987 yield = SSB_CONTINUE;
988 try_next = FALSE;
989 break;
990
991 case OP_KET:
992 case OP_KETRMAX:
993 case OP_KETRMIN:
994 case OP_KETRPOS:
995 return SSB_CONTINUE;
996
997 /* Skip over callout */
998
999 case OP_CALLOUT:
1000 tcode += 2 + 2*LINK_SIZE;
1001 break;
1002
1003 /* Skip over lookbehind and negative lookahead assertions */
1004
1005 case OP_ASSERT_NOT:
1006 case OP_ASSERTBACK:
1007 case OP_ASSERTBACK_NOT:
1008 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1009 tcode += 1 + LINK_SIZE;
1010 break;
1011
1012 /* BRAZERO does the bracket, but carries on. */
1013
1014 case OP_BRAZERO:
1015 case OP_BRAMINZERO:
1016 case OP_BRAPOSZERO:
1017 rc = set_start_bits(++tcode, start_bits, utf, cd);
1018 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1019/* =========================================================================
1020 See the comment at the head of this function concerning the next line,
1021 which was an old fudge for the benefit of OS/2.
1022 dummy = 1;
1023 ========================================================================= */
1024 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1025 tcode += 1 + LINK_SIZE;
1026 break;
1027
1028 /* SKIPZERO skips the bracket. */
1029
1030 case OP_SKIPZERO:
1031 tcode++;
1032 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1033 tcode += 1 + LINK_SIZE;
1034 break;
1035
1036 /* Single-char * or ? sets the bit and tries the next item */
1037
1038 case OP_STAR:
1039 case OP_MINSTAR:
1040 case OP_POSSTAR:
1041 case OP_QUERY:
1042 case OP_MINQUERY:
1043 case OP_POSQUERY:
1044 tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1045 break;
1046
1047 case OP_STARI:
1048 case OP_MINSTARI:
1049 case OP_POSSTARI:
1050 case OP_QUERYI:
1051 case OP_MINQUERYI:
1052 case OP_POSQUERYI:
1053 tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1054 break;
1055
1056 /* Single-char upto sets the bit and tries the next */
1057
1058 case OP_UPTO:
1059 case OP_MINUPTO:
1060 case OP_POSUPTO:
1061 tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
1062 break;
1063
1064 case OP_UPTOI:
1065 case OP_MINUPTOI:
1066 case OP_POSUPTOI:
1067 tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
1068 break;
1069
1070 /* At least one single char sets the bit and stops */
1071
1072 case OP_EXACT:
1073 tcode += IMM2_SIZE;
1074 /* Fall through */
1075 case OP_CHAR:
1076 case OP_PLUS:
1077 case OP_MINPLUS:
1078 case OP_POSPLUS:
1079 (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1080 try_next = FALSE;
1081 break;
1082
1083 case OP_EXACTI:
1084 tcode += IMM2_SIZE;
1085 /* Fall through */
1086 case OP_CHARI:
1087 case OP_PLUSI:
1088 case OP_MINPLUSI:
1089 case OP_POSPLUSI:
1090 (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1091 try_next = FALSE;
1092 break;
1093
1094 /* Special spacing and line-terminating items. These recognize specific
1095 lists of characters. The difference between VSPACE and ANYNL is that the
1096 latter can match the two-character CRLF sequence, but that is not
1097 relevant for finding the first character, so their code here is
1098 identical. */
1099
1100 case OP_HSPACE:
1101 SET_BIT(CHAR_HT);
1102 SET_BIT(CHAR_SPACE);
1103#ifdef SUPPORT_UTF
1104 if (utf)
1105 {
1106#ifdef COMPILE_PCRE8
1107 SET_BIT(0xC2); /* For U+00A0 */
1108 SET_BIT(0xE1); /* For U+1680, U+180E */
1109 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1110 SET_BIT(0xE3); /* For U+3000 */
1111#elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1112 SET_BIT(0xA0);
1113 SET_BIT(0xFF); /* For characters > 255 */
1114#endif /* COMPILE_PCRE[8|16|32] */
1115 }
1116 else
1117#endif /* SUPPORT_UTF */
1118 {
1119#ifndef EBCDIC
1120 SET_BIT(0xA0);
1121#endif /* Not EBCDIC */
1122#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1123 SET_BIT(0xFF); /* For characters > 255 */
1124#endif /* COMPILE_PCRE[16|32] */
1125 }
1126 try_next = FALSE;
1127 break;
1128
1129 case OP_ANYNL:
1130 case OP_VSPACE:
1131 SET_BIT(CHAR_LF);
1132 SET_BIT(CHAR_VT);
1133 SET_BIT(CHAR_FF);
1134 SET_BIT(CHAR_CR);
1135#ifdef SUPPORT_UTF
1136 if (utf)
1137 {
1138#ifdef COMPILE_PCRE8
1139 SET_BIT(0xC2); /* For U+0085 */
1140 SET_BIT(0xE2); /* For U+2028, U+2029 */
1141#elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1142 SET_BIT(CHAR_NEL);
1143 SET_BIT(0xFF); /* For characters > 255 */
1144#endif /* COMPILE_PCRE[8|16|32] */
1145 }
1146 else
1147#endif /* SUPPORT_UTF */
1148 {
1149 SET_BIT(CHAR_NEL);
1150#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1151 SET_BIT(0xFF); /* For characters > 255 */
1152#endif
1153 }
1154 try_next = FALSE;
1155 break;
1156
1157 /* Single character types set the bits and stop. Note that if PCRE_UCP
1158 is set, we do not see these op codes because \d etc are converted to
1159 properties. Therefore, these apply in the case when only characters less
1160 than 256 are recognized to match the types. */
1161
1162 case OP_NOT_DIGIT:
1163 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1164 try_next = FALSE;
1165 break;
1166
1167 case OP_DIGIT:
1168 set_type_bits(start_bits, cbit_digit, table_limit, cd);
1169 try_next = FALSE;
1170 break;
1171
1172 /* The cbit_space table has vertical tab as whitespace; we no longer
1173 have to play fancy tricks because Perl added VT to its whitespace at
1174 release 5.18. PCRE added it at release 8.34. */
1175
1176 case OP_NOT_WHITESPACE:
1177 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1178 try_next = FALSE;
1179 break;
1180
1181 case OP_WHITESPACE:
1182 set_type_bits(start_bits, cbit_space, table_limit, cd);
1183 try_next = FALSE;
1184 break;
1185
1186 case OP_NOT_WORDCHAR:
1187 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1188 try_next = FALSE;
1189 break;
1190
1191 case OP_WORDCHAR:
1192 set_type_bits(start_bits, cbit_word, table_limit, cd);
1193 try_next = FALSE;
1194 break;
1195
1196 /* One or more character type fudges the pointer and restarts, knowing
1197 it will hit a single character type and stop there. */
1198
1199 case OP_TYPEPLUS:
1200 case OP_TYPEMINPLUS:
1201 case OP_TYPEPOSPLUS:
1202 tcode++;
1203 break;
1204
1205 case OP_TYPEEXACT:
1206 tcode += 1 + IMM2_SIZE;
1207 break;
1208
1209 /* Zero or more repeats of character types set the bits and then
1210 try again. */
1211
1212 case OP_TYPEUPTO:
1213 case OP_TYPEMINUPTO:
1214 case OP_TYPEPOSUPTO:
1215 tcode += IMM2_SIZE; /* Fall through */
1216
1217 case OP_TYPESTAR:
1218 case OP_TYPEMINSTAR:
1219 case OP_TYPEPOSSTAR:
1220 case OP_TYPEQUERY:
1221 case OP_TYPEMINQUERY:
1222 case OP_TYPEPOSQUERY:
1223 switch(tcode[1])
1224 {
1225 default:
1226 case OP_ANY:
1227 case OP_ALLANY:
1228 return SSB_FAIL;
1229
1230 case OP_HSPACE:
1231 SET_BIT(CHAR_HT);
1232 SET_BIT(CHAR_SPACE);
1233#ifdef SUPPORT_UTF
1234 if (utf)
1235 {
1236#ifdef COMPILE_PCRE8
1237 SET_BIT(0xC2); /* For U+00A0 */
1238 SET_BIT(0xE1); /* For U+1680, U+180E */
1239 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1240 SET_BIT(0xE3); /* For U+3000 */
1241#elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1242 SET_BIT(0xA0);
1243 SET_BIT(0xFF); /* For characters > 255 */
1244#endif /* COMPILE_PCRE[8|16|32] */
1245 }
1246 else
1247#endif /* SUPPORT_UTF */
1248#ifndef EBCDIC
1249 SET_BIT(0xA0);
1250#endif /* Not EBCDIC */
1251 break;
1252
1253 case OP_ANYNL:
1254 case OP_VSPACE:
1255 SET_BIT(CHAR_LF);
1256 SET_BIT(CHAR_VT);
1257 SET_BIT(CHAR_FF);
1258 SET_BIT(CHAR_CR);
1259#ifdef SUPPORT_UTF
1260 if (utf)
1261 {
1262#ifdef COMPILE_PCRE8
1263 SET_BIT(0xC2); /* For U+0085 */
1264 SET_BIT(0xE2); /* For U+2028, U+2029 */
1265#elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1266 SET_BIT(CHAR_NEL);
1267 SET_BIT(0xFF); /* For characters > 255 */
1268#endif /* COMPILE_PCRE16 */
1269 }
1270 else
1271#endif /* SUPPORT_UTF */
1272 SET_BIT(CHAR_NEL);
1273 break;
1274
1275 case OP_NOT_DIGIT:
1276 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1277 break;
1278
1279 case OP_DIGIT:
1280 set_type_bits(start_bits, cbit_digit, table_limit, cd);
1281 break;
1282
1283 /* The cbit_space table has vertical tab as whitespace; we no longer
1284 have to play fancy tricks because Perl added VT to its whitespace at
1285 release 5.18. PCRE added it at release 8.34. */
1286
1287 case OP_NOT_WHITESPACE:
1288 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1289 break;
1290
1291 case OP_WHITESPACE:
1292 set_type_bits(start_bits, cbit_space, table_limit, cd);
1293 break;
1294
1295 case OP_NOT_WORDCHAR:
1296 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1297 break;
1298
1299 case OP_WORDCHAR:
1300 set_type_bits(start_bits, cbit_word, table_limit, cd);
1301 break;
1302 }
1303
1304 tcode += 2;
1305 break;
1306
1307 /* Character class where all the information is in a bit map: set the
1308 bits and either carry on or not, according to the repeat count. If it was
1309 a negative class, and we are operating with UTF-8 characters, any byte
1310 with a value >= 0xc4 is a potentially valid starter because it starts a
1311 character with a value > 255. */
1312
1313#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1314 case OP_XCLASS:
1315 if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0)
1316 return SSB_FAIL;
1317 /* All bits are set. */
1318 if ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0 && (tcode[1 + LINK_SIZE] & XCL_NOT) != 0)
1319 return SSB_FAIL;
1320#endif
1321 /* Fall through */
1322
1323 case OP_NCLASS:
1324#if defined SUPPORT_UTF && defined COMPILE_PCRE8
1325 if (utf)
1326 {
1327 start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1328 memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1329 }
1330#endif
1331#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1332 SET_BIT(0xFF); /* For characters > 255 */
1333#endif
1334 /* Fall through */
1335
1336 case OP_CLASS:
1337 {
1338 pcre_uint8 *map;
1339#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1340 map = NULL;
1341 if (*tcode == OP_XCLASS)
1342 {
1343 if ((tcode[1 + LINK_SIZE] & XCL_MAP) != 0)
1344 map = (pcre_uint8 *)(tcode + 1 + LINK_SIZE + 1);
1345 tcode += GET(tcode, 1);
1346 }
1347 else
1348#endif
1349 {
1350 tcode++;
1351 map = (pcre_uint8 *)tcode;
1352 tcode += 32 / sizeof(pcre_uchar);
1353 }
1354
1355 /* In UTF-8 mode, the bits in a bit map correspond to character
1356 values, not to byte values. However, the bit map we are constructing is
1357 for byte values. So we have to do a conversion for characters whose
1358 value is > 127. In fact, there are only two possible starting bytes for
1359 characters in the range 128 - 255. */
1360
1361#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1362 if (map != NULL)
1363#endif
1364 {
1365#if defined SUPPORT_UTF && defined COMPILE_PCRE8
1366 if (utf)
1367 {
1368 for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1369 for (c = 128; c < 256; c++)
1370 {
1371 if ((map[c/8] & (1 << (c&7))) != 0)
1372 {
1373 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1374 start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
1375 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1376 }
1377 }
1378 }
1379 else
1380#endif
1381 {
1382 /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1383 for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1384 }
1385 }
1386
1387 /* Advance past the bit map, and act on what follows. For a zero
1388 minimum repeat, continue; otherwise stop processing. */
1389
1390 switch (*tcode)
1391 {
1392 case OP_CRSTAR:
1393 case OP_CRMINSTAR:
1394 case OP_CRQUERY:
1395 case OP_CRMINQUERY:
1396 case OP_CRPOSSTAR:
1397 case OP_CRPOSQUERY:
1398 tcode++;
1399 break;
1400
1401 case OP_CRRANGE:
1402 case OP_CRMINRANGE:
1403 case OP_CRPOSRANGE:
1404 if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1405 else try_next = FALSE;
1406 break;
1407
1408 default:
1409 try_next = FALSE;
1410 break;
1411 }
1412 }
1413 break; /* End of bitmap class handling */
1414
1415 } /* End of switch */
1416 } /* End of try_next loop */
1417
1418 code += GET(code, 1); /* Advance to next branch */
1419 }
1420while (*code == OP_ALT);
1421return yield;
1422}
1423
1424
1425
1426
1427
1428/*************************************************
1429* Study a compiled expression *
1430*************************************************/
1431
1432/* This function is handed a compiled expression that it must study to produce
1433information that will speed up the matching. It returns a pcre[16]_extra block
1434which then gets handed back to pcre_exec().
1435
1436Arguments:
1437 re points to the compiled expression
1438 options contains option bits
1439 errorptr points to where to place error messages;
1440 set NULL unless error
1441
1442Returns: pointer to a pcre[16]_extra block, with study_data filled in and
1443 the appropriate flags set;
1444 NULL on error or if no optimization possible
1445*/
1446
1447#if defined COMPILE_PCRE8
1448PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1449pcre_study(const pcre *external_re, int options, const char **errorptr)
1450#elif defined COMPILE_PCRE16
1451PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1452pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1453#elif defined COMPILE_PCRE32
1454PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
1455pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
1456#endif
1457{
1458int min;
1459int count = 0;
1460BOOL bits_set = FALSE;
1461pcre_uint8 start_bits[32];
1462PUBL(extra) *extra = NULL;
1463pcre_study_data *study;
1464const pcre_uint8 *tables;
1465pcre_uchar *code;
1466compile_data compile_block;
1467const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1468
1469
1470*errorptr = NULL;
1471
1472if (re == NULL || re->magic_number != MAGIC_NUMBER)
1473 {
1474 *errorptr = "argument is not a compiled regular expression";
1475 return NULL;
1476 }
1477
1478if ((re->flags & PCRE_MODE) == 0)
1479 {
1480#if defined COMPILE_PCRE8
1481 *errorptr = "argument not compiled in 8 bit mode";
1482#elif defined COMPILE_PCRE16
1483 *errorptr = "argument not compiled in 16 bit mode";
1484#elif defined COMPILE_PCRE32
1485 *errorptr = "argument not compiled in 32 bit mode";
1486#endif
1487 return NULL;
1488 }
1489
1490if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1491 {
1492 *errorptr = "unknown or incorrect option bit(s) set";
1493 return NULL;
1494 }
1495
1496code = (pcre_uchar *)re + re->name_table_offset +
1497 (re->name_count * re->name_entry_size);
1498
1499/* For an anchored pattern, or an unanchored pattern that has a first char, or
1500a multiline pattern that matches only at "line starts", there is no point in
1501seeking a list of starting bytes. */
1502
1503if ((re->options & PCRE_ANCHORED) == 0 &&
1504 (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1505 {
1506 int rc;
1507
1508 /* Set the character tables in the block that is passed around */
1509
1510 tables = re->tables;
1511
1512#if defined COMPILE_PCRE8
1513 if (tables == NULL)
1514 (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1515 (void *)(&tables));
1516#elif defined COMPILE_PCRE16
1517 if (tables == NULL)
1518 (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1519 (void *)(&tables));
1520#elif defined COMPILE_PCRE32
1521 if (tables == NULL)
1522 (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1523 (void *)(&tables));
1524#endif
1525
1526 compile_block.lcc = tables + lcc_offset;
1527 compile_block.fcc = tables + fcc_offset;
1528 compile_block.cbits = tables + cbits_offset;
1529 compile_block.ctypes = tables + ctypes_offset;
1530
1531 /* See if we can find a fixed set of initial characters for the pattern. */
1532
1533 memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1534 rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1535 &compile_block);
1536 bits_set = rc == SSB_DONE;
1537 if (rc == SSB_UNKNOWN)
1538 {
1539 *errorptr = "internal error: opcode not recognized";
1540 return NULL;
1541 }
1542 }
1543
1544/* Find the minimum length of subject string. */
1545
1546switch(min = find_minlength(re, code, code, re->options, NULL, &count))
1547 {
1548 case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1549 case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1550 default: break;
1551 }
1552
1553/* If a set of starting bytes has been identified, or if the minimum length is
1554greater than zero, or if JIT optimization has been requested, or if
1555PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1556pcre_study_data block. The study data is put in the latter, which is pointed to
1557by the former, which may also get additional data set later by the calling
1558program. At the moment, the size of pcre_study_data is fixed. We nevertheless
1559save it in a field for returning via the pcre_fullinfo() function so that if it
1560becomes variable in the future, we don't have to change that code. */
1561
1562if (bits_set || min > 0 || (options & (
1563#ifdef SUPPORT_JIT
1564 PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
1565 PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
1566#endif
1567 PCRE_STUDY_EXTRA_NEEDED)) != 0)
1568 {
1569 extra = (PUBL(extra) *)(PUBL(malloc))
1570 (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1571 if (extra == NULL)
1572 {
1573 *errorptr = "failed to get memory";
1574 return NULL;
1575 }
1576
1577 study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1578 extra->flags = PCRE_EXTRA_STUDY_DATA;
1579 extra->study_data = study;
1580
1581 study->size = sizeof(pcre_study_data);
1582 study->flags = 0;
1583
1584 /* Set the start bits always, to avoid unset memory errors if the
1585 study data is written to a file, but set the flag only if any of the bits
1586 are set, to save time looking when none are. */
1587
1588 if (bits_set)
1589 {
1590 study->flags |= PCRE_STUDY_MAPPED;
1591 memcpy(study->start_bits, start_bits, sizeof(start_bits));
1592 }
1593 else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1594
1595#ifdef PCRE_DEBUG
1596 if (bits_set)
1597 {
1598 pcre_uint8 *ptr = start_bits;
1599 int i;
1600
1601 printf("Start bits:\n");
1602 for (i = 0; i < 32; i++)
1603 printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1604 }
1605#endif
1606
1607 /* Always set the minlength value in the block, because the JIT compiler
1608 makes use of it. However, don't set the bit unless the length is greater than
1609 zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1610 checking the zero case. */
1611
1612 if (min > 0)
1613 {
1614 study->flags |= PCRE_STUDY_MINLEN;
1615 study->minlength = min;
1616 }
1617 else study->minlength = 0;
1618
1619 /* If JIT support was compiled and requested, attempt the JIT compilation.
1620 If no starting bytes were found, and the minimum length is zero, and JIT
1621 compilation fails, abandon the extra block and return NULL, unless
1622 PCRE_STUDY_EXTRA_NEEDED is set. */
1623
1624#ifdef SUPPORT_JIT
1625 extra->executable_jit = NULL;
1626 if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1627 PRIV(jit_compile)(re, extra, JIT_COMPILE);
1628 if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1629 PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1630 if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1631 PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1632
1633 if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
1634 (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
1635 {
1636#if defined COMPILE_PCRE8
1637 pcre_free_study(extra);
1638#elif defined COMPILE_PCRE16
1639 pcre16_free_study(extra);
1640#elif defined COMPILE_PCRE32
1641 pcre32_free_study(extra);
1642#endif
1643 extra = NULL;
1644 }
1645#endif
1646 }
1647
1648return extra;
1649}
1650
1651
1652/*************************************************
1653* Free the study data *
1654*************************************************/
1655
1656/* This function frees the memory that was obtained by pcre_study().
1657
1658Argument: a pointer to the pcre[16]_extra block
1659Returns: nothing
1660*/
1661
1662#if defined COMPILE_PCRE8
1663PCRE_EXP_DEFN void
1664pcre_free_study(pcre_extra *extra)
1665#elif defined COMPILE_PCRE16
1666PCRE_EXP_DEFN void
1667pcre16_free_study(pcre16_extra *extra)
1668#elif defined COMPILE_PCRE32
1669PCRE_EXP_DEFN void
1670pcre32_free_study(pcre32_extra *extra)
1671#endif
1672{
1673if (extra == NULL)
1674 return;
1675#ifdef SUPPORT_JIT
1676if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1677 extra->executable_jit != NULL)
1678 PRIV(jit_free)(extra->executable_jit);
1679#endif
1680PUBL(free)(extra);
1681}
1682
1683/* End of pcre_study.c */
1684