1 | /************************************************* |
2 | * Perl-Compatible Regular Expressions * |
3 | *************************************************/ |
4 | |
5 | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | and semantics are as close as possible to those of the Perl 5 language. |
7 | |
8 | Written by Philip Hazel |
9 | Original API code Copyright (c) 1997-2012 University of Cambridge |
10 | New API code Copyright (c) 2016-2018 University of Cambridge |
11 | |
12 | ----------------------------------------------------------------------------- |
13 | Redistribution and use in source and binary forms, with or without |
14 | modification, are permitted provided that the following conditions are met: |
15 | |
16 | * Redistributions of source code must retain the above copyright notice, |
17 | this list of conditions and the following disclaimer. |
18 | |
19 | * Redistributions in binary form must reproduce the above copyright |
20 | notice, this list of conditions and the following disclaimer in the |
21 | documentation and/or other materials provided with the distribution. |
22 | |
23 | * Neither the name of the University of Cambridge nor the names of its |
24 | contributors may be used to endorse or promote products derived from |
25 | this software without specific prior written permission. |
26 | |
27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | POSSIBILITY OF SUCH DAMAGE. |
38 | ----------------------------------------------------------------------------- |
39 | */ |
40 | |
41 | |
42 | /* This module contains a single function that scans through a compiled pattern |
43 | until it finds a capturing bracket with the given number, or, if the number is |
44 | negative, an instance of OP_REVERSE for a lookbehind. The function is called |
45 | from pcre2_compile.c and also from pcre2_study.c when finding the minimum |
46 | matching length. */ |
47 | |
48 | |
49 | #ifdef HAVE_CONFIG_H |
50 | #include "config.h" |
51 | #endif |
52 | |
53 | #include "pcre2_internal.h" |
54 | |
55 | |
56 | /************************************************* |
57 | * Scan compiled regex for specific bracket * |
58 | *************************************************/ |
59 | |
60 | /* |
61 | Arguments: |
62 | code points to start of expression |
63 | utf TRUE in UTF mode |
64 | number the required bracket number or negative to find a lookbehind |
65 | |
66 | Returns: pointer to the opcode for the bracket, or NULL if not found |
67 | */ |
68 | |
69 | PCRE2_SPTR |
70 | PRIV(find_bracket)(PCRE2_SPTR code, BOOL utf, int number) |
71 | { |
72 | for (;;) |
73 | { |
74 | PCRE2_UCHAR c = *code; |
75 | |
76 | if (c == OP_END) return NULL; |
77 | |
78 | /* XCLASS is used for classes that cannot be represented just by a bit map. |
79 | This includes negated single high-valued characters. CALLOUT_STR is used for |
80 | callouts with string arguments. In both cases the length in the table is |
81 | zero; the actual length is stored in the compiled code. */ |
82 | |
83 | if (c == OP_XCLASS) code += GET(code, 1); |
84 | else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE); |
85 | |
86 | /* Handle lookbehind */ |
87 | |
88 | else if (c == OP_REVERSE) |
89 | { |
90 | if (number < 0) return (PCRE2_UCHAR *)code; |
91 | code += PRIV(OP_lengths)[c]; |
92 | } |
93 | |
94 | /* Handle capturing bracket */ |
95 | |
96 | else if (c == OP_CBRA || c == OP_SCBRA || |
97 | c == OP_CBRAPOS || c == OP_SCBRAPOS) |
98 | { |
99 | int n = (int)GET2(code, 1+LINK_SIZE); |
100 | if (n == number) return (PCRE2_UCHAR *)code; |
101 | code += PRIV(OP_lengths)[c]; |
102 | } |
103 | |
104 | /* Otherwise, we can get the item's length from the table, except that for |
105 | repeated character types, we have to test for \p and \P, which have an extra |
106 | two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we |
107 | must add in its length. */ |
108 | |
109 | else |
110 | { |
111 | switch(c) |
112 | { |
113 | case OP_TYPESTAR: |
114 | case OP_TYPEMINSTAR: |
115 | case OP_TYPEPLUS: |
116 | case OP_TYPEMINPLUS: |
117 | case OP_TYPEQUERY: |
118 | case OP_TYPEMINQUERY: |
119 | case OP_TYPEPOSSTAR: |
120 | case OP_TYPEPOSPLUS: |
121 | case OP_TYPEPOSQUERY: |
122 | if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; |
123 | break; |
124 | |
125 | case OP_TYPEUPTO: |
126 | case OP_TYPEMINUPTO: |
127 | case OP_TYPEEXACT: |
128 | case OP_TYPEPOSUPTO: |
129 | if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) |
130 | code += 2; |
131 | break; |
132 | |
133 | case OP_MARK: |
134 | case OP_COMMIT_ARG: |
135 | case OP_PRUNE_ARG: |
136 | case OP_SKIP_ARG: |
137 | case OP_THEN_ARG: |
138 | code += code[1]; |
139 | break; |
140 | } |
141 | |
142 | /* Add in the fixed length from the table */ |
143 | |
144 | code += PRIV(OP_lengths)[c]; |
145 | |
146 | /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be |
147 | followed by a multi-byte character. The length in the table is a minimum, so |
148 | we have to arrange to skip the extra bytes. */ |
149 | |
150 | #ifdef MAYBE_UTF_MULTI |
151 | if (utf) switch(c) |
152 | { |
153 | case OP_CHAR: |
154 | case OP_CHARI: |
155 | case OP_NOT: |
156 | case OP_NOTI: |
157 | case OP_EXACT: |
158 | case OP_EXACTI: |
159 | case OP_NOTEXACT: |
160 | case OP_NOTEXACTI: |
161 | case OP_UPTO: |
162 | case OP_UPTOI: |
163 | case OP_NOTUPTO: |
164 | case OP_NOTUPTOI: |
165 | case OP_MINUPTO: |
166 | case OP_MINUPTOI: |
167 | case OP_NOTMINUPTO: |
168 | case OP_NOTMINUPTOI: |
169 | case OP_POSUPTO: |
170 | case OP_POSUPTOI: |
171 | case OP_NOTPOSUPTO: |
172 | case OP_NOTPOSUPTOI: |
173 | case OP_STAR: |
174 | case OP_STARI: |
175 | case OP_NOTSTAR: |
176 | case OP_NOTSTARI: |
177 | case OP_MINSTAR: |
178 | case OP_MINSTARI: |
179 | case OP_NOTMINSTAR: |
180 | case OP_NOTMINSTARI: |
181 | case OP_POSSTAR: |
182 | case OP_POSSTARI: |
183 | case OP_NOTPOSSTAR: |
184 | case OP_NOTPOSSTARI: |
185 | case OP_PLUS: |
186 | case OP_PLUSI: |
187 | case OP_NOTPLUS: |
188 | case OP_NOTPLUSI: |
189 | case OP_MINPLUS: |
190 | case OP_MINPLUSI: |
191 | case OP_NOTMINPLUS: |
192 | case OP_NOTMINPLUSI: |
193 | case OP_POSPLUS: |
194 | case OP_POSPLUSI: |
195 | case OP_NOTPOSPLUS: |
196 | case OP_NOTPOSPLUSI: |
197 | case OP_QUERY: |
198 | case OP_QUERYI: |
199 | case OP_NOTQUERY: |
200 | case OP_NOTQUERYI: |
201 | case OP_MINQUERY: |
202 | case OP_MINQUERYI: |
203 | case OP_NOTMINQUERY: |
204 | case OP_NOTMINQUERYI: |
205 | case OP_POSQUERY: |
206 | case OP_POSQUERYI: |
207 | case OP_NOTPOSQUERY: |
208 | case OP_NOTPOSQUERYI: |
209 | if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); |
210 | break; |
211 | } |
212 | #else |
213 | (void)(utf); /* Keep compiler happy by referencing function argument */ |
214 | #endif /* MAYBE_UTF_MULTI */ |
215 | } |
216 | } |
217 | } |
218 | |
219 | /* End of pcre2_find_bracket.c */ |
220 | |