1 | /************************************************* |
2 | * Perl-Compatible Regular Expressions * |
3 | *************************************************/ |
4 | |
5 | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | and semantics are as close as possible to those of the Perl 5 language. |
7 | |
8 | Written by Philip Hazel |
9 | Original API code Copyright (c) 1997-2012 University of Cambridge |
10 | New API code Copyright (c) 2016-2022 University of Cambridge |
11 | |
12 | ----------------------------------------------------------------------------- |
13 | Redistribution and use in source and binary forms, with or without |
14 | modification, are permitted provided that the following conditions are met: |
15 | |
16 | * Redistributions of source code must retain the above copyright notice, |
17 | this list of conditions and the following disclaimer. |
18 | |
19 | * Redistributions in binary form must reproduce the above copyright |
20 | notice, this list of conditions and the following disclaimer in the |
21 | documentation and/or other materials provided with the distribution. |
22 | |
23 | * Neither the name of the University of Cambridge nor the names of its |
24 | contributors may be used to endorse or promote products derived from |
25 | this software without specific prior written permission. |
26 | |
27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | POSSIBILITY OF SUCH DAMAGE. |
38 | ----------------------------------------------------------------------------- |
39 | */ |
40 | |
41 | |
42 | #ifdef HAVE_CONFIG_H |
43 | #include "config.h" |
44 | #endif |
45 | |
46 | #include "pcre2_internal.h" |
47 | |
/* Number of slots in the stack used while nested replacement text is being
reprocessed. Each nesting level pushes two pointers (the resume position and
the saved end of the enclosing text). */

#define PTR_STACK_SIZE 20

/* The set of option bits that are specific to pcre2_substitute(). They are
separated from the caller's options before the remainder is handed on to
pcre2_match(). */

#define SUBSTITUTE_OPTIONS ( \
  PCRE2_SUBSTITUTE_EXTENDED         | \
  PCRE2_SUBSTITUTE_GLOBAL           | \
  PCRE2_SUBSTITUTE_LITERAL          | \
  PCRE2_SUBSTITUTE_MATCHED          | \
  PCRE2_SUBSTITUTE_OVERFLOW_LENGTH  | \
  PCRE2_SUBSTITUTE_REPLACEMENT_ONLY | \
  PCRE2_SUBSTITUTE_UNKNOWN_UNSET    | \
  PCRE2_SUBSTITUTE_UNSET_EMPTY)
55 | |
56 | |
57 | |
58 | /************************************************* |
59 | * Find end of substitute text * |
60 | *************************************************/ |
61 | |
62 | /* In extended mode, we recognize ${name:+set text:unset text} and similar |
63 | constructions. This requires the identification of unescaped : and } |
64 | characters. This function scans for such. It must deal with nested ${ |
65 | constructions. The pointer to the text is updated, either to the required end |
66 | character, or to where an error was detected. |
67 | |
68 | Arguments: |
69 | code points to the compiled expression (for options) |
70 | ptrptr points to the pointer to the start of the text (updated) |
71 | ptrend end of the whole string |
72 | last TRUE if the last expected string (only } recognized) |
73 | |
74 | Returns: 0 on success |
75 | negative error code on failure |
76 | */ |
77 | |
78 | static int |
79 | find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, |
80 | BOOL last) |
81 | { |
82 | int rc = 0; |
83 | uint32_t nestlevel = 0; |
84 | BOOL literal = FALSE; |
85 | PCRE2_SPTR ptr = *ptrptr; |
86 | |
87 | for (; ptr < ptrend; ptr++) |
88 | { |
89 | if (literal) |
90 | { |
91 | if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E) |
92 | { |
93 | literal = FALSE; |
94 | ptr += 1; |
95 | } |
96 | } |
97 | |
98 | else if (*ptr == CHAR_RIGHT_CURLY_BRACKET) |
99 | { |
100 | if (nestlevel == 0) goto EXIT; |
101 | nestlevel--; |
102 | } |
103 | |
104 | else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT; |
105 | |
106 | else if (*ptr == CHAR_DOLLAR_SIGN) |
107 | { |
108 | if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET) |
109 | { |
110 | nestlevel++; |
111 | ptr += 1; |
112 | } |
113 | } |
114 | |
115 | else if (*ptr == CHAR_BACKSLASH) |
116 | { |
117 | int erc; |
118 | int errorcode; |
119 | uint32_t ch; |
120 | |
121 | if (ptr < ptrend - 1) switch (ptr[1]) |
122 | { |
123 | case CHAR_L: |
124 | case CHAR_l: |
125 | case CHAR_U: |
126 | case CHAR_u: |
127 | ptr += 1; |
128 | continue; |
129 | } |
130 | |
131 | ptr += 1; /* Must point after \ */ |
132 | erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode, |
133 | code->overall_options, code->extra_options, FALSE, NULL); |
134 | ptr -= 1; /* Back to last code unit of escape */ |
135 | if (errorcode != 0) |
136 | { |
137 | rc = errorcode; |
138 | goto EXIT; |
139 | } |
140 | |
141 | switch(erc) |
142 | { |
143 | case 0: /* Data character */ |
144 | case ESC_E: /* Isolated \E is ignored */ |
145 | break; |
146 | |
147 | case ESC_Q: |
148 | literal = TRUE; |
149 | break; |
150 | |
151 | default: |
152 | rc = PCRE2_ERROR_BADREPESCAPE; |
153 | goto EXIT; |
154 | } |
155 | } |
156 | } |
157 | |
158 | rc = PCRE2_ERROR_REPMISSINGBRACE; /* Terminator not found */ |
159 | |
160 | EXIT: |
161 | *ptrptr = ptr; |
162 | return rc; |
163 | } |
164 | |
165 | |
166 | |
167 | /************************************************* |
168 | * Match and substitute * |
169 | *************************************************/ |
170 | |
171 | /* This function applies a compiled re to a subject string and creates a new |
172 | string with substitutions. The first 7 arguments are the same as for |
173 | pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED. |
174 | |
175 | Arguments: |
176 | code points to the compiled expression |
177 | subject points to the subject string |
178 | length length of subject string (may contain binary zeros) |
179 | start_offset where to start in the subject string |
180 | options option bits |
181 | match_data points to a match_data block, or is NULL |
  context        points to a PCRE2 match context, or is NULL
183 | replacement points to the replacement string |
184 | rlength length of replacement string |
185 | buffer where to put the substituted string |
186 | blength points to length of buffer; updated to length of string |
187 | |
188 | Returns: >= 0 number of substitutions made |
189 | < 0 an error code |
190 | PCRE2_ERROR_BADREPLACEMENT means invalid use of $ |
191 | */ |
192 | |
/* This macro appends "length" code units, starting at "from", to the output
buffer, checking for space first. It is not self-contained: it uses the
variables buffer, buff_offset, lengthleft, overflowed, extra_needed and
suboptions, and the label NOROOM, all of which must be in scope at the point of
use.

If there is not enough room and PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is not set,
control goes straight to NOROOM. If the option is set, nothing more is copied
from then on, and extra_needed accumulates the number of additional code units
that would have been required.

The body is wrapped in do { } while (0) so that an invocation followed by a
semicolon forms exactly one statement (and is therefore safe in an unbraced
if/else). The length argument is evaluated only once, into a local, so that an
expression may safely be passed. */

#define CHECKMEMCPY(from,length) \
  do \
    { \
    PCRE2_SIZE checkmemcpy_length = (length); \
    if (!overflowed && lengthleft < checkmemcpy_length) \
      { \
      if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
      overflowed = TRUE; \
      extra_needed = checkmemcpy_length - lengthleft; \
      } \
    else if (overflowed) \
      { \
      extra_needed += checkmemcpy_length; \
      } \
    else \
      { \
      memcpy(buffer + buff_offset, (from), CU2BYTES(checkmemcpy_length)); \
      buff_offset += checkmemcpy_length; \
      lengthleft -= checkmemcpy_length; \
      } \
    } \
  while (0)
216 | |
217 | /* Here's the function */ |
218 | |
219 | PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION |
220 | pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, |
221 | PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, |
222 | pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength, |
223 | PCRE2_UCHAR *buffer, PCRE2_SIZE *blength) |
224 | { |
225 | int rc; |
226 | int subs; |
227 | int forcecase = 0; |
228 | int forcecasereset = 0; |
229 | uint32_t ovector_count; |
230 | uint32_t goptions = 0; |
231 | uint32_t suboptions; |
232 | pcre2_match_data *internal_match_data = NULL; |
233 | BOOL escaped_literal = FALSE; |
234 | BOOL overflowed = FALSE; |
235 | BOOL use_existing_match; |
236 | BOOL replacement_only; |
237 | #ifdef SUPPORT_UNICODE |
238 | BOOL utf = (code->overall_options & PCRE2_UTF) != 0; |
239 | BOOL ucp = (code->overall_options & PCRE2_UCP) != 0; |
240 | #endif |
241 | PCRE2_UCHAR temp[6]; |
242 | PCRE2_SPTR ptr; |
243 | PCRE2_SPTR repend; |
244 | PCRE2_SIZE = 0; |
245 | PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength; |
246 | PCRE2_SIZE *ovector; |
247 | PCRE2_SIZE ovecsave[3]; |
248 | pcre2_substitute_callout_block scb; |
249 | |
250 | /* General initialization */ |
251 | |
252 | buff_offset = 0; |
253 | lengthleft = buff_length = *blength; |
254 | *blength = PCRE2_UNSET; |
255 | ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET; |
256 | |
257 | /* Partial matching is not valid. This must come after setting *blength to |
258 | PCRE2_UNSET, so as not to imply an offset in the replacement. */ |
259 | |
260 | if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0) |
261 | return PCRE2_ERROR_BADOPTION; |
262 | |
263 | /* Validate length and find the end of the replacement. A NULL replacement of |
264 | zero length is interpreted as an empty string. */ |
265 | |
266 | if (replacement == NULL) |
267 | { |
268 | if (rlength != 0) return PCRE2_ERROR_NULL; |
269 | replacement = (PCRE2_SPTR)"" ; |
270 | } |
271 | |
272 | if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement); |
273 | repend = replacement + rlength; |
274 | |
275 | /* Check for using a match that has already happened. Note that the subject |
276 | pointer in the match data may be NULL after a no-match. */ |
277 | |
278 | use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0); |
279 | replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0); |
280 | |
281 | /* If starting from an existing match, there must be an externally provided |
282 | match data block. We create an internal match_data block in two cases: (a) an |
283 | external one is not supplied (and we are not starting from an existing match); |
284 | (b) an existing match is to be used for the first substitution. In the latter |
285 | case, we copy the existing match into the internal block, except for any cached |
286 | heap frame size and pointer. This ensures that no changes are made to the |
287 | external match data block. */ |
288 | |
289 | if (match_data == NULL) |
290 | { |
291 | pcre2_general_context *gcontext; |
292 | if (use_existing_match) return PCRE2_ERROR_NULL; |
293 | gcontext = (mcontext == NULL)? |
294 | (pcre2_general_context *)code : |
295 | (pcre2_general_context *)mcontext; |
296 | match_data = internal_match_data = |
297 | pcre2_match_data_create_from_pattern(code, gcontext); |
298 | if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY; |
299 | } |
300 | |
301 | else if (use_existing_match) |
302 | { |
303 | pcre2_general_context *gcontext = (mcontext == NULL)? |
304 | (pcre2_general_context *)code : |
305 | (pcre2_general_context *)mcontext; |
306 | int pairs = (code->top_bracket + 1 < match_data->oveccount)? |
307 | code->top_bracket + 1 : match_data->oveccount; |
308 | internal_match_data = pcre2_match_data_create(match_data->oveccount, |
309 | gcontext); |
310 | if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY; |
311 | memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector) |
312 | + 2*pairs*sizeof(PCRE2_SIZE)); |
313 | internal_match_data->heapframes = NULL; |
314 | internal_match_data->heapframes_size = 0; |
315 | match_data = internal_match_data; |
316 | } |
317 | |
318 | /* Remember ovector details */ |
319 | |
320 | ovector = pcre2_get_ovector_pointer(match_data); |
321 | ovector_count = pcre2_get_ovector_count(match_data); |
322 | |
323 | /* Fixed things in the callout block */ |
324 | |
325 | scb.version = 0; |
326 | scb.input = subject; |
327 | scb.output = (PCRE2_SPTR)buffer; |
328 | scb.ovector = ovector; |
329 | |
330 | /* A NULL subject of zero length is treated as an empty string. */ |
331 | |
332 | if (subject == NULL) |
333 | { |
334 | if (length != 0) return PCRE2_ERROR_NULL; |
335 | subject = (PCRE2_SPTR)"" ; |
336 | } |
337 | |
338 | /* Find length of zero-terminated subject */ |
339 | |
340 | if (length == PCRE2_ZERO_TERMINATED) |
341 | length = subject? PRIV(strlen)(subject) : 0; |
342 | |
343 | /* Check UTF replacement string if necessary. */ |
344 | |
345 | #ifdef SUPPORT_UNICODE |
346 | if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) |
347 | { |
348 | rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar)); |
349 | if (rc != 0) |
350 | { |
351 | match_data->leftchar = 0; |
352 | goto EXIT; |
353 | } |
354 | } |
355 | #endif /* SUPPORT_UNICODE */ |
356 | |
357 | /* Save the substitute options and remove them from the match options. */ |
358 | |
359 | suboptions = options & SUBSTITUTE_OPTIONS; |
360 | options &= ~SUBSTITUTE_OPTIONS; |
361 | |
362 | /* Error if the start match offset is greater than the length of the subject. */ |
363 | |
364 | if (start_offset > length) |
365 | { |
366 | match_data->leftchar = 0; |
367 | rc = PCRE2_ERROR_BADOFFSET; |
368 | goto EXIT; |
369 | } |
370 | |
371 | /* Copy up to the start offset, unless only the replacement is required. */ |
372 | |
373 | if (!replacement_only) CHECKMEMCPY(subject, start_offset); |
374 | |
375 | /* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first |
376 | match is taken from the match_data that was passed in. */ |
377 | |
378 | subs = 0; |
379 | do |
380 | { |
381 | PCRE2_SPTR ptrstack[PTR_STACK_SIZE]; |
382 | uint32_t ptrstackptr = 0; |
383 | |
384 | if (use_existing_match) |
385 | { |
386 | rc = match_data->rc; |
387 | use_existing_match = FALSE; |
388 | } |
389 | else rc = pcre2_match(code, subject, length, start_offset, options|goptions, |
390 | match_data, mcontext); |
391 | |
392 | #ifdef SUPPORT_UNICODE |
393 | if (utf) options |= PCRE2_NO_UTF_CHECK; /* Only need to check once */ |
394 | #endif |
395 | |
396 | /* Any error other than no match returns the error code. No match when not |
397 | doing the special after-empty-match global rematch, or when at the end of the |
398 | subject, breaks the global loop. Otherwise, advance the starting point by one |
399 | character, copying it to the output, and try again. */ |
400 | |
401 | if (rc < 0) |
402 | { |
403 | PCRE2_SIZE save_start; |
404 | |
405 | if (rc != PCRE2_ERROR_NOMATCH) goto EXIT; |
406 | if (goptions == 0 || start_offset >= length) break; |
407 | |
408 | /* Advance by one code point. Then, if CRLF is a valid newline sequence and |
409 | we have advanced into the middle of it, advance one more code point. In |
410 | other words, do not start in the middle of CRLF, even if CR and LF on their |
411 | own are valid newlines. */ |
412 | |
413 | save_start = start_offset++; |
414 | if (subject[start_offset-1] == CHAR_CR && |
415 | code->newline_convention != PCRE2_NEWLINE_CR && |
416 | code->newline_convention != PCRE2_NEWLINE_LF && |
417 | start_offset < length && |
418 | subject[start_offset] == CHAR_LF) |
419 | start_offset++; |
420 | |
421 | /* Otherwise, in UTF mode, advance past any secondary code points. */ |
422 | |
423 | else if ((code->overall_options & PCRE2_UTF) != 0) |
424 | { |
425 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
426 | while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80) |
427 | start_offset++; |
428 | #elif PCRE2_CODE_UNIT_WIDTH == 16 |
429 | while (start_offset < length && |
430 | (subject[start_offset] & 0xfc00) == 0xdc00) |
431 | start_offset++; |
432 | #endif |
433 | } |
434 | |
435 | /* Copy what we have advanced past (unless not required), reset the special |
436 | global options, and continue to the next match. */ |
437 | |
438 | fraglength = start_offset - save_start; |
439 | if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength); |
440 | goptions = 0; |
441 | continue; |
442 | } |
443 | |
444 | /* Handle a successful match. Matches that use \K to end before they start |
445 | or start before the current point in the subject are not supported. */ |
446 | |
447 | if (ovector[1] < ovector[0] || ovector[0] < start_offset) |
448 | { |
449 | rc = PCRE2_ERROR_BADSUBSPATTERN; |
450 | goto EXIT; |
451 | } |
452 | |
453 | /* Check for the same match as previous. This is legitimate after matching an |
454 | empty string that starts after the initial match offset. We have tried again |
455 | at the match point in case the pattern is one like /(?<=\G.)/ which can never |
456 | match at its starting point, so running the match achieves the bumpalong. If |
457 | we do get the same (null) match at the original match point, it isn't such a |
458 | pattern, so we now do the empty string magic. In all other cases, a repeat |
459 | match should never occur. */ |
460 | |
461 | if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1]) |
462 | { |
463 | if (ovector[0] == ovector[1] && ovecsave[2] != start_offset) |
464 | { |
465 | goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; |
466 | ovecsave[2] = start_offset; |
467 | continue; /* Back to the top of the loop */ |
468 | } |
469 | rc = PCRE2_ERROR_INTERNAL_DUPMATCH; |
470 | goto EXIT; |
471 | } |
472 | |
473 | /* Count substitutions with a paranoid check for integer overflow; surely no |
474 | real call to this function would ever hit this! */ |
475 | |
476 | if (subs == INT_MAX) |
477 | { |
478 | rc = PCRE2_ERROR_TOOMANYREPLACE; |
479 | goto EXIT; |
480 | } |
481 | subs++; |
482 | |
483 | /* Copy the text leading up to the match (unless not required), and remember |
484 | where the insert begins and how many ovector pairs are set. */ |
485 | |
486 | if (rc == 0) rc = ovector_count; |
487 | fraglength = ovector[0] - start_offset; |
488 | if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength); |
489 | scb.output_offsets[0] = buff_offset; |
490 | scb.oveccount = rc; |
491 | |
492 | /* Process the replacement string. If the entire replacement is literal, just |
493 | copy it with length check. */ |
494 | |
495 | ptr = replacement; |
496 | if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0) |
497 | { |
498 | CHECKMEMCPY(ptr, rlength); |
499 | } |
500 | |
501 | /* Within a non-literal replacement, which must be scanned character by |
502 | character, local literal mode can be set by \Q, but only in extended mode |
503 | when backslashes are being interpreted. In extended mode we must handle |
504 | nested substrings that are to be reprocessed. */ |
505 | |
506 | else for (;;) |
507 | { |
508 | uint32_t ch; |
509 | unsigned int chlen; |
510 | |
511 | /* If at the end of a nested substring, pop the stack. */ |
512 | |
513 | if (ptr >= repend) |
514 | { |
515 | if (ptrstackptr == 0) break; /* End of replacement string */ |
516 | repend = ptrstack[--ptrstackptr]; |
517 | ptr = ptrstack[--ptrstackptr]; |
518 | continue; |
519 | } |
520 | |
521 | /* Handle the next character */ |
522 | |
523 | if (escaped_literal) |
524 | { |
525 | if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E) |
526 | { |
527 | escaped_literal = FALSE; |
528 | ptr += 2; |
529 | continue; |
530 | } |
531 | goto LOADLITERAL; |
532 | } |
533 | |
534 | /* Not in literal mode. */ |
535 | |
536 | if (*ptr == CHAR_DOLLAR_SIGN) |
537 | { |
538 | int group, n; |
539 | uint32_t special = 0; |
540 | BOOL inparens; |
541 | BOOL star; |
542 | PCRE2_SIZE sublength; |
543 | PCRE2_SPTR text1_start = NULL; |
544 | PCRE2_SPTR text1_end = NULL; |
545 | PCRE2_SPTR text2_start = NULL; |
546 | PCRE2_SPTR text2_end = NULL; |
547 | PCRE2_UCHAR next; |
548 | PCRE2_UCHAR name[33]; |
549 | |
550 | if (++ptr >= repend) goto BAD; |
551 | if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL; |
552 | |
553 | group = -1; |
554 | n = 0; |
555 | inparens = FALSE; |
556 | star = FALSE; |
557 | |
558 | if (next == CHAR_LEFT_CURLY_BRACKET) |
559 | { |
560 | if (++ptr >= repend) goto BAD; |
561 | next = *ptr; |
562 | inparens = TRUE; |
563 | } |
564 | |
565 | if (next == CHAR_ASTERISK) |
566 | { |
567 | if (++ptr >= repend) goto BAD; |
568 | next = *ptr; |
569 | star = TRUE; |
570 | } |
571 | |
572 | if (!star && next >= CHAR_0 && next <= CHAR_9) |
573 | { |
574 | group = next - CHAR_0; |
575 | while (++ptr < repend) |
576 | { |
577 | next = *ptr; |
578 | if (next < CHAR_0 || next > CHAR_9) break; |
579 | group = group * 10 + next - CHAR_0; |
580 | |
581 | /* A check for a number greater than the hightest captured group |
582 | is sufficient here; no need for a separate overflow check. If unknown |
583 | groups are to be treated as unset, just skip over any remaining |
584 | digits and carry on. */ |
585 | |
586 | if (group > code->top_bracket) |
587 | { |
588 | if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) |
589 | { |
590 | while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9); |
591 | break; |
592 | } |
593 | else |
594 | { |
595 | rc = PCRE2_ERROR_NOSUBSTRING; |
596 | goto PTREXIT; |
597 | } |
598 | } |
599 | } |
600 | } |
601 | else |
602 | { |
603 | const uint8_t *ctypes = code->tables + ctypes_offset; |
604 | while (MAX_255(next) && (ctypes[next] & ctype_word) != 0) |
605 | { |
606 | name[n++] = next; |
607 | if (n > 32) goto BAD; |
608 | if (++ptr >= repend) break; |
609 | next = *ptr; |
610 | } |
611 | if (n == 0) goto BAD; |
612 | name[n] = 0; |
613 | } |
614 | |
615 | /* In extended mode we recognize ${name:+set text:unset text} and |
616 | ${name:-default text}. */ |
617 | |
618 | if (inparens) |
619 | { |
620 | if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 && |
621 | !star && ptr < repend - 2 && next == CHAR_COLON) |
622 | { |
623 | special = *(++ptr); |
624 | if (special != CHAR_PLUS && special != CHAR_MINUS) |
625 | { |
626 | rc = PCRE2_ERROR_BADSUBSTITUTION; |
627 | goto PTREXIT; |
628 | } |
629 | |
630 | text1_start = ++ptr; |
631 | rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS); |
632 | if (rc != 0) goto PTREXIT; |
633 | text1_end = ptr; |
634 | |
635 | if (special == CHAR_PLUS && *ptr == CHAR_COLON) |
636 | { |
637 | text2_start = ++ptr; |
638 | rc = find_text_end(code, &ptr, repend, TRUE); |
639 | if (rc != 0) goto PTREXIT; |
640 | text2_end = ptr; |
641 | } |
642 | } |
643 | |
644 | else |
645 | { |
646 | if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET) |
647 | { |
648 | rc = PCRE2_ERROR_REPMISSINGBRACE; |
649 | goto PTREXIT; |
650 | } |
651 | } |
652 | |
653 | ptr++; |
654 | } |
655 | |
656 | /* Have found a syntactically correct group number or name, or *name. |
657 | Only *MARK is currently recognized. */ |
658 | |
659 | if (star) |
660 | { |
661 | if (PRIV(strcmp_c8)(name, STRING_MARK) == 0) |
662 | { |
663 | PCRE2_SPTR mark = pcre2_get_mark(match_data); |
664 | if (mark != NULL) |
665 | { |
666 | PCRE2_SPTR mark_start = mark; |
667 | while (*mark != 0) mark++; |
668 | fraglength = mark - mark_start; |
669 | CHECKMEMCPY(mark_start, fraglength); |
670 | } |
671 | } |
672 | else goto BAD; |
673 | } |
674 | |
675 | /* Substitute the contents of a group. We don't use substring_copy |
676 | functions any more, in order to support case forcing. */ |
677 | |
678 | else |
679 | { |
680 | PCRE2_SPTR subptr, subptrend; |
681 | |
682 | /* Find a number for a named group. In case there are duplicate names, |
683 | search for the first one that is set. If the name is not found when |
684 | PCRE2_SUBSTITUTE_UNKNOWN_EMPTY is set, set the group number to a |
685 | non-existent group. */ |
686 | |
687 | if (group < 0) |
688 | { |
689 | PCRE2_SPTR first, last, entry; |
690 | rc = pcre2_substring_nametable_scan(code, name, &first, &last); |
691 | if (rc == PCRE2_ERROR_NOSUBSTRING && |
692 | (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) |
693 | { |
694 | group = code->top_bracket + 1; |
695 | } |
696 | else |
697 | { |
698 | if (rc < 0) goto PTREXIT; |
699 | for (entry = first; entry <= last; entry += rc) |
700 | { |
701 | uint32_t ng = GET2(entry, 0); |
702 | if (ng < ovector_count) |
703 | { |
704 | if (group < 0) group = ng; /* First in ovector */ |
705 | if (ovector[ng*2] != PCRE2_UNSET) |
706 | { |
707 | group = ng; /* First that is set */ |
708 | break; |
709 | } |
710 | } |
711 | } |
712 | |
713 | /* If group is still negative, it means we did not find a group |
714 | that is in the ovector. Just set the first group. */ |
715 | |
716 | if (group < 0) group = GET2(first, 0); |
717 | } |
718 | } |
719 | |
720 | /* We now have a group that is identified by number. Find the length of |
721 | the captured string. If a group in a non-special substitution is unset |
722 | when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */ |
723 | |
724 | rc = pcre2_substring_length_bynumber(match_data, group, &sublength); |
725 | if (rc < 0) |
726 | { |
727 | if (rc == PCRE2_ERROR_NOSUBSTRING && |
728 | (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) |
729 | { |
730 | rc = PCRE2_ERROR_UNSET; |
731 | } |
732 | if (rc != PCRE2_ERROR_UNSET) goto PTREXIT; /* Non-unset errors */ |
733 | if (special == 0) /* Plain substitution */ |
734 | { |
735 | if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue; |
736 | goto PTREXIT; /* Else error */ |
737 | } |
738 | } |
739 | |
740 | /* If special is '+' we have a 'set' and possibly an 'unset' text, |
741 | both of which are reprocessed when used. If special is '-' we have a |
742 | default text for when the group is unset; it must be reprocessed. */ |
743 | |
744 | if (special != 0) |
745 | { |
746 | if (special == CHAR_MINUS) |
747 | { |
748 | if (rc == 0) goto LITERAL_SUBSTITUTE; |
749 | text2_start = text1_start; |
750 | text2_end = text1_end; |
751 | } |
752 | |
753 | if (ptrstackptr >= PTR_STACK_SIZE) goto BAD; |
754 | ptrstack[ptrstackptr++] = ptr; |
755 | ptrstack[ptrstackptr++] = repend; |
756 | |
757 | if (rc == 0) |
758 | { |
759 | ptr = text1_start; |
760 | repend = text1_end; |
761 | } |
762 | else |
763 | { |
764 | ptr = text2_start; |
765 | repend = text2_end; |
766 | } |
767 | continue; |
768 | } |
769 | |
770 | /* Otherwise we have a literal substitution of a group's contents. */ |
771 | |
772 | LITERAL_SUBSTITUTE: |
773 | subptr = subject + ovector[group*2]; |
774 | subptrend = subject + ovector[group*2 + 1]; |
775 | |
776 | /* Substitute a literal string, possibly forcing alphabetic case. */ |
777 | |
778 | while (subptr < subptrend) |
779 | { |
780 | GETCHARINCTEST(ch, subptr); |
781 | if (forcecase != 0) |
782 | { |
783 | #ifdef SUPPORT_UNICODE |
784 | if (utf || ucp) |
785 | { |
786 | uint32_t type = UCD_CHARTYPE(ch); |
787 | if (PRIV(ucp_gentype)[type] == ucp_L && |
788 | type != ((forcecase > 0)? ucp_Lu : ucp_Ll)) |
789 | ch = UCD_OTHERCASE(ch); |
790 | } |
791 | else |
792 | #endif |
793 | { |
794 | if (((code->tables + cbits_offset + |
795 | ((forcecase > 0)? cbit_upper:cbit_lower) |
796 | )[ch/8] & (1u << (ch%8))) == 0) |
797 | ch = (code->tables + fcc_offset)[ch]; |
798 | } |
799 | forcecase = forcecasereset; |
800 | } |
801 | |
802 | #ifdef SUPPORT_UNICODE |
803 | if (utf) chlen = PRIV(ord2utf)(ch, temp); else |
804 | #endif |
805 | { |
806 | temp[0] = ch; |
807 | chlen = 1; |
808 | } |
809 | CHECKMEMCPY(temp, chlen); |
810 | } |
811 | } |
812 | } |
813 | |
814 | /* Handle an escape sequence in extended mode. We can use check_escape() |
815 | to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but |
816 | the case-forcing escapes are not supported in pcre2_compile() so must be |
817 | recognized here. */ |
818 | |
819 | else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 && |
820 | *ptr == CHAR_BACKSLASH) |
821 | { |
822 | int errorcode; |
823 | |
824 | if (ptr < repend - 1) switch (ptr[1]) |
825 | { |
826 | case CHAR_L: |
827 | forcecase = forcecasereset = -1; |
828 | ptr += 2; |
829 | continue; |
830 | |
831 | case CHAR_l: |
832 | forcecase = -1; |
833 | forcecasereset = 0; |
834 | ptr += 2; |
835 | continue; |
836 | |
837 | case CHAR_U: |
838 | forcecase = forcecasereset = 1; |
839 | ptr += 2; |
840 | continue; |
841 | |
842 | case CHAR_u: |
843 | forcecase = 1; |
844 | forcecasereset = 0; |
845 | ptr += 2; |
846 | continue; |
847 | |
848 | default: |
849 | break; |
850 | } |
851 | |
852 | ptr++; /* Point after \ */ |
853 | rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode, |
854 | code->overall_options, code->extra_options, FALSE, NULL); |
855 | if (errorcode != 0) goto BADESCAPE; |
856 | |
857 | switch(rc) |
858 | { |
859 | case ESC_E: |
860 | forcecase = forcecasereset = 0; |
861 | continue; |
862 | |
863 | case ESC_Q: |
864 | escaped_literal = TRUE; |
865 | continue; |
866 | |
867 | case 0: /* Data character */ |
868 | goto LITERAL; |
869 | |
870 | default: |
871 | goto BADESCAPE; |
872 | } |
873 | } |
874 | |
875 | /* Handle a literal code unit */ |
876 | |
877 | else |
878 | { |
879 | LOADLITERAL: |
880 | GETCHARINCTEST(ch, ptr); /* Get character value, increment pointer */ |
881 | |
882 | LITERAL: |
883 | if (forcecase != 0) |
884 | { |
885 | #ifdef SUPPORT_UNICODE |
886 | if (utf || ucp) |
887 | { |
888 | uint32_t type = UCD_CHARTYPE(ch); |
889 | if (PRIV(ucp_gentype)[type] == ucp_L && |
890 | type != ((forcecase > 0)? ucp_Lu : ucp_Ll)) |
891 | ch = UCD_OTHERCASE(ch); |
892 | } |
893 | else |
894 | #endif |
895 | { |
896 | if (((code->tables + cbits_offset + |
897 | ((forcecase > 0)? cbit_upper:cbit_lower) |
898 | )[ch/8] & (1u << (ch%8))) == 0) |
899 | ch = (code->tables + fcc_offset)[ch]; |
900 | } |
901 | forcecase = forcecasereset; |
902 | } |
903 | |
904 | #ifdef SUPPORT_UNICODE |
905 | if (utf) chlen = PRIV(ord2utf)(ch, temp); else |
906 | #endif |
907 | { |
908 | temp[0] = ch; |
909 | chlen = 1; |
910 | } |
911 | CHECKMEMCPY(temp, chlen); |
912 | } /* End handling a literal code unit */ |
913 | } /* End of loop for scanning the replacement. */ |
914 | |
915 | /* The replacement has been copied to the output, or its size has been |
916 | remembered. Do the callout if there is one and we have done an actual |
917 | replacement. */ |
918 | |
919 | if (!overflowed && mcontext != NULL && mcontext->substitute_callout != NULL) |
920 | { |
921 | scb.subscount = subs; |
922 | scb.output_offsets[1] = buff_offset; |
923 | rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data); |
924 | |
925 | /* A non-zero return means cancel this substitution. Instead, copy the |
926 | matched string fragment. */ |
927 | |
928 | if (rc != 0) |
929 | { |
930 | PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0]; |
931 | PCRE2_SIZE oldlength = ovector[1] - ovector[0]; |
932 | |
933 | buff_offset -= newlength; |
934 | lengthleft += newlength; |
935 | if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength); |
936 | |
937 | /* A negative return means do not do any more. */ |
938 | |
939 | if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL); |
940 | } |
941 | } |
942 | |
943 | /* Save the details of this match. See above for how this data is used. If we |
944 | matched an empty string, do the magic for global matches. Update the start |
945 | offset to point to the rest of the subject string. If we re-used an existing |
946 | match for the first match, switch to the internal match data block. */ |
947 | |
948 | ovecsave[0] = ovector[0]; |
949 | ovecsave[1] = ovector[1]; |
950 | ovecsave[2] = start_offset; |
951 | |
952 | goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 : |
953 | PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART; |
954 | start_offset = ovector[1]; |
955 | } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */ |
956 | |
957 | /* Copy the rest of the subject unless not required, and terminate the output |
958 | with a binary zero. */ |
959 | |
960 | if (!replacement_only) |
961 | { |
962 | fraglength = length - start_offset; |
963 | CHECKMEMCPY(subject + start_offset, fraglength); |
964 | } |
965 | |
966 | temp[0] = 0; |
967 | CHECKMEMCPY(temp, 1); |
968 | |
969 | /* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set, |
970 | and matching has carried on after a full buffer, in order to compute the length |
971 | needed. Otherwise, an overflow generates an immediate error return. */ |
972 | |
973 | if (overflowed) |
974 | { |
975 | rc = PCRE2_ERROR_NOMEMORY; |
976 | *blength = buff_length + extra_needed; |
977 | } |
978 | |
979 | /* After a successful execution, return the number of substitutions and set the |
980 | length of buffer used, excluding the trailing zero. */ |
981 | |
982 | else |
983 | { |
984 | rc = subs; |
985 | *blength = buff_offset - 1; |
986 | } |
987 | |
988 | EXIT: |
989 | if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data); |
990 | else match_data->rc = rc; |
991 | return rc; |
992 | |
993 | NOROOM: |
994 | rc = PCRE2_ERROR_NOMEMORY; |
995 | goto EXIT; |
996 | |
997 | BAD: |
998 | rc = PCRE2_ERROR_BADREPLACEMENT; |
999 | goto PTREXIT; |
1000 | |
1001 | BADESCAPE: |
1002 | rc = PCRE2_ERROR_BADREPESCAPE; |
1003 | |
1004 | PTREXIT: |
1005 | *blength = (PCRE2_SIZE)(ptr - replacement); |
1006 | goto EXIT; |
1007 | } |
1008 | |
1009 | /* End of pcre2_substitute.c */ |
1010 | |