1 | /************************************************* |
2 | * Perl-Compatible Regular Expressions * |
3 | *************************************************/ |
4 | |
5 | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | and semantics are as close as possible to those of the Perl 5 language. |
7 | |
8 | Written by Philip Hazel |
9 | Original API code Copyright (c) 1997-2012 University of Cambridge |
10 | New API code Copyright (c) 2016-2020 University of Cambridge |
11 | |
12 | ----------------------------------------------------------------------------- |
13 | Redistribution and use in source and binary forms, with or without |
14 | modification, are permitted provided that the following conditions are met: |
15 | |
16 | * Redistributions of source code must retain the above copyright notice, |
17 | this list of conditions and the following disclaimer. |
18 | |
19 | * Redistributions in binary form must reproduce the above copyright |
20 | notice, this list of conditions and the following disclaimer in the |
21 | documentation and/or other materials provided with the distribution. |
22 | |
23 | * Neither the name of the University of Cambridge nor the names of its |
24 | contributors may be used to endorse or promote products derived from |
25 | this software without specific prior written permission. |
26 | |
27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | POSSIBILITY OF SUCH DAMAGE. |
38 | ----------------------------------------------------------------------------- |
39 | */ |
40 | |
41 | |
42 | #ifdef HAVE_CONFIG_H |
43 | #include "config.h" |
44 | #endif |
45 | |
46 | #include "pcre2_internal.h" |
47 | |
/* Number of slots in the pointer stack used while reprocessing nested
replacement texts. Each nesting level pushes two pointers (the resume point and
the saved end of the replacement), so this allows PTR_STACK_SIZE/2 levels. */

#define PTR_STACK_SIZE 20

/* All the option bits that belong to pcre2_substitute() itself. They are saved
separately and masked out of the options before pcre2_match() is called. */

#define SUBSTITUTE_OPTIONS \
  ( PCRE2_SUBSTITUTE_EXTENDED         \
  | PCRE2_SUBSTITUTE_GLOBAL           \
  | PCRE2_SUBSTITUTE_LITERAL          \
  | PCRE2_SUBSTITUTE_MATCHED          \
  | PCRE2_SUBSTITUTE_OVERFLOW_LENGTH  \
  | PCRE2_SUBSTITUTE_REPLACEMENT_ONLY \
  | PCRE2_SUBSTITUTE_UNKNOWN_UNSET    \
  | PCRE2_SUBSTITUTE_UNSET_EMPTY      \
  )
55 | |
56 | |
57 | |
58 | /************************************************* |
59 | * Find end of substitute text * |
60 | *************************************************/ |
61 | |
62 | /* In extended mode, we recognize ${name:+set text:unset text} and similar |
63 | constructions. This requires the identification of unescaped : and } |
64 | characters. This function scans for such. It must deal with nested ${ |
65 | constructions. The pointer to the text is updated, either to the required end |
66 | character, or to where an error was detected. |
67 | |
68 | Arguments: |
69 | code points to the compiled expression (for options) |
70 | ptrptr points to the pointer to the start of the text (updated) |
71 | ptrend end of the whole string |
72 | last TRUE if the last expected string (only } recognized) |
73 | |
74 | Returns: 0 on success |
75 | negative error code on failure |
76 | */ |
77 | |
78 | static int |
79 | find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, |
80 | BOOL last) |
81 | { |
82 | int rc = 0; |
83 | uint32_t nestlevel = 0; |
84 | BOOL literal = FALSE; |
85 | PCRE2_SPTR ptr = *ptrptr; |
86 | |
87 | for (; ptr < ptrend; ptr++) |
88 | { |
89 | if (literal) |
90 | { |
91 | if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E) |
92 | { |
93 | literal = FALSE; |
94 | ptr += 1; |
95 | } |
96 | } |
97 | |
98 | else if (*ptr == CHAR_RIGHT_CURLY_BRACKET) |
99 | { |
100 | if (nestlevel == 0) goto EXIT; |
101 | nestlevel--; |
102 | } |
103 | |
104 | else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT; |
105 | |
106 | else if (*ptr == CHAR_DOLLAR_SIGN) |
107 | { |
108 | if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET) |
109 | { |
110 | nestlevel++; |
111 | ptr += 1; |
112 | } |
113 | } |
114 | |
115 | else if (*ptr == CHAR_BACKSLASH) |
116 | { |
117 | int erc; |
118 | int errorcode; |
119 | uint32_t ch; |
120 | |
121 | if (ptr < ptrend - 1) switch (ptr[1]) |
122 | { |
123 | case CHAR_L: |
124 | case CHAR_l: |
125 | case CHAR_U: |
126 | case CHAR_u: |
127 | ptr += 1; |
128 | continue; |
129 | } |
130 | |
131 | ptr += 1; /* Must point after \ */ |
132 | erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode, |
133 | code->overall_options, code->extra_options, FALSE, NULL); |
134 | ptr -= 1; /* Back to last code unit of escape */ |
135 | if (errorcode != 0) |
136 | { |
137 | rc = errorcode; |
138 | goto EXIT; |
139 | } |
140 | |
141 | switch(erc) |
142 | { |
143 | case 0: /* Data character */ |
144 | case ESC_E: /* Isolated \E is ignored */ |
145 | break; |
146 | |
147 | case ESC_Q: |
148 | literal = TRUE; |
149 | break; |
150 | |
151 | default: |
152 | rc = PCRE2_ERROR_BADREPESCAPE; |
153 | goto EXIT; |
154 | } |
155 | } |
156 | } |
157 | |
158 | rc = PCRE2_ERROR_REPMISSINGBRACE; /* Terminator not found */ |
159 | |
160 | EXIT: |
161 | *ptrptr = ptr; |
162 | return rc; |
163 | } |
164 | |
165 | |
166 | |
167 | /************************************************* |
168 | * Match and substitute * |
169 | *************************************************/ |
170 | |
171 | /* This function applies a compiled re to a subject string and creates a new |
172 | string with substitutions. The first 7 arguments are the same as for |
173 | pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED. |
174 | |
175 | Arguments: |
176 | code points to the compiled expression |
177 | subject points to the subject string |
178 | length length of subject string (may contain binary zeros) |
179 | start_offset where to start in the subject string |
180 | options option bits |
181 | match_data points to a match_data block, or is NULL |
  context         points to a PCRE2 context
183 | replacement points to the replacement string |
184 | rlength length of replacement string |
185 | buffer where to put the substituted string |
186 | blength points to length of buffer; updated to length of string |
187 | |
188 | Returns: >= 0 number of substitutions made |
189 | < 0 an error code |
190 | PCRE2_ERROR_BADREPLACEMENT means invalid use of $ |
191 | */ |
192 | |
/* This macro checks for space in the buffer before copying into it. On
overflow, either give an error immediately (jump to NOROOM), or, when
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set, keep on, accumulating in extra_needed
the number of additional code units that would have been required.

The macro relies on these names being in scope where it is used: buffer,
buff_offset, lengthleft, extra_needed, overflowed, suboptions, and the label
NOROOM. The "length" argument is evaluated more than once, so it must not have
side effects. The arguments are parenthesized and the body is wrapped in
do ... while (0) so that any expression may be passed and so that the macro
behaves as a single statement (for example, before an "else"). */

#define CHECKMEMCPY(from,length) \
  do \
    { \
    if (!overflowed && lengthleft < (length)) \
      { \
      if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
      overflowed = TRUE; \
      extra_needed = (length) - lengthleft; \
      } \
    else if (overflowed) \
      { \
      extra_needed += (length); \
      } \
    else \
      { \
      memcpy(buffer + buff_offset, (from), CU2BYTES(length)); \
      buff_offset += (length); \
      lengthleft -= (length); \
      } \
    } \
  while (0)
216 | |
217 | /* Here's the function */ |
218 | |
219 | PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION |
220 | pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, |
221 | PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, |
222 | pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength, |
223 | PCRE2_UCHAR *buffer, PCRE2_SIZE *blength) |
224 | { |
225 | int rc; |
226 | int subs; |
227 | int forcecase = 0; |
228 | int forcecasereset = 0; |
229 | uint32_t ovector_count; |
230 | uint32_t goptions = 0; |
231 | uint32_t suboptions; |
232 | pcre2_match_data *internal_match_data = NULL; |
233 | BOOL escaped_literal = FALSE; |
234 | BOOL overflowed = FALSE; |
235 | BOOL use_existing_match; |
236 | BOOL replacement_only; |
237 | #ifdef SUPPORT_UNICODE |
238 | BOOL utf = (code->overall_options & PCRE2_UTF) != 0; |
239 | BOOL ucp = (code->overall_options & PCRE2_UCP) != 0; |
240 | #endif |
241 | PCRE2_UCHAR temp[6]; |
242 | PCRE2_SPTR ptr; |
243 | PCRE2_SPTR repend; |
244 | PCRE2_SIZE = 0; |
245 | PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength; |
246 | PCRE2_SIZE *ovector; |
247 | PCRE2_SIZE ovecsave[3]; |
248 | pcre2_substitute_callout_block scb; |
249 | |
250 | /* General initialization */ |
251 | |
252 | buff_offset = 0; |
253 | lengthleft = buff_length = *blength; |
254 | *blength = PCRE2_UNSET; |
255 | ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET; |
256 | |
257 | /* Partial matching is not valid. This must come after setting *blength to |
258 | PCRE2_UNSET, so as not to imply an offset in the replacement. */ |
259 | |
260 | if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0) |
261 | return PCRE2_ERROR_BADOPTION; |
262 | |
263 | /* Check for using a match that has already happened. Note that the subject |
264 | pointer in the match data may be NULL after a no-match. */ |
265 | |
266 | use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0); |
267 | replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0); |
268 | |
269 | /* If starting from an existing match, there must be an externally provided |
270 | match data block. We create an internal match_data block in two cases: (a) an |
271 | external one is not supplied (and we are not starting from an existing match); |
272 | (b) an existing match is to be used for the first substitution. In the latter |
273 | case, we copy the existing match into the internal block. This ensures that no |
274 | changes are made to the existing match data block. */ |
275 | |
276 | if (match_data == NULL) |
277 | { |
278 | pcre2_general_context *gcontext; |
279 | if (use_existing_match) return PCRE2_ERROR_NULL; |
280 | gcontext = (mcontext == NULL)? |
281 | (pcre2_general_context *)code : |
282 | (pcre2_general_context *)mcontext; |
283 | match_data = internal_match_data = |
284 | pcre2_match_data_create_from_pattern(code, gcontext); |
285 | if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY; |
286 | } |
287 | |
288 | else if (use_existing_match) |
289 | { |
290 | pcre2_general_context *gcontext = (mcontext == NULL)? |
291 | (pcre2_general_context *)code : |
292 | (pcre2_general_context *)mcontext; |
293 | int pairs = (code->top_bracket + 1 < match_data->oveccount)? |
294 | code->top_bracket + 1 : match_data->oveccount; |
295 | internal_match_data = pcre2_match_data_create(match_data->oveccount, |
296 | gcontext); |
297 | if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY; |
298 | memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector) |
299 | + 2*pairs*sizeof(PCRE2_SIZE)); |
300 | match_data = internal_match_data; |
301 | } |
302 | |
303 | /* Remember ovector details */ |
304 | |
305 | ovector = pcre2_get_ovector_pointer(match_data); |
306 | ovector_count = pcre2_get_ovector_count(match_data); |
307 | |
308 | /* Fixed things in the callout block */ |
309 | |
310 | scb.version = 0; |
311 | scb.input = subject; |
312 | scb.output = (PCRE2_SPTR)buffer; |
313 | scb.ovector = ovector; |
314 | |
315 | /* Find lengths of zero-terminated strings and the end of the replacement. */ |
316 | |
317 | if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject); |
318 | if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement); |
319 | repend = replacement + rlength; |
320 | |
321 | /* Check UTF replacement string if necessary. */ |
322 | |
323 | #ifdef SUPPORT_UNICODE |
324 | if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) |
325 | { |
326 | rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar)); |
327 | if (rc != 0) |
328 | { |
329 | match_data->leftchar = 0; |
330 | goto EXIT; |
331 | } |
332 | } |
333 | #endif /* SUPPORT_UNICODE */ |
334 | |
335 | /* Save the substitute options and remove them from the match options. */ |
336 | |
337 | suboptions = options & SUBSTITUTE_OPTIONS; |
338 | options &= ~SUBSTITUTE_OPTIONS; |
339 | |
340 | /* Error if the start match offset is greater than the length of the subject. */ |
341 | |
342 | if (start_offset > length) |
343 | { |
344 | match_data->leftchar = 0; |
345 | rc = PCRE2_ERROR_BADOFFSET; |
346 | goto EXIT; |
347 | } |
348 | |
349 | /* Copy up to the start offset, unless only the replacement is required. */ |
350 | |
351 | if (!replacement_only) CHECKMEMCPY(subject, start_offset); |
352 | |
353 | /* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first |
354 | match is taken from the match_data that was passed in. */ |
355 | |
356 | subs = 0; |
357 | do |
358 | { |
359 | PCRE2_SPTR ptrstack[PTR_STACK_SIZE]; |
360 | uint32_t ptrstackptr = 0; |
361 | |
362 | if (use_existing_match) |
363 | { |
364 | rc = match_data->rc; |
365 | use_existing_match = FALSE; |
366 | } |
367 | else rc = pcre2_match(code, subject, length, start_offset, options|goptions, |
368 | match_data, mcontext); |
369 | |
370 | #ifdef SUPPORT_UNICODE |
371 | if (utf) options |= PCRE2_NO_UTF_CHECK; /* Only need to check once */ |
372 | #endif |
373 | |
374 | /* Any error other than no match returns the error code. No match when not |
375 | doing the special after-empty-match global rematch, or when at the end of the |
376 | subject, breaks the global loop. Otherwise, advance the starting point by one |
377 | character, copying it to the output, and try again. */ |
378 | |
379 | if (rc < 0) |
380 | { |
381 | PCRE2_SIZE save_start; |
382 | |
383 | if (rc != PCRE2_ERROR_NOMATCH) goto EXIT; |
384 | if (goptions == 0 || start_offset >= length) break; |
385 | |
386 | /* Advance by one code point. Then, if CRLF is a valid newline sequence and |
387 | we have advanced into the middle of it, advance one more code point. In |
388 | other words, do not start in the middle of CRLF, even if CR and LF on their |
389 | own are valid newlines. */ |
390 | |
391 | save_start = start_offset++; |
392 | if (subject[start_offset-1] == CHAR_CR && |
393 | code->newline_convention != PCRE2_NEWLINE_CR && |
394 | code->newline_convention != PCRE2_NEWLINE_LF && |
395 | start_offset < length && |
396 | subject[start_offset] == CHAR_LF) |
397 | start_offset++; |
398 | |
399 | /* Otherwise, in UTF mode, advance past any secondary code points. */ |
400 | |
401 | else if ((code->overall_options & PCRE2_UTF) != 0) |
402 | { |
403 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
404 | while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80) |
405 | start_offset++; |
406 | #elif PCRE2_CODE_UNIT_WIDTH == 16 |
407 | while (start_offset < length && |
408 | (subject[start_offset] & 0xfc00) == 0xdc00) |
409 | start_offset++; |
410 | #endif |
411 | } |
412 | |
413 | /* Copy what we have advanced past (unless not required), reset the special |
414 | global options, and continue to the next match. */ |
415 | |
416 | fraglength = start_offset - save_start; |
417 | if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength); |
418 | goptions = 0; |
419 | continue; |
420 | } |
421 | |
422 | /* Handle a successful match. Matches that use \K to end before they start |
423 | or start before the current point in the subject are not supported. */ |
424 | |
425 | if (ovector[1] < ovector[0] || ovector[0] < start_offset) |
426 | { |
427 | rc = PCRE2_ERROR_BADSUBSPATTERN; |
428 | goto EXIT; |
429 | } |
430 | |
431 | /* Check for the same match as previous. This is legitimate after matching an |
432 | empty string that starts after the initial match offset. We have tried again |
433 | at the match point in case the pattern is one like /(?<=\G.)/ which can never |
434 | match at its starting point, so running the match achieves the bumpalong. If |
435 | we do get the same (null) match at the original match point, it isn't such a |
436 | pattern, so we now do the empty string magic. In all other cases, a repeat |
437 | match should never occur. */ |
438 | |
439 | if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1]) |
440 | { |
441 | if (ovector[0] == ovector[1] && ovecsave[2] != start_offset) |
442 | { |
443 | goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; |
444 | ovecsave[2] = start_offset; |
445 | continue; /* Back to the top of the loop */ |
446 | } |
447 | rc = PCRE2_ERROR_INTERNAL_DUPMATCH; |
448 | goto EXIT; |
449 | } |
450 | |
451 | /* Count substitutions with a paranoid check for integer overflow; surely no |
452 | real call to this function would ever hit this! */ |
453 | |
454 | if (subs == INT_MAX) |
455 | { |
456 | rc = PCRE2_ERROR_TOOMANYREPLACE; |
457 | goto EXIT; |
458 | } |
459 | subs++; |
460 | |
461 | /* Copy the text leading up to the match (unless not required), and remember |
462 | where the insert begins and how many ovector pairs are set. */ |
463 | |
464 | if (rc == 0) rc = ovector_count; |
465 | fraglength = ovector[0] - start_offset; |
466 | if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength); |
467 | scb.output_offsets[0] = buff_offset; |
468 | scb.oveccount = rc; |
469 | |
470 | /* Process the replacement string. If the entire replacement is literal, just |
471 | copy it with length check. */ |
472 | |
473 | ptr = replacement; |
474 | if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0) |
475 | { |
476 | CHECKMEMCPY(ptr, rlength); |
477 | } |
478 | |
479 | /* Within a non-literal replacement, which must be scanned character by |
480 | character, local literal mode can be set by \Q, but only in extended mode |
481 | when backslashes are being interpreted. In extended mode we must handle |
482 | nested substrings that are to be reprocessed. */ |
483 | |
484 | else for (;;) |
485 | { |
486 | uint32_t ch; |
487 | unsigned int chlen; |
488 | |
489 | /* If at the end of a nested substring, pop the stack. */ |
490 | |
491 | if (ptr >= repend) |
492 | { |
493 | if (ptrstackptr == 0) break; /* End of replacement string */ |
494 | repend = ptrstack[--ptrstackptr]; |
495 | ptr = ptrstack[--ptrstackptr]; |
496 | continue; |
497 | } |
498 | |
499 | /* Handle the next character */ |
500 | |
501 | if (escaped_literal) |
502 | { |
503 | if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E) |
504 | { |
505 | escaped_literal = FALSE; |
506 | ptr += 2; |
507 | continue; |
508 | } |
509 | goto LOADLITERAL; |
510 | } |
511 | |
512 | /* Not in literal mode. */ |
513 | |
514 | if (*ptr == CHAR_DOLLAR_SIGN) |
515 | { |
516 | int group, n; |
517 | uint32_t special = 0; |
518 | BOOL inparens; |
519 | BOOL star; |
520 | PCRE2_SIZE sublength; |
521 | PCRE2_SPTR text1_start = NULL; |
522 | PCRE2_SPTR text1_end = NULL; |
523 | PCRE2_SPTR text2_start = NULL; |
524 | PCRE2_SPTR text2_end = NULL; |
525 | PCRE2_UCHAR next; |
526 | PCRE2_UCHAR name[33]; |
527 | |
528 | if (++ptr >= repend) goto BAD; |
529 | if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL; |
530 | |
531 | group = -1; |
532 | n = 0; |
533 | inparens = FALSE; |
534 | star = FALSE; |
535 | |
536 | if (next == CHAR_LEFT_CURLY_BRACKET) |
537 | { |
538 | if (++ptr >= repend) goto BAD; |
539 | next = *ptr; |
540 | inparens = TRUE; |
541 | } |
542 | |
543 | if (next == CHAR_ASTERISK) |
544 | { |
545 | if (++ptr >= repend) goto BAD; |
546 | next = *ptr; |
547 | star = TRUE; |
548 | } |
549 | |
550 | if (!star && next >= CHAR_0 && next <= CHAR_9) |
551 | { |
552 | group = next - CHAR_0; |
553 | while (++ptr < repend) |
554 | { |
555 | next = *ptr; |
556 | if (next < CHAR_0 || next > CHAR_9) break; |
557 | group = group * 10 + next - CHAR_0; |
558 | |
559 | /* A check for a number greater than the hightest captured group |
560 | is sufficient here; no need for a separate overflow check. If unknown |
561 | groups are to be treated as unset, just skip over any remaining |
562 | digits and carry on. */ |
563 | |
564 | if (group > code->top_bracket) |
565 | { |
566 | if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) |
567 | { |
568 | while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9); |
569 | break; |
570 | } |
571 | else |
572 | { |
573 | rc = PCRE2_ERROR_NOSUBSTRING; |
574 | goto PTREXIT; |
575 | } |
576 | } |
577 | } |
578 | } |
579 | else |
580 | { |
581 | const uint8_t *ctypes = code->tables + ctypes_offset; |
582 | while (MAX_255(next) && (ctypes[next] & ctype_word) != 0) |
583 | { |
584 | name[n++] = next; |
585 | if (n > 32) goto BAD; |
586 | if (++ptr >= repend) break; |
587 | next = *ptr; |
588 | } |
589 | if (n == 0) goto BAD; |
590 | name[n] = 0; |
591 | } |
592 | |
593 | /* In extended mode we recognize ${name:+set text:unset text} and |
594 | ${name:-default text}. */ |
595 | |
596 | if (inparens) |
597 | { |
598 | if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 && |
599 | !star && ptr < repend - 2 && next == CHAR_COLON) |
600 | { |
601 | special = *(++ptr); |
602 | if (special != CHAR_PLUS && special != CHAR_MINUS) |
603 | { |
604 | rc = PCRE2_ERROR_BADSUBSTITUTION; |
605 | goto PTREXIT; |
606 | } |
607 | |
608 | text1_start = ++ptr; |
609 | rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS); |
610 | if (rc != 0) goto PTREXIT; |
611 | text1_end = ptr; |
612 | |
613 | if (special == CHAR_PLUS && *ptr == CHAR_COLON) |
614 | { |
615 | text2_start = ++ptr; |
616 | rc = find_text_end(code, &ptr, repend, TRUE); |
617 | if (rc != 0) goto PTREXIT; |
618 | text2_end = ptr; |
619 | } |
620 | } |
621 | |
622 | else |
623 | { |
624 | if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET) |
625 | { |
626 | rc = PCRE2_ERROR_REPMISSINGBRACE; |
627 | goto PTREXIT; |
628 | } |
629 | } |
630 | |
631 | ptr++; |
632 | } |
633 | |
634 | /* Have found a syntactically correct group number or name, or *name. |
635 | Only *MARK is currently recognized. */ |
636 | |
637 | if (star) |
638 | { |
639 | if (PRIV(strcmp_c8)(name, STRING_MARK) == 0) |
640 | { |
641 | PCRE2_SPTR mark = pcre2_get_mark(match_data); |
642 | if (mark != NULL) |
643 | { |
644 | PCRE2_SPTR mark_start = mark; |
645 | while (*mark != 0) mark++; |
646 | fraglength = mark - mark_start; |
647 | CHECKMEMCPY(mark_start, fraglength); |
648 | } |
649 | } |
650 | else goto BAD; |
651 | } |
652 | |
653 | /* Substitute the contents of a group. We don't use substring_copy |
654 | functions any more, in order to support case forcing. */ |
655 | |
656 | else |
657 | { |
658 | PCRE2_SPTR subptr, subptrend; |
659 | |
660 | /* Find a number for a named group. In case there are duplicate names, |
661 | search for the first one that is set. If the name is not found when |
662 | PCRE2_SUBSTITUTE_UNKNOWN_EMPTY is set, set the group number to a |
663 | non-existent group. */ |
664 | |
665 | if (group < 0) |
666 | { |
667 | PCRE2_SPTR first, last, entry; |
668 | rc = pcre2_substring_nametable_scan(code, name, &first, &last); |
669 | if (rc == PCRE2_ERROR_NOSUBSTRING && |
670 | (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) |
671 | { |
672 | group = code->top_bracket + 1; |
673 | } |
674 | else |
675 | { |
676 | if (rc < 0) goto PTREXIT; |
677 | for (entry = first; entry <= last; entry += rc) |
678 | { |
679 | uint32_t ng = GET2(entry, 0); |
680 | if (ng < ovector_count) |
681 | { |
682 | if (group < 0) group = ng; /* First in ovector */ |
683 | if (ovector[ng*2] != PCRE2_UNSET) |
684 | { |
685 | group = ng; /* First that is set */ |
686 | break; |
687 | } |
688 | } |
689 | } |
690 | |
691 | /* If group is still negative, it means we did not find a group |
692 | that is in the ovector. Just set the first group. */ |
693 | |
694 | if (group < 0) group = GET2(first, 0); |
695 | } |
696 | } |
697 | |
698 | /* We now have a group that is identified by number. Find the length of |
699 | the captured string. If a group in a non-special substitution is unset |
700 | when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */ |
701 | |
702 | rc = pcre2_substring_length_bynumber(match_data, group, &sublength); |
703 | if (rc < 0) |
704 | { |
705 | if (rc == PCRE2_ERROR_NOSUBSTRING && |
706 | (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) |
707 | { |
708 | rc = PCRE2_ERROR_UNSET; |
709 | } |
710 | if (rc != PCRE2_ERROR_UNSET) goto PTREXIT; /* Non-unset errors */ |
711 | if (special == 0) /* Plain substitution */ |
712 | { |
713 | if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue; |
714 | goto PTREXIT; /* Else error */ |
715 | } |
716 | } |
717 | |
718 | /* If special is '+' we have a 'set' and possibly an 'unset' text, |
719 | both of which are reprocessed when used. If special is '-' we have a |
720 | default text for when the group is unset; it must be reprocessed. */ |
721 | |
722 | if (special != 0) |
723 | { |
724 | if (special == CHAR_MINUS) |
725 | { |
726 | if (rc == 0) goto LITERAL_SUBSTITUTE; |
727 | text2_start = text1_start; |
728 | text2_end = text1_end; |
729 | } |
730 | |
731 | if (ptrstackptr >= PTR_STACK_SIZE) goto BAD; |
732 | ptrstack[ptrstackptr++] = ptr; |
733 | ptrstack[ptrstackptr++] = repend; |
734 | |
735 | if (rc == 0) |
736 | { |
737 | ptr = text1_start; |
738 | repend = text1_end; |
739 | } |
740 | else |
741 | { |
742 | ptr = text2_start; |
743 | repend = text2_end; |
744 | } |
745 | continue; |
746 | } |
747 | |
748 | /* Otherwise we have a literal substitution of a group's contents. */ |
749 | |
750 | LITERAL_SUBSTITUTE: |
751 | subptr = subject + ovector[group*2]; |
752 | subptrend = subject + ovector[group*2 + 1]; |
753 | |
754 | /* Substitute a literal string, possibly forcing alphabetic case. */ |
755 | |
756 | while (subptr < subptrend) |
757 | { |
758 | GETCHARINCTEST(ch, subptr); |
759 | if (forcecase != 0) |
760 | { |
761 | #ifdef SUPPORT_UNICODE |
762 | if (utf || ucp) |
763 | { |
764 | uint32_t type = UCD_CHARTYPE(ch); |
765 | if (PRIV(ucp_gentype)[type] == ucp_L && |
766 | type != ((forcecase > 0)? ucp_Lu : ucp_Ll)) |
767 | ch = UCD_OTHERCASE(ch); |
768 | } |
769 | else |
770 | #endif |
771 | { |
772 | if (((code->tables + cbits_offset + |
773 | ((forcecase > 0)? cbit_upper:cbit_lower) |
774 | )[ch/8] & (1u << (ch%8))) == 0) |
775 | ch = (code->tables + fcc_offset)[ch]; |
776 | } |
777 | forcecase = forcecasereset; |
778 | } |
779 | |
780 | #ifdef SUPPORT_UNICODE |
781 | if (utf) chlen = PRIV(ord2utf)(ch, temp); else |
782 | #endif |
783 | { |
784 | temp[0] = ch; |
785 | chlen = 1; |
786 | } |
787 | CHECKMEMCPY(temp, chlen); |
788 | } |
789 | } |
790 | } |
791 | |
792 | /* Handle an escape sequence in extended mode. We can use check_escape() |
793 | to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but |
794 | the case-forcing escapes are not supported in pcre2_compile() so must be |
795 | recognized here. */ |
796 | |
797 | else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 && |
798 | *ptr == CHAR_BACKSLASH) |
799 | { |
800 | int errorcode; |
801 | |
802 | if (ptr < repend - 1) switch (ptr[1]) |
803 | { |
804 | case CHAR_L: |
805 | forcecase = forcecasereset = -1; |
806 | ptr += 2; |
807 | continue; |
808 | |
809 | case CHAR_l: |
810 | forcecase = -1; |
811 | forcecasereset = 0; |
812 | ptr += 2; |
813 | continue; |
814 | |
815 | case CHAR_U: |
816 | forcecase = forcecasereset = 1; |
817 | ptr += 2; |
818 | continue; |
819 | |
820 | case CHAR_u: |
821 | forcecase = 1; |
822 | forcecasereset = 0; |
823 | ptr += 2; |
824 | continue; |
825 | |
826 | default: |
827 | break; |
828 | } |
829 | |
830 | ptr++; /* Point after \ */ |
831 | rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode, |
832 | code->overall_options, code->extra_options, FALSE, NULL); |
833 | if (errorcode != 0) goto BADESCAPE; |
834 | |
835 | switch(rc) |
836 | { |
837 | case ESC_E: |
838 | forcecase = forcecasereset = 0; |
839 | continue; |
840 | |
841 | case ESC_Q: |
842 | escaped_literal = TRUE; |
843 | continue; |
844 | |
845 | case 0: /* Data character */ |
846 | goto LITERAL; |
847 | |
848 | default: |
849 | goto BADESCAPE; |
850 | } |
851 | } |
852 | |
853 | /* Handle a literal code unit */ |
854 | |
855 | else |
856 | { |
857 | LOADLITERAL: |
858 | GETCHARINCTEST(ch, ptr); /* Get character value, increment pointer */ |
859 | |
860 | LITERAL: |
861 | if (forcecase != 0) |
862 | { |
863 | #ifdef SUPPORT_UNICODE |
864 | if (utf || ucp) |
865 | { |
866 | uint32_t type = UCD_CHARTYPE(ch); |
867 | if (PRIV(ucp_gentype)[type] == ucp_L && |
868 | type != ((forcecase > 0)? ucp_Lu : ucp_Ll)) |
869 | ch = UCD_OTHERCASE(ch); |
870 | } |
871 | else |
872 | #endif |
873 | { |
874 | if (((code->tables + cbits_offset + |
875 | ((forcecase > 0)? cbit_upper:cbit_lower) |
876 | )[ch/8] & (1u << (ch%8))) == 0) |
877 | ch = (code->tables + fcc_offset)[ch]; |
878 | } |
879 | forcecase = forcecasereset; |
880 | } |
881 | |
882 | #ifdef SUPPORT_UNICODE |
883 | if (utf) chlen = PRIV(ord2utf)(ch, temp); else |
884 | #endif |
885 | { |
886 | temp[0] = ch; |
887 | chlen = 1; |
888 | } |
889 | CHECKMEMCPY(temp, chlen); |
890 | } /* End handling a literal code unit */ |
891 | } /* End of loop for scanning the replacement. */ |
892 | |
893 | /* The replacement has been copied to the output, or its size has been |
894 | remembered. Do the callout if there is one and we have done an actual |
895 | replacement. */ |
896 | |
897 | if (!overflowed && mcontext != NULL && mcontext->substitute_callout != NULL) |
898 | { |
899 | scb.subscount = subs; |
900 | scb.output_offsets[1] = buff_offset; |
901 | rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data); |
902 | |
903 | /* A non-zero return means cancel this substitution. Instead, copy the |
904 | matched string fragment. */ |
905 | |
906 | if (rc != 0) |
907 | { |
908 | PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0]; |
909 | PCRE2_SIZE oldlength = ovector[1] - ovector[0]; |
910 | |
911 | buff_offset -= newlength; |
912 | lengthleft += newlength; |
913 | if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength); |
914 | |
915 | /* A negative return means do not do any more. */ |
916 | |
917 | if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL); |
918 | } |
919 | } |
920 | |
921 | /* Save the details of this match. See above for how this data is used. If we |
922 | matched an empty string, do the magic for global matches. Update the start |
923 | offset to point to the rest of the subject string. If we re-used an existing |
924 | match for the first match, switch to the internal match data block. */ |
925 | |
926 | ovecsave[0] = ovector[0]; |
927 | ovecsave[1] = ovector[1]; |
928 | ovecsave[2] = start_offset; |
929 | |
930 | goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 : |
931 | PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART; |
932 | start_offset = ovector[1]; |
933 | } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */ |
934 | |
935 | /* Copy the rest of the subject unless not required, and terminate the output |
936 | with a binary zero. */ |
937 | |
938 | if (!replacement_only) |
939 | { |
940 | fraglength = length - start_offset; |
941 | CHECKMEMCPY(subject + start_offset, fraglength); |
942 | } |
943 | |
944 | temp[0] = 0; |
945 | CHECKMEMCPY(temp, 1); |
946 | |
947 | /* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set, |
948 | and matching has carried on after a full buffer, in order to compute the length |
949 | needed. Otherwise, an overflow generates an immediate error return. */ |
950 | |
951 | if (overflowed) |
952 | { |
953 | rc = PCRE2_ERROR_NOMEMORY; |
954 | *blength = buff_length + extra_needed; |
955 | } |
956 | |
957 | /* After a successful execution, return the number of substitutions and set the |
958 | length of buffer used, excluding the trailing zero. */ |
959 | |
960 | else |
961 | { |
962 | rc = subs; |
963 | *blength = buff_offset - 1; |
964 | } |
965 | |
966 | EXIT: |
967 | if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data); |
968 | else match_data->rc = rc; |
969 | return rc; |
970 | |
971 | NOROOM: |
972 | rc = PCRE2_ERROR_NOMEMORY; |
973 | goto EXIT; |
974 | |
975 | BAD: |
976 | rc = PCRE2_ERROR_BADREPLACEMENT; |
977 | goto PTREXIT; |
978 | |
979 | BADESCAPE: |
980 | rc = PCRE2_ERROR_BADREPESCAPE; |
981 | |
982 | PTREXIT: |
983 | *blength = (PCRE2_SIZE)(ptr - replacement); |
984 | goto EXIT; |
985 | } |
986 | |
987 | /* End of pcre2_substitute.c */ |
988 | |