1 | /************************************************* |
2 | * Perl-Compatible Regular Expressions * |
3 | *************************************************/ |
4 | |
5 | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | and semantics are as close as possible to those of the Perl 5 language. |
7 | |
8 | Written by Philip Hazel |
9 | Original API code Copyright (c) 1997-2012 University of Cambridge |
10 | New API code Copyright (c) 2016-2022 University of Cambridge |
11 | |
12 | ----------------------------------------------------------------------------- |
13 | Redistribution and use in source and binary forms, with or without |
14 | modification, are permitted provided that the following conditions are met: |
15 | |
16 | * Redistributions of source code must retain the above copyright notice, |
17 | this list of conditions and the following disclaimer. |
18 | |
19 | * Redistributions in binary form must reproduce the above copyright |
20 | notice, this list of conditions and the following disclaimer in the |
21 | documentation and/or other materials provided with the distribution. |
22 | |
23 | * Neither the name of the University of Cambridge nor the names of its |
24 | contributors may be used to endorse or promote products derived from |
25 | this software without specific prior written permission. |
26 | |
27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | POSSIBILITY OF SUCH DAMAGE. |
38 | ----------------------------------------------------------------------------- |
39 | */ |
40 | |
41 | |
42 | #ifdef HAVE_CONFIG_H |
43 | #include "config.h" |
44 | #endif |
45 | |
46 | #include "pcre2_internal.h" |
47 | |
/* The three mutually exclusive pattern-type options. pcre2_pattern_convert()
masks the caller's options with this value to obtain the requested conversion
type, and rejects the call unless exactly one of these bits is set. */

#define TYPE_OPTIONS (PCRE2_CONVERT_GLOB| \
  PCRE2_CONVERT_POSIX_BASIC|PCRE2_CONVERT_POSIX_EXTENDED)

/* Every option bit that pcre2_pattern_convert() accepts. If any bit outside
this mask is set, the call fails with PCRE2_ERROR_BADOPTION. */

#define ALL_OPTIONS (PCRE2_CONVERT_UTF|PCRE2_CONVERT_NO_UTF_CHECK| \
  PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR| \
  PCRE2_CONVERT_GLOB_NO_STARSTAR| \
  TYPE_OPTIONS)

/* Size, in code units, of the on-stack buffer that pcre2_pattern_convert()
uses as the output area when the caller has not supplied a buffer. */

#define DUMMY_BUFFER_SIZE 100
57 | |
/* Generated pattern fragments. Each one is built by juxtaposing the
single-character STR_xxx macros, which come from pcre2_internal.h and are
presumably string literals (so that adjacent ones concatenate) - confirm
there. The names describe the intended text: for example STR_STAR_NUL is the
sequence ( * N U L ), which convert_posix() emits at the start of its output,
and STR_COLON_RIGHT_SQUARE_BRACKET is the two characters that close a POSIX
named class. */

#define STR_BACKSLASH_A STR_BACKSLASH STR_A
#define STR_BACKSLASH_z STR_BACKSLASH STR_z
#define STR_COLON_RIGHT_SQUARE_BRACKET STR_COLON STR_RIGHT_SQUARE_BRACKET
#define STR_DOT_STAR_LOOKBEHIND STR_DOT STR_ASTERISK STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_LESS_THAN_SIGN STR_EQUALS_SIGN
#define STR_LOOKAHEAD_NOT_DOT STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_EXCLAMATION_MARK STR_BACKSLASH STR_DOT STR_RIGHT_PARENTHESIS
#define STR_QUERY_s STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_s STR_RIGHT_PARENTHESIS
#define STR_STAR_NUL STR_LEFT_PARENTHESIS STR_ASTERISK STR_N STR_U STR_L STR_RIGHT_PARENTHESIS
67 | |
/* States for POSIX processing. The numerical order is significant, because
convert_posix() compares states with < and >=:

  POSIX_START_REGEX        initial state, nothing has been processed yet
  POSIX_ANCHORED           set after a leading ^ in a basic pattern
  POSIX_NOT_BRACKET        set after an item that is outside a bracket
                             expression
  POSIX_CLASS_NOT_STARTED  inside [...], not inside a [:name:] item
  POSIX_CLASS_STARTING     inside [...], a nested [ has just been seen
  POSIX_CLASS_STARTED      inside [...], after [: while the name is read

Every state >= POSIX_CLASS_NOT_STARTED means "inside a bracket expression";
every state < POSIX_NOT_BRACKET means "at the start of the regex". */

enum { POSIX_START_REGEX, POSIX_ANCHORED, POSIX_NOT_BRACKET,
       POSIX_CLASS_NOT_STARTED, POSIX_CLASS_STARTING, POSIX_CLASS_STARTED };
72 | |
/* Macro to add a zero-terminated character string to the output buffer,
checking for overflow. It relies on three variables being in scope where it is
used: "s" (a char * scratch pointer), "p" (the current output position, which
is advanced past the copied characters) and "endp" (the limit that p must stay
below). If the string does not fit, the macro makes the enclosing function
return PCRE2_ERROR_NOMEMORY.

The body is wrapped in do { } while (0) so that an invocation followed by a
semicolon is exactly one statement, which keeps it safe in an unbraced
if ... else. */

#define PUTCHARS(string) \
  do \
    { \
    for (s = (char *)(string); *s != 0; s++) \
      { \
      if (p >= endp) return PCRE2_ERROR_NOMEMORY; \
      *p++ = *s; \
      } \
    } \
  while (0)
83 | |
/* Literals that must be escaped: \ ? * + | . ^ $ { } [ ] ( )

Both converters look characters up in this string with strchr() and put a
backslash in front of any character that is found. convert_glob() also uses it
to decide whether the glob separator needs a backslash. */

static const char *pcre2_escaped_literals =
  STR_BACKSLASH STR_QUESTION_MARK STR_ASTERISK STR_PLUS
  STR_VERTICAL_LINE STR_DOT STR_CIRCUMFLEX_ACCENT STR_DOLLAR_SIGN
  STR_LEFT_CURLY_BRACKET STR_RIGHT_CURLY_BRACKET
  STR_LEFT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET
  STR_LEFT_PARENTHESIS STR_RIGHT_PARENTHESIS;

/* Recognized escaped metacharacters in POSIX basic patterns: ( ) { } and the
digits 1-9. In a basic pattern, a backslash followed by one of these is a
special item rather than an escaped literal. */

static const char *posix_meta_escapes =
  STR_LEFT_PARENTHESIS STR_RIGHT_PARENTHESIS
  STR_LEFT_CURLY_BRACKET STR_RIGHT_CURLY_BRACKET
  STR_1 STR_2 STR_3 STR_4 STR_5 STR_6 STR_7 STR_8 STR_9;
99 | |
100 | |
101 | |
102 | /************************************************* |
103 | * Convert a POSIX pattern * |
104 | *************************************************/ |
105 | |
106 | /* This function handles both basic and extended POSIX patterns. |
107 | |
108 | Arguments: |
109 | pattype the pattern type |
110 | pattern the pattern |
111 | plength length in code units |
112 | utf TRUE if UTF |
113 | use_buffer where to put the output |
114 | use_length length of use_buffer |
115 | bufflenptr where to put the used length |
116 | dummyrun TRUE if a dummy run |
117 | ccontext the convert context |
118 | |
119 | Returns: 0 => success |
120 | !0 => error code |
121 | */ |
122 | |
123 | static int |
124 | convert_posix(uint32_t pattype, PCRE2_SPTR pattern, PCRE2_SIZE plength, |
125 | BOOL utf, PCRE2_UCHAR *use_buffer, PCRE2_SIZE use_length, |
126 | PCRE2_SIZE *bufflenptr, BOOL dummyrun, pcre2_convert_context *ccontext) |
127 | { |
128 | char *s; |
129 | PCRE2_SPTR posix = pattern; |
130 | PCRE2_UCHAR *p = use_buffer; |
131 | PCRE2_UCHAR *pp = p; |
132 | PCRE2_UCHAR *endp = p + use_length - 1; /* Allow for trailing zero */ |
133 | PCRE2_SIZE convlength = 0; |
134 | |
135 | uint32_t bracount = 0; |
136 | uint32_t posix_state = POSIX_START_REGEX; |
137 | uint32_t lastspecial = 0; |
138 | BOOL extended = (pattype & PCRE2_CONVERT_POSIX_EXTENDED) != 0; |
139 | BOOL nextisliteral = FALSE; |
140 | |
141 | (void)utf; /* Not used when Unicode not supported */ |
142 | (void)ccontext; /* Not currently used */ |
143 | |
144 | /* Initialize default for error offset as end of input. */ |
145 | |
146 | *bufflenptr = plength; |
147 | PUTCHARS(STR_STAR_NUL); |
148 | |
149 | /* Now scan the input. */ |
150 | |
151 | while (plength > 0) |
152 | { |
153 | uint32_t c, sc; |
154 | int clength = 1; |
155 | |
156 | /* Add in the length of the last item, then, if in the dummy run, pull the |
157 | pointer back to the start of the (temporary) buffer and then remember the |
158 | start of the next item. */ |
159 | |
160 | convlength += p - pp; |
161 | if (dummyrun) p = use_buffer; |
162 | pp = p; |
163 | |
164 | /* Pick up the next character */ |
165 | |
166 | #ifndef SUPPORT_UNICODE |
167 | c = *posix; |
168 | #else |
169 | GETCHARLENTEST(c, posix, clength); |
170 | #endif |
171 | posix += clength; |
172 | plength -= clength; |
173 | |
174 | sc = nextisliteral? 0 : c; |
175 | nextisliteral = FALSE; |
176 | |
177 | /* Handle a character within a class. */ |
178 | |
179 | if (posix_state >= POSIX_CLASS_NOT_STARTED) |
180 | { |
181 | if (c == CHAR_RIGHT_SQUARE_BRACKET) |
182 | { |
183 | PUTCHARS(STR_RIGHT_SQUARE_BRACKET); |
184 | posix_state = POSIX_NOT_BRACKET; |
185 | } |
186 | |
187 | /* Not the end of the class */ |
188 | |
189 | else |
190 | { |
191 | switch (posix_state) |
192 | { |
193 | case POSIX_CLASS_STARTED: |
194 | if (c <= 127 && islower(c)) break; /* Remain in started state */ |
195 | posix_state = POSIX_CLASS_NOT_STARTED; |
196 | if (c == CHAR_COLON && plength > 0 && |
197 | *posix == CHAR_RIGHT_SQUARE_BRACKET) |
198 | { |
199 | PUTCHARS(STR_COLON_RIGHT_SQUARE_BRACKET); |
200 | plength--; |
201 | posix++; |
202 | continue; /* With next character after :] */ |
203 | } |
204 | /* Fall through */ |
205 | |
206 | case POSIX_CLASS_NOT_STARTED: |
207 | if (c == CHAR_LEFT_SQUARE_BRACKET) |
208 | posix_state = POSIX_CLASS_STARTING; |
209 | break; |
210 | |
211 | case POSIX_CLASS_STARTING: |
212 | if (c == CHAR_COLON) posix_state = POSIX_CLASS_STARTED; |
213 | break; |
214 | } |
215 | |
216 | if (c == CHAR_BACKSLASH) PUTCHARS(STR_BACKSLASH); |
217 | if (p + clength > endp) return PCRE2_ERROR_NOMEMORY; |
218 | memcpy(p, posix - clength, CU2BYTES(clength)); |
219 | p += clength; |
220 | } |
221 | } |
222 | |
223 | /* Handle a character not within a class. */ |
224 | |
225 | else switch(sc) |
226 | { |
227 | case CHAR_LEFT_SQUARE_BRACKET: |
228 | PUTCHARS(STR_LEFT_SQUARE_BRACKET); |
229 | |
230 | #ifdef NEVER |
231 | /* We could handle special cases [[:<:]] and [[:>:]] (which PCRE does |
232 | support) but they are not part of POSIX 1003.1. */ |
233 | |
234 | if (plength >= 6) |
235 | { |
236 | if (posix[0] == CHAR_LEFT_SQUARE_BRACKET && |
237 | posix[1] == CHAR_COLON && |
238 | (posix[2] == CHAR_LESS_THAN_SIGN || |
239 | posix[2] == CHAR_GREATER_THAN_SIGN) && |
240 | posix[3] == CHAR_COLON && |
241 | posix[4] == CHAR_RIGHT_SQUARE_BRACKET && |
242 | posix[5] == CHAR_RIGHT_SQUARE_BRACKET) |
243 | { |
244 | if (p + 6 > endp) return PCRE2_ERROR_NOMEMORY; |
245 | memcpy(p, posix, CU2BYTES(6)); |
246 | p += 6; |
247 | posix += 6; |
248 | plength -= 6; |
249 | continue; /* With next character */ |
250 | } |
251 | } |
252 | #endif |
253 | |
254 | /* Handle start of "normal" character classes */ |
255 | |
256 | posix_state = POSIX_CLASS_NOT_STARTED; |
257 | |
258 | /* Handle ^ and ] as first characters */ |
259 | |
260 | if (plength > 0) |
261 | { |
262 | if (*posix == CHAR_CIRCUMFLEX_ACCENT) |
263 | { |
264 | posix++; |
265 | plength--; |
266 | PUTCHARS(STR_CIRCUMFLEX_ACCENT); |
267 | } |
268 | if (plength > 0 && *posix == CHAR_RIGHT_SQUARE_BRACKET) |
269 | { |
270 | posix++; |
271 | plength--; |
272 | PUTCHARS(STR_RIGHT_SQUARE_BRACKET); |
273 | } |
274 | } |
275 | break; |
276 | |
277 | case CHAR_BACKSLASH: |
278 | if (plength == 0) return PCRE2_ERROR_END_BACKSLASH; |
279 | if (extended) nextisliteral = TRUE; else |
280 | { |
281 | if (*posix < 127 && strchr(posix_meta_escapes, *posix) != NULL) |
282 | { |
283 | if (isdigit(*posix)) PUTCHARS(STR_BACKSLASH); |
284 | if (p + 1 > endp) return PCRE2_ERROR_NOMEMORY; |
285 | lastspecial = *p++ = *posix++; |
286 | plength--; |
287 | } |
288 | else nextisliteral = TRUE; |
289 | } |
290 | break; |
291 | |
292 | case CHAR_RIGHT_PARENTHESIS: |
293 | if (!extended || bracount == 0) goto ESCAPE_LITERAL; |
294 | bracount--; |
295 | goto COPY_SPECIAL; |
296 | |
297 | case CHAR_LEFT_PARENTHESIS: |
298 | bracount++; |
299 | /* Fall through */ |
300 | |
301 | case CHAR_QUESTION_MARK: |
302 | case CHAR_PLUS: |
303 | case CHAR_LEFT_CURLY_BRACKET: |
304 | case CHAR_RIGHT_CURLY_BRACKET: |
305 | case CHAR_VERTICAL_LINE: |
306 | if (!extended) goto ESCAPE_LITERAL; |
307 | /* Fall through */ |
308 | |
309 | case CHAR_DOT: |
310 | case CHAR_DOLLAR_SIGN: |
311 | posix_state = POSIX_NOT_BRACKET; |
312 | COPY_SPECIAL: |
313 | lastspecial = c; |
314 | if (p + 1 > endp) return PCRE2_ERROR_NOMEMORY; |
315 | *p++ = c; |
316 | break; |
317 | |
318 | case CHAR_ASTERISK: |
319 | if (lastspecial != CHAR_ASTERISK) |
320 | { |
321 | if (!extended && (posix_state < POSIX_NOT_BRACKET || |
322 | lastspecial == CHAR_LEFT_PARENTHESIS)) |
323 | goto ESCAPE_LITERAL; |
324 | goto COPY_SPECIAL; |
325 | } |
326 | break; /* Ignore second and subsequent asterisks */ |
327 | |
328 | case CHAR_CIRCUMFLEX_ACCENT: |
329 | if (extended) goto COPY_SPECIAL; |
330 | if (posix_state == POSIX_START_REGEX || |
331 | lastspecial == CHAR_LEFT_PARENTHESIS) |
332 | { |
333 | posix_state = POSIX_ANCHORED; |
334 | goto COPY_SPECIAL; |
335 | } |
336 | /* Fall through */ |
337 | |
338 | default: |
339 | if (c < 128 && strchr(pcre2_escaped_literals, c) != NULL) |
340 | { |
341 | ESCAPE_LITERAL: |
342 | PUTCHARS(STR_BACKSLASH); |
343 | } |
344 | lastspecial = 0xff; /* Indicates nothing special */ |
345 | if (p + clength > endp) return PCRE2_ERROR_NOMEMORY; |
346 | memcpy(p, posix - clength, CU2BYTES(clength)); |
347 | p += clength; |
348 | posix_state = POSIX_NOT_BRACKET; |
349 | break; |
350 | } |
351 | } |
352 | |
353 | if (posix_state >= POSIX_CLASS_NOT_STARTED) |
354 | return PCRE2_ERROR_MISSING_SQUARE_BRACKET; |
355 | convlength += p - pp; /* Final segment */ |
356 | *bufflenptr = convlength; |
357 | *p++ = 0; |
358 | return 0; |
359 | } |
360 | |
361 | |
362 | /************************************************* |
363 | * Convert a glob pattern * |
364 | *************************************************/ |
365 | |
/* Context for writing the output into a buffer. It is used by the glob
converter and its helpers. The writers always increase output_size, but they
store a code unit only while output is below output_end, so output_size is the
length that was needed even when the buffer turned out to be too small. */

typedef struct pcre2_output_context {
  PCRE2_UCHAR *output;                  /* current output position */
  PCRE2_SPTR output_end;                /* output end */
  PCRE2_SIZE output_size;               /* size of the output */
  uint8_t out_str[8];                   /* string copied to the output */
} pcre2_output_context;
374 | |
375 | |
376 | /* Write a character into the output. |
377 | |
378 | Arguments: |
379 | out output context |
380 | chr the next character |
381 | */ |
382 | |
383 | static void |
384 | convert_glob_write(pcre2_output_context *out, PCRE2_UCHAR chr) |
385 | { |
386 | out->output_size++; |
387 | |
388 | if (out->output < out->output_end) |
389 | *out->output++ = chr; |
390 | } |
391 | |
392 | |
393 | /* Write a string into the output. |
394 | |
395 | Arguments: |
396 | out output context |
397 | length length of out->out_str |
398 | */ |
399 | |
400 | static void |
401 | convert_glob_write_str(pcre2_output_context *out, PCRE2_SIZE length) |
402 | { |
403 | uint8_t *out_str = out->out_str; |
404 | PCRE2_UCHAR *output = out->output; |
405 | PCRE2_SPTR output_end = out->output_end; |
406 | PCRE2_SIZE output_size = out->output_size; |
407 | |
408 | do |
409 | { |
410 | output_size++; |
411 | |
412 | if (output < output_end) |
413 | *output++ = *out_str++; |
414 | } |
415 | while (--length != 0); |
416 | |
417 | out->output = output; |
418 | out->output_size = output_size; |
419 | } |
420 | |
421 | |
422 | /* Prints the separator into the output. |
423 | |
424 | Arguments: |
425 | out output context |
426 | separator glob separator |
427 | with_escape backslash is needed before separator |
428 | */ |
429 | |
430 | static void |
431 | convert_glob_print_separator(pcre2_output_context *out, |
432 | PCRE2_UCHAR separator, BOOL with_escape) |
433 | { |
434 | if (with_escape) |
435 | convert_glob_write(out, CHAR_BACKSLASH); |
436 | |
437 | convert_glob_write(out, separator); |
438 | } |
439 | |
440 | |
441 | /* Prints a wildcard into the output. |
442 | |
443 | Arguments: |
444 | out output context |
445 | separator glob separator |
446 | with_escape backslash is needed before separator |
447 | */ |
448 | |
449 | static void |
450 | convert_glob_print_wildcard(pcre2_output_context *out, |
451 | PCRE2_UCHAR separator, BOOL with_escape) |
452 | { |
453 | out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET; |
454 | out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT; |
455 | convert_glob_write_str(out, 2); |
456 | |
457 | convert_glob_print_separator(out, separator, with_escape); |
458 | |
459 | convert_glob_write(out, CHAR_RIGHT_SQUARE_BRACKET); |
460 | } |
461 | |
462 | |
463 | /* Parse a posix class. |
464 | |
465 | Arguments: |
466 | from starting point of scanning the range |
467 | pattern_end end of pattern |
468 | out output context |
469 | |
470 | Returns: >0 => class index |
471 | 0 => malformed class |
472 | */ |
473 | |
474 | static int |
475 | convert_glob_parse_class(PCRE2_SPTR *from, PCRE2_SPTR pattern_end, |
476 | pcre2_output_context *out) |
477 | { |
478 | static const char *posix_classes = "alnum:alpha:ascii:blank:cntrl:digit:" |
479 | "graph:lower:print:punct:space:upper:word:xdigit:" ; |
480 | PCRE2_SPTR start = *from + 1; |
481 | PCRE2_SPTR pattern = start; |
482 | const char *class_ptr; |
483 | PCRE2_UCHAR c; |
484 | int class_index; |
485 | |
486 | while (TRUE) |
487 | { |
488 | if (pattern >= pattern_end) return 0; |
489 | |
490 | c = *pattern++; |
491 | |
492 | if (c < CHAR_a || c > CHAR_z) break; |
493 | } |
494 | |
495 | if (c != CHAR_COLON || pattern >= pattern_end || |
496 | *pattern != CHAR_RIGHT_SQUARE_BRACKET) |
497 | return 0; |
498 | |
499 | class_ptr = posix_classes; |
500 | class_index = 1; |
501 | |
502 | while (TRUE) |
503 | { |
504 | if (*class_ptr == CHAR_NUL) return 0; |
505 | |
506 | pattern = start; |
507 | |
508 | while (*pattern == (PCRE2_UCHAR) *class_ptr) |
509 | { |
510 | if (*pattern == CHAR_COLON) |
511 | { |
512 | pattern += 2; |
513 | start -= 2; |
514 | |
515 | do convert_glob_write(out, *start++); while (start < pattern); |
516 | |
517 | *from = pattern; |
518 | return class_index; |
519 | } |
520 | pattern++; |
521 | class_ptr++; |
522 | } |
523 | |
524 | while (*class_ptr != CHAR_COLON) class_ptr++; |
525 | class_ptr++; |
526 | class_index++; |
527 | } |
528 | } |
529 | |
530 | /* Checks whether the character is in the class. |
531 | |
532 | Arguments: |
533 | class_index class index |
534 | c character |
535 | |
536 | Returns: !0 => character is found in the class |
537 | 0 => otherwise |
538 | */ |
539 | |
540 | static BOOL |
541 | convert_glob_char_in_class(int class_index, PCRE2_UCHAR c) |
542 | { |
543 | switch (class_index) |
544 | { |
545 | case 1: return isalnum(c); |
546 | case 2: return isalpha(c); |
547 | case 3: return 1; |
548 | case 4: return c == CHAR_HT || c == CHAR_SPACE; |
549 | case 5: return iscntrl(c); |
550 | case 6: return isdigit(c); |
551 | case 7: return isgraph(c); |
552 | case 8: return islower(c); |
553 | case 9: return isprint(c); |
554 | case 10: return ispunct(c); |
555 | case 11: return isspace(c); |
556 | case 12: return isupper(c); |
557 | case 13: return isalnum(c) || c == CHAR_UNDERSCORE; |
558 | default: return isxdigit(c); |
559 | } |
560 | } |
561 | |
562 | /* Parse a range of characters. |
563 | |
564 | Arguments: |
565 | from starting point of scanning the range |
566 | pattern_end end of pattern |
567 | out output context |
568 | separator glob separator |
569 | with_escape backslash is needed before separator |
570 | |
571 | Returns: 0 => success |
572 | !0 => error code |
573 | */ |
574 | |
575 | static int |
576 | convert_glob_parse_range(PCRE2_SPTR *from, PCRE2_SPTR pattern_end, |
577 | pcre2_output_context *out, BOOL utf, PCRE2_UCHAR separator, |
578 | BOOL with_escape, PCRE2_UCHAR escape, BOOL no_wildsep) |
579 | { |
580 | BOOL is_negative = FALSE; |
581 | BOOL separator_seen = FALSE; |
582 | BOOL has_prev_c; |
583 | PCRE2_SPTR pattern = *from; |
584 | PCRE2_SPTR char_start = NULL; |
585 | uint32_t c, prev_c; |
586 | int len, class_index; |
587 | |
588 | (void)utf; /* Avoid compiler warning. */ |
589 | |
590 | if (pattern >= pattern_end) |
591 | { |
592 | *from = pattern; |
593 | return PCRE2_ERROR_MISSING_SQUARE_BRACKET; |
594 | } |
595 | |
596 | if (*pattern == CHAR_EXCLAMATION_MARK |
597 | || *pattern == CHAR_CIRCUMFLEX_ACCENT) |
598 | { |
599 | pattern++; |
600 | |
601 | if (pattern >= pattern_end) |
602 | { |
603 | *from = pattern; |
604 | return PCRE2_ERROR_MISSING_SQUARE_BRACKET; |
605 | } |
606 | |
607 | is_negative = TRUE; |
608 | |
609 | out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET; |
610 | out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT; |
611 | len = 2; |
612 | |
613 | if (!no_wildsep) |
614 | { |
615 | if (with_escape) |
616 | { |
617 | out->out_str[len] = CHAR_BACKSLASH; |
618 | len++; |
619 | } |
620 | out->out_str[len] = (uint8_t) separator; |
621 | } |
622 | |
623 | convert_glob_write_str(out, len + 1); |
624 | } |
625 | else |
626 | convert_glob_write(out, CHAR_LEFT_SQUARE_BRACKET); |
627 | |
628 | has_prev_c = FALSE; |
629 | prev_c = 0; |
630 | |
631 | if (*pattern == CHAR_RIGHT_SQUARE_BRACKET) |
632 | { |
633 | out->out_str[0] = CHAR_BACKSLASH; |
634 | out->out_str[1] = CHAR_RIGHT_SQUARE_BRACKET; |
635 | convert_glob_write_str(out, 2); |
636 | has_prev_c = TRUE; |
637 | prev_c = CHAR_RIGHT_SQUARE_BRACKET; |
638 | pattern++; |
639 | } |
640 | |
641 | while (pattern < pattern_end) |
642 | { |
643 | char_start = pattern; |
644 | GETCHARINCTEST(c, pattern); |
645 | |
646 | if (c == CHAR_RIGHT_SQUARE_BRACKET) |
647 | { |
648 | convert_glob_write(out, c); |
649 | |
650 | if (!is_negative && !no_wildsep && separator_seen) |
651 | { |
652 | out->out_str[0] = CHAR_LEFT_PARENTHESIS; |
653 | out->out_str[1] = CHAR_QUESTION_MARK; |
654 | out->out_str[2] = CHAR_LESS_THAN_SIGN; |
655 | out->out_str[3] = CHAR_EXCLAMATION_MARK; |
656 | convert_glob_write_str(out, 4); |
657 | |
658 | convert_glob_print_separator(out, separator, with_escape); |
659 | convert_glob_write(out, CHAR_RIGHT_PARENTHESIS); |
660 | } |
661 | |
662 | *from = pattern; |
663 | return 0; |
664 | } |
665 | |
666 | if (pattern >= pattern_end) break; |
667 | |
668 | if (c == CHAR_LEFT_SQUARE_BRACKET && *pattern == CHAR_COLON) |
669 | { |
670 | *from = pattern; |
671 | class_index = convert_glob_parse_class(from, pattern_end, out); |
672 | |
673 | if (class_index != 0) |
674 | { |
675 | pattern = *from; |
676 | |
677 | has_prev_c = FALSE; |
678 | prev_c = 0; |
679 | |
680 | if (!is_negative && |
681 | convert_glob_char_in_class (class_index, separator)) |
682 | separator_seen = TRUE; |
683 | continue; |
684 | } |
685 | } |
686 | else if (c == CHAR_MINUS && has_prev_c && |
687 | *pattern != CHAR_RIGHT_SQUARE_BRACKET) |
688 | { |
689 | convert_glob_write(out, CHAR_MINUS); |
690 | |
691 | char_start = pattern; |
692 | GETCHARINCTEST(c, pattern); |
693 | |
694 | if (pattern >= pattern_end) break; |
695 | |
696 | if (escape != 0 && c == escape) |
697 | { |
698 | char_start = pattern; |
699 | GETCHARINCTEST(c, pattern); |
700 | } |
701 | else if (c == CHAR_LEFT_SQUARE_BRACKET && *pattern == CHAR_COLON) |
702 | { |
703 | *from = pattern; |
704 | return PCRE2_ERROR_CONVERT_SYNTAX; |
705 | } |
706 | |
707 | if (prev_c > c) |
708 | { |
709 | *from = pattern; |
710 | return PCRE2_ERROR_CONVERT_SYNTAX; |
711 | } |
712 | |
713 | if (prev_c < separator && separator < c) separator_seen = TRUE; |
714 | |
715 | has_prev_c = FALSE; |
716 | prev_c = 0; |
717 | } |
718 | else |
719 | { |
720 | if (escape != 0 && c == escape) |
721 | { |
722 | char_start = pattern; |
723 | GETCHARINCTEST(c, pattern); |
724 | |
725 | if (pattern >= pattern_end) break; |
726 | } |
727 | |
728 | has_prev_c = TRUE; |
729 | prev_c = c; |
730 | } |
731 | |
732 | if (c == CHAR_LEFT_SQUARE_BRACKET || c == CHAR_RIGHT_SQUARE_BRACKET || |
733 | c == CHAR_BACKSLASH || c == CHAR_MINUS) |
734 | convert_glob_write(out, CHAR_BACKSLASH); |
735 | |
736 | if (c == separator) separator_seen = TRUE; |
737 | |
738 | do convert_glob_write(out, *char_start++); while (char_start < pattern); |
739 | } |
740 | |
741 | *from = pattern; |
742 | return PCRE2_ERROR_MISSING_SQUARE_BRACKET; |
743 | } |
744 | |
745 | |
746 | /* Prints a (*COMMIT) into the output. |
747 | |
748 | Arguments: |
749 | out output context |
750 | */ |
751 | |
752 | static void |
753 | convert_glob_print_commit(pcre2_output_context *out) |
754 | { |
755 | out->out_str[0] = CHAR_LEFT_PARENTHESIS; |
756 | out->out_str[1] = CHAR_ASTERISK; |
757 | out->out_str[2] = CHAR_C; |
758 | out->out_str[3] = CHAR_O; |
759 | out->out_str[4] = CHAR_M; |
760 | out->out_str[5] = CHAR_M; |
761 | out->out_str[6] = CHAR_I; |
762 | out->out_str[7] = CHAR_T; |
763 | convert_glob_write_str(out, 8); |
764 | convert_glob_write(out, CHAR_RIGHT_PARENTHESIS); |
765 | } |
766 | |
767 | |
/* Bash glob converter. The output always starts with (?s). It is written
through a pcre2_output_context, which counts every code unit even when the
buffer is too small, so the required length is known at the end.

Arguments:
  options        the option bits; the two PCRE2_CONVERT_GLOB_NO_xxx bits are
                   examined here
  pattern        the pattern
  plength        length in code units
  utf            TRUE if UTF
  use_buffer     where to put the output
  use_length     length of use_buffer
  bufflenptr     where to put the used length
  dummyrun       TRUE if a dummy run
  ccontext       the convert context

Returns:         0 => success; *bufflenptr is the output length, not
                      counting the terminating zero
                !0 => error code; *bufflenptr is an offset into the pattern
*/

static int
convert_glob(uint32_t options, PCRE2_SPTR pattern, PCRE2_SIZE plength,
  BOOL utf, PCRE2_UCHAR *use_buffer, PCRE2_SIZE use_length,
  PCRE2_SIZE *bufflenptr, BOOL dummyrun, pcre2_convert_context *ccontext)
{
pcre2_output_context out;
PCRE2_SPTR pattern_start = pattern;
PCRE2_SPTR pattern_end = pattern + plength;
PCRE2_UCHAR separator = ccontext->glob_separator;
PCRE2_UCHAR escape = ccontext->glob_escape;
PCRE2_UCHAR c;
BOOL no_wildsep = (options & PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR) != 0;
BOOL no_starstar = (options & PCRE2_CONVERT_GLOB_NO_STARSTAR) != 0;
BOOL in_atomic = FALSE;        /* An atomic group (?> is currently open */
BOOL after_starstar = FALSE;   /* A ** has been processed earlier */
BOOL no_slash_z = FALSE;       /* Do not add \z at the end */
BOOL with_escape, is_start, after_separator;
int result = 0;

(void)utf; /* Avoid compiler warning. */

#ifdef SUPPORT_UNICODE
if (utf && (separator >= 128 || escape >= 128))
  {
  /* Currently only ASCII characters are supported. */
  *bufflenptr = 0;
  return PCRE2_ERROR_CONVERT_SYNTAX;
  }
#endif

/* The separator needs a backslash in the output if it is one of the
characters that are special in a PCRE2 pattern. */

with_escape = strchr(pcre2_escaped_literals, separator) != NULL;

/* Set up the output context for the start of the buffer. */
out.output = use_buffer;
out.output_end = use_buffer + use_length;
out.output_size = 0;

/* Every converted glob starts with (?s) */

out.out_str[0] = CHAR_LEFT_PARENTHESIS;
out.out_str[1] = CHAR_QUESTION_MARK;
out.out_str[2] = CHAR_s;
out.out_str[3] = CHAR_RIGHT_PARENTHESIS;
convert_glob_write_str(&out, 4);

/* A start anchor \A is added unless the pattern begins with an asterisk and
either wildcards may match the separator, or ** is enabled and the pattern
begins with two asterisks. */

is_start = TRUE;

if (pattern < pattern_end && pattern[0] == CHAR_ASTERISK)
  {
  if (no_wildsep)
    is_start = FALSE;
  else if (!no_starstar && pattern + 1 < pattern_end &&
           pattern[1] == CHAR_ASTERISK)
    is_start = FALSE;
  }

if (is_start)
  {
  out.out_str[0] = CHAR_BACKSLASH;
  out.out_str[1] = CHAR_A;
  convert_glob_write_str(&out, 2);
  }

/* Main scanning loop: one glob item per iteration */

while (pattern < pattern_end)
  {
  c = *pattern++;

  if (c == CHAR_ASTERISK)
    {
    /* From here on is_start means "this asterisk is the first character of
    the pattern". */

    is_start = pattern == pattern_start + 1;

    /* An asterisk ends any atomic group that a previous asterisk opened. */

    if (in_atomic)
      {
      convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS);
      in_atomic = FALSE;
      }

    /* Two or more asterisks, when ** is enabled */

    if (!no_starstar && pattern < pattern_end && *pattern == CHAR_ASTERISK)
      {
      /* pattern[-1] is the first asterisk, so pattern[-2] is the character
      before it; that exists whenever is_start is FALSE. */

      after_separator = is_start || (pattern[-2] == separator);

      do pattern++; while (pattern < pattern_end &&
                           *pattern == CHAR_ASTERISK);

      /* Asterisks at the very end: nothing more is output and no \z is
      added. */

      if (pattern >= pattern_end)
        {
        no_slash_z = TRUE;
        break;
        }

      after_starstar = TRUE;

      /* Step over an escape character that precedes the separator. */

      if (after_separator && escape != 0 && *pattern == escape &&
          pattern + 1 < pattern_end && pattern[1] == separator)
        pattern++;

      if (is_start)
        {
        if (*pattern != separator) continue;

        /* The asterisks at the start are followed by the separator: output
        (?:\A| then the separator then ) and skip the separator. */

        out.out_str[0] = CHAR_LEFT_PARENTHESIS;
        out.out_str[1] = CHAR_QUESTION_MARK;
        out.out_str[2] = CHAR_COLON;
        out.out_str[3] = CHAR_BACKSLASH;
        out.out_str[4] = CHAR_A;
        out.out_str[5] = CHAR_VERTICAL_LINE;
        convert_glob_write_str(&out, 6);

        convert_glob_print_separator(&out, separator, with_escape);
        convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS);

        pattern++;
        continue;
        }

      convert_glob_print_commit(&out);

      /* Not delimited by separators on both sides: output .*? */

      if (!after_separator || *pattern != separator)
        {
        out.out_str[0] = CHAR_DOT;
        out.out_str[1] = CHAR_ASTERISK;
        out.out_str[2] = CHAR_QUESTION_MARK;
        convert_glob_write_str(&out, 3);
        continue;
        }

      /* Between two separators: output (?:.*? then the separator then )??
      and skip the separator. */

      out.out_str[0] = CHAR_LEFT_PARENTHESIS;
      out.out_str[1] = CHAR_QUESTION_MARK;
      out.out_str[2] = CHAR_COLON;
      out.out_str[3] = CHAR_DOT;
      out.out_str[4] = CHAR_ASTERISK;
      out.out_str[5] = CHAR_QUESTION_MARK;

      convert_glob_write_str(&out, 6);

      convert_glob_print_separator(&out, separator, with_escape);

      out.out_str[0] = CHAR_RIGHT_PARENTHESIS;
      out.out_str[1] = CHAR_QUESTION_MARK;
      out.out_str[2] = CHAR_QUESTION_MARK;
      convert_glob_write_str(&out, 3);

      pattern++;
      continue;
      }

    /* A single asterisk, or a run of them when ** is disabled: the run is
    treated as one asterisk. */

    if (pattern < pattern_end && *pattern == CHAR_ASTERISK)
      {
      do pattern++; while (pattern < pattern_end &&
                           *pattern == CHAR_ASTERISK);
      }

    if (no_wildsep)
      {
      if (pattern >= pattern_end)
        {
        no_slash_z = TRUE;
        break;
        }

      /* Start check must be after the end check. */
      if (is_start) continue;
      }

    /* Except at the start, the wildcard is preceded either by the opening of
    an atomic group (once a ** has been seen) or by (*COMMIT). */

    if (!is_start)
      {
      if (after_starstar)
        {
        out.out_str[0] = CHAR_LEFT_PARENTHESIS;
        out.out_str[1] = CHAR_QUESTION_MARK;
        out.out_str[2] = CHAR_GREATER_THAN_SIGN;
        convert_glob_write_str(&out, 3);
        in_atomic = TRUE;
        }
      else
        convert_glob_print_commit(&out);
      }

    if (no_wildsep)
      convert_glob_write(&out, CHAR_DOT);
    else
      convert_glob_print_wildcard(&out, separator, with_escape);

    /* The quantifier is *? in general, but *+ when the asterisk is the last
    thing in the pattern. */

    out.out_str[0] = CHAR_ASTERISK;
    out.out_str[1] = CHAR_QUESTION_MARK;
    if (pattern >= pattern_end)
      out.out_str[1] = CHAR_PLUS;
    convert_glob_write_str(&out, 2);
    continue;
    }

  /* A question mark matches one character, other than the separator unless
  no_wildsep is set. */

  if (c == CHAR_QUESTION_MARK)
    {
    if (no_wildsep)
      convert_glob_write(&out, CHAR_DOT);
    else
      convert_glob_print_wildcard(&out, separator, with_escape);
    continue;
    }

  /* A bracket expression is handled by a separate function, which also
  advances the pattern pointer. */

  if (c == CHAR_LEFT_SQUARE_BRACKET)
    {
    result = convert_glob_parse_range(&pattern, pattern_end,
      &out, utf, separator, with_escape, escape, no_wildsep);
    if (result != 0) break;
    continue;
    }

  /* The escape character makes the next code unit a literal; it is an error
  if there is no next code unit. */

  if (escape != 0 && c == escape)
    {
    if (pattern >= pattern_end)
      {
      result = PCRE2_ERROR_CONVERT_SYNTAX;
      break;
      }
    c = *pattern++;
    }

  /* A literal, with a backslash if it is special in a PCRE2 pattern */

  if (c < 128 && strchr(pcre2_escaped_literals, c) != NULL)
    convert_glob_write(&out, CHAR_BACKSLASH);

  convert_glob_write(&out, c);
  }

if (result == 0)
  {
  if (!no_slash_z)
    {
    out.out_str[0] = CHAR_BACKSLASH;
    out.out_str[1] = CHAR_z;
    convert_glob_write_str(&out, 2);
    }

  if (in_atomic)
    convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS);

  convert_glob_write(&out, CHAR_NUL);

  /* In a real run, a count that differs from the number of code units that
  were stored means that the buffer was too small. */

  if (!dummyrun && out.output_size != (PCRE2_SIZE) (out.output - use_buffer))
    result = PCRE2_ERROR_NOMEMORY;
  }

/* On failure, return the offset that was reached in the input pattern. */

if (result != 0)
  {
  *bufflenptr = pattern - pattern_start;
  return result;
  }

/* On success, return the output length without the terminating zero. */

*bufflenptr = out.output_size - 1;
return 0;
}
1034 | |
1035 | |
1036 | /************************************************* |
1037 | * Convert pattern * |
1038 | *************************************************/ |
1039 | |
1040 | /* This is the external-facing function for converting other forms of pattern |
1041 | into PCRE2 regular expression patterns. On error, the bufflenptr argument is |
1042 | used to return an offset in the original pattern. |
1043 | |
1044 | Arguments: |
1045 | pattern the input pattern |
1046 | plength length of input, or PCRE2_ZERO_TERMINATED |
1047 | options options bits |
1048 | buffptr pointer to pointer to output buffer |
1049 | bufflenptr pointer to length of output buffer |
1050 | ccontext convert context or NULL |
1051 | |
1052 | Returns: 0 for success, else an error code (+ve or -ve) |
1053 | */ |
1054 | |
1055 | PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION |
1056 | pcre2_pattern_convert(PCRE2_SPTR pattern, PCRE2_SIZE plength, uint32_t options, |
1057 | PCRE2_UCHAR **buffptr, PCRE2_SIZE *bufflenptr, |
1058 | pcre2_convert_context *ccontext) |
1059 | { |
1060 | int i, rc; |
1061 | PCRE2_UCHAR dummy_buffer[DUMMY_BUFFER_SIZE]; |
1062 | PCRE2_UCHAR *use_buffer = dummy_buffer; |
1063 | PCRE2_SIZE use_length = DUMMY_BUFFER_SIZE; |
1064 | BOOL utf = (options & PCRE2_CONVERT_UTF) != 0; |
1065 | uint32_t pattype = options & TYPE_OPTIONS; |
1066 | |
1067 | if (pattern == NULL || bufflenptr == NULL) return PCRE2_ERROR_NULL; |
1068 | |
1069 | if ((options & ~ALL_OPTIONS) != 0 || /* Undefined bit set */ |
1070 | (pattype & (~pattype+1)) != pattype || /* More than one type set */ |
1071 | pattype == 0) /* No type set */ |
1072 | { |
1073 | *bufflenptr = 0; /* Error offset */ |
1074 | return PCRE2_ERROR_BADOPTION; |
1075 | } |
1076 | |
1077 | if (plength == PCRE2_ZERO_TERMINATED) plength = PRIV(strlen)(pattern); |
1078 | if (ccontext == NULL) ccontext = |
1079 | (pcre2_convert_context *)(&PRIV(default_convert_context)); |
1080 | |
1081 | /* Check UTF if required. */ |
1082 | |
1083 | #ifndef SUPPORT_UNICODE |
1084 | if (utf) |
1085 | { |
1086 | *bufflenptr = 0; /* Error offset */ |
1087 | return PCRE2_ERROR_UNICODE_NOT_SUPPORTED; |
1088 | } |
1089 | #else |
1090 | if (utf && (options & PCRE2_CONVERT_NO_UTF_CHECK) == 0) |
1091 | { |
1092 | PCRE2_SIZE erroroffset; |
1093 | rc = PRIV(valid_utf)(pattern, plength, &erroroffset); |
1094 | if (rc != 0) |
1095 | { |
1096 | *bufflenptr = erroroffset; |
1097 | return rc; |
1098 | } |
1099 | } |
1100 | #endif |
1101 | |
1102 | /* If buffptr is not NULL, and what it points to is not NULL, we are being |
1103 | provided with a buffer and a length, so set them as the buffer to use. */ |
1104 | |
1105 | if (buffptr != NULL && *buffptr != NULL) |
1106 | { |
1107 | use_buffer = *buffptr; |
1108 | use_length = *bufflenptr; |
1109 | } |
1110 | |
1111 | /* Call an individual converter, either just once (if a buffer was provided or |
1112 | just the length is needed), or twice (if a memory allocation is required). */ |
1113 | |
1114 | for (i = 0; i < 2; i++) |
1115 | { |
1116 | PCRE2_UCHAR *allocated; |
1117 | BOOL dummyrun = buffptr == NULL || *buffptr == NULL; |
1118 | |
1119 | switch(pattype) |
1120 | { |
1121 | case PCRE2_CONVERT_GLOB: |
1122 | rc = convert_glob(options & ~PCRE2_CONVERT_GLOB, pattern, plength, utf, |
1123 | use_buffer, use_length, bufflenptr, dummyrun, ccontext); |
1124 | break; |
1125 | |
1126 | case PCRE2_CONVERT_POSIX_BASIC: |
1127 | case PCRE2_CONVERT_POSIX_EXTENDED: |
1128 | rc = convert_posix(pattype, pattern, plength, utf, use_buffer, use_length, |
1129 | bufflenptr, dummyrun, ccontext); |
1130 | break; |
1131 | |
1132 | default: |
1133 | *bufflenptr = 0; /* Error offset */ |
1134 | return PCRE2_ERROR_INTERNAL; |
1135 | } |
1136 | |
1137 | if (rc != 0 || /* Error */ |
1138 | buffptr == NULL || /* Just the length is required */ |
1139 | *buffptr != NULL) /* Buffer was provided or allocated */ |
1140 | return rc; |
1141 | |
1142 | /* Allocate memory for the buffer, with hidden space for an allocator at |
1143 | the start. The next time round the loop runs the conversion for real. */ |
1144 | |
1145 | allocated = PRIV(memctl_malloc)(sizeof(pcre2_memctl) + |
1146 | (*bufflenptr + 1)*PCRE2_CODE_UNIT_WIDTH, (pcre2_memctl *)ccontext); |
1147 | if (allocated == NULL) return PCRE2_ERROR_NOMEMORY; |
1148 | *buffptr = (PCRE2_UCHAR *)(((char *)allocated) + sizeof(pcre2_memctl)); |
1149 | |
1150 | use_buffer = *buffptr; |
1151 | use_length = *bufflenptr + 1; |
1152 | } |
1153 | |
1154 | /* Control should never get here. */ |
1155 | |
1156 | return PCRE2_ERROR_INTERNAL; |
1157 | } |
1158 | |
1159 | |
1160 | /************************************************* |
1161 | * Free converted pattern * |
1162 | *************************************************/ |
1163 | |
1164 | /* This frees a converted pattern that was put in newly-allocated memory. |
1165 | |
1166 | Argument: the converted pattern |
1167 | Returns: nothing |
1168 | */ |
1169 | |
1170 | PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION |
1171 | pcre2_converted_pattern_free(PCRE2_UCHAR *converted) |
1172 | { |
1173 | if (converted != NULL) |
1174 | { |
1175 | pcre2_memctl *memctl = |
1176 | (pcre2_memctl *)((char *)converted - sizeof(pcre2_memctl)); |
1177 | memctl->free(memctl, memctl->memory_data); |
1178 | } |
1179 | } |
1180 | |
1181 | /* End of pcre2_convert.c */ |
1182 | |