1 | /************************************************* |
2 | * Perl-Compatible Regular Expressions * |
3 | *************************************************/ |
4 | |
5 | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | and semantics are as close as possible to those of the Perl 5 language. |
7 | |
8 | Written by Philip Hazel |
9 | Original API code Copyright (c) 1997-2012 University of Cambridge |
10 | New API code Copyright (c) 2016-2020 University of Cambridge |
11 | |
12 | ----------------------------------------------------------------------------- |
13 | Redistribution and use in source and binary forms, with or without |
14 | modification, are permitted provided that the following conditions are met: |
15 | |
16 | * Redistributions of source code must retain the above copyright notice, |
17 | this list of conditions and the following disclaimer. |
18 | |
19 | * Redistributions in binary form must reproduce the above copyright |
20 | notice, this list of conditions and the following disclaimer in the |
21 | documentation and/or other materials provided with the distribution. |
22 | |
23 | * Neither the name of the University of Cambridge nor the names of its |
24 | contributors may be used to endorse or promote products derived from |
25 | this software without specific prior written permission. |
26 | |
27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | POSSIBILITY OF SUCH DAMAGE. |
38 | ----------------------------------------------------------------------------- |
39 | */ |
40 | |
41 | |
42 | /* This module contains an internal function for validating UTF character |
43 | strings. This file is also #included by the pcre2test program, which uses |
44 | macros to change names from _pcre2_xxx to xxxx, thereby avoiding name clashes |
45 | with the library. In this case, PCRE2_PCRE2TEST is defined. */ |
46 | |
47 | #ifndef PCRE2_PCRE2TEST /* We're compiling the library */ |
48 | #ifdef HAVE_CONFIG_H |
49 | #include "config.h" |
50 | #endif |
51 | #include "pcre2_internal.h" |
52 | #endif /* PCRE2_PCRE2TEST */ |
53 | |
54 | |
55 | #ifndef SUPPORT_UNICODE |
56 | /************************************************* |
57 | * Dummy function when Unicode is not supported * |
58 | *************************************************/ |
59 | |
60 | /* This function should never be called when Unicode is not supported. */ |
61 | |
62 | int |
63 | PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, PCRE2_SIZE *erroroffset) |
64 | { |
65 | (void)string; |
66 | (void)length; |
67 | (void)erroroffset; |
68 | return 0; |
69 | } |
70 | #else /* UTF is supported */ |
71 | |
72 | |
73 | |
74 | /************************************************* |
75 | * Validate a UTF string * |
76 | *************************************************/ |
77 | |
78 | /* This function is called (optionally) at the start of compile or match, to |
79 | check that a supposed UTF string is actually valid. The early check means |
80 | that subsequent code can assume it is dealing with a valid string. The check |
81 | can be turned off for maximum performance, but the consequences of supplying an |
82 | invalid string are then undefined. |
83 | |
84 | Arguments: |
85 | string points to the string |
86 | length length of string |
87 | errp pointer to an error position offset variable |
88 | |
89 | Returns: == 0 if the string is a valid UTF string |
90 | != 0 otherwise, setting the offset of the bad character |
91 | */ |
92 | |
93 | int |
94 | PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, PCRE2_SIZE *erroroffset) |
95 | { |
96 | PCRE2_SPTR p; |
97 | uint32_t c; |
98 | |
99 | /* ----------------- Check a UTF-8 string ----------------- */ |
100 | |
101 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
102 | |
103 | /* Originally, this function checked according to RFC 2279, allowing for values |
104 | in the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were |
105 | in the canonical format. Once somebody had pointed out RFC 3629 to me (it |
106 | obsoletes 2279), additional restrictions were applied. The values are now |
107 | limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the |
108 | subrange 0xd000 to 0xdfff is excluded. However, the format of 5-byte and 6-byte |
109 | characters is still checked. Error returns are as follows: |
110 | |
111 | PCRE2_ERROR_UTF8_ERR1 Missing 1 byte at the end of the string |
112 | PCRE2_ERROR_UTF8_ERR2 Missing 2 bytes at the end of the string |
113 | PCRE2_ERROR_UTF8_ERR3 Missing 3 bytes at the end of the string |
114 | PCRE2_ERROR_UTF8_ERR4 Missing 4 bytes at the end of the string |
115 | PCRE2_ERROR_UTF8_ERR5 Missing 5 bytes at the end of the string |
116 | PCRE2_ERROR_UTF8_ERR6 2nd-byte's two top bits are not 0x80 |
117 | PCRE2_ERROR_UTF8_ERR7 3rd-byte's two top bits are not 0x80 |
118 | PCRE2_ERROR_UTF8_ERR8 4th-byte's two top bits are not 0x80 |
119 | PCRE2_ERROR_UTF8_ERR9 5th-byte's two top bits are not 0x80 |
120 | PCRE2_ERROR_UTF8_ERR10 6th-byte's two top bits are not 0x80 |
121 | PCRE2_ERROR_UTF8_ERR11 5-byte character is not permitted by RFC 3629 |
122 | PCRE2_ERROR_UTF8_ERR12 6-byte character is not permitted by RFC 3629 |
123 | PCRE2_ERROR_UTF8_ERR13 4-byte character with value > 0x10ffff is not permitted |
124 | PCRE2_ERROR_UTF8_ERR14 3-byte character with value 0xd800-0xdfff is not permitted |
125 | PCRE2_ERROR_UTF8_ERR15 Overlong 2-byte sequence |
126 | PCRE2_ERROR_UTF8_ERR16 Overlong 3-byte sequence |
127 | PCRE2_ERROR_UTF8_ERR17 Overlong 4-byte sequence |
128 | PCRE2_ERROR_UTF8_ERR18 Overlong 5-byte sequence (won't ever occur) |
129 | PCRE2_ERROR_UTF8_ERR19 Overlong 6-byte sequence (won't ever occur) |
130 | PCRE2_ERROR_UTF8_ERR20 Isolated 0x80 byte (not within UTF-8 character) |
131 | PCRE2_ERROR_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff |
132 | */ |
133 | |
134 | for (p = string; length > 0; p++) |
135 | { |
136 | uint32_t ab, d; |
137 | |
138 | c = *p; |
139 | length--; |
140 | |
141 | if (c < 128) continue; /* ASCII character */ |
142 | |
143 | if (c < 0xc0) /* Isolated 10xx xxxx byte */ |
144 | { |
145 | *erroroffset = (PCRE2_SIZE)(p - string); |
146 | return PCRE2_ERROR_UTF8_ERR20; |
147 | } |
148 | |
149 | if (c >= 0xfe) /* Invalid 0xfe or 0xff bytes */ |
150 | { |
151 | *erroroffset = (PCRE2_SIZE)(p - string); |
152 | return PCRE2_ERROR_UTF8_ERR21; |
153 | } |
154 | |
155 | ab = PRIV(utf8_table4)[c & 0x3f]; /* Number of additional bytes (1-5) */ |
156 | if (length < ab) /* Missing bytes */ |
157 | { |
158 | *erroroffset = (PCRE2_SIZE)(p - string); |
159 | switch(ab - length) |
160 | { |
161 | case 1: return PCRE2_ERROR_UTF8_ERR1; |
162 | case 2: return PCRE2_ERROR_UTF8_ERR2; |
163 | case 3: return PCRE2_ERROR_UTF8_ERR3; |
164 | case 4: return PCRE2_ERROR_UTF8_ERR4; |
165 | case 5: return PCRE2_ERROR_UTF8_ERR5; |
166 | } |
167 | } |
168 | length -= ab; /* Length remaining */ |
169 | |
170 | /* Check top bits in the second byte */ |
171 | |
172 | if (((d = *(++p)) & 0xc0) != 0x80) |
173 | { |
174 | *erroroffset = (int)(p - string) - 1; |
175 | return PCRE2_ERROR_UTF8_ERR6; |
176 | } |
177 | |
178 | /* For each length, check that the remaining bytes start with the 0x80 bit |
179 | set and not the 0x40 bit. Then check for an overlong sequence, and for the |
180 | excluded range 0xd800 to 0xdfff. */ |
181 | |
182 | switch (ab) |
183 | { |
184 | /* 2-byte character. No further bytes to check for 0x80. Check first byte |
185 | for for xx00 000x (overlong sequence). */ |
186 | |
187 | case 1: if ((c & 0x3e) == 0) |
188 | { |
189 | *erroroffset = (int)(p - string) - 1; |
190 | return PCRE2_ERROR_UTF8_ERR15; |
191 | } |
192 | break; |
193 | |
194 | /* 3-byte character. Check third byte for 0x80. Then check first 2 bytes |
195 | for 1110 0000, xx0x xxxx (overlong sequence) or |
196 | 1110 1101, 1010 xxxx (0xd800 - 0xdfff) */ |
197 | |
198 | case 2: |
199 | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ |
200 | { |
201 | *erroroffset = (int)(p - string) - 2; |
202 | return PCRE2_ERROR_UTF8_ERR7; |
203 | } |
204 | if (c == 0xe0 && (d & 0x20) == 0) |
205 | { |
206 | *erroroffset = (int)(p - string) - 2; |
207 | return PCRE2_ERROR_UTF8_ERR16; |
208 | } |
209 | if (c == 0xed && d >= 0xa0) |
210 | { |
211 | *erroroffset = (int)(p - string) - 2; |
212 | return PCRE2_ERROR_UTF8_ERR14; |
213 | } |
214 | break; |
215 | |
216 | /* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2 |
217 | bytes for for 1111 0000, xx00 xxxx (overlong sequence), then check for a |
218 | character greater than 0x0010ffff (f4 8f bf bf) */ |
219 | |
220 | case 3: |
221 | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ |
222 | { |
223 | *erroroffset = (int)(p - string) - 2; |
224 | return PCRE2_ERROR_UTF8_ERR7; |
225 | } |
226 | if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ |
227 | { |
228 | *erroroffset = (int)(p - string) - 3; |
229 | return PCRE2_ERROR_UTF8_ERR8; |
230 | } |
231 | if (c == 0xf0 && (d & 0x30) == 0) |
232 | { |
233 | *erroroffset = (int)(p - string) - 3; |
234 | return PCRE2_ERROR_UTF8_ERR17; |
235 | } |
236 | if (c > 0xf4 || (c == 0xf4 && d > 0x8f)) |
237 | { |
238 | *erroroffset = (int)(p - string) - 3; |
239 | return PCRE2_ERROR_UTF8_ERR13; |
240 | } |
241 | break; |
242 | |
243 | /* 5-byte and 6-byte characters are not allowed by RFC 3629, and will be |
244 | rejected by the length test below. However, we do the appropriate tests |
245 | here so that overlong sequences get diagnosed, and also in case there is |
246 | ever an option for handling these larger code points. */ |
247 | |
248 | /* 5-byte character. Check 3rd, 4th, and 5th bytes for 0x80. Then check for |
249 | 1111 1000, xx00 0xxx */ |
250 | |
251 | case 4: |
252 | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ |
253 | { |
254 | *erroroffset = (int)(p - string) - 2; |
255 | return PCRE2_ERROR_UTF8_ERR7; |
256 | } |
257 | if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ |
258 | { |
259 | *erroroffset = (int)(p - string) - 3; |
260 | return PCRE2_ERROR_UTF8_ERR8; |
261 | } |
262 | if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */ |
263 | { |
264 | *erroroffset = (int)(p - string) - 4; |
265 | return PCRE2_ERROR_UTF8_ERR9; |
266 | } |
267 | if (c == 0xf8 && (d & 0x38) == 0) |
268 | { |
269 | *erroroffset = (int)(p - string) - 4; |
270 | return PCRE2_ERROR_UTF8_ERR18; |
271 | } |
272 | break; |
273 | |
274 | /* 6-byte character. Check 3rd-6th bytes for 0x80. Then check for |
275 | 1111 1100, xx00 00xx. */ |
276 | |
277 | case 5: |
278 | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ |
279 | { |
280 | *erroroffset = (int)(p - string) - 2; |
281 | return PCRE2_ERROR_UTF8_ERR7; |
282 | } |
283 | if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ |
284 | { |
285 | *erroroffset = (int)(p - string) - 3; |
286 | return PCRE2_ERROR_UTF8_ERR8; |
287 | } |
288 | if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */ |
289 | { |
290 | *erroroffset = (int)(p - string) - 4; |
291 | return PCRE2_ERROR_UTF8_ERR9; |
292 | } |
293 | if ((*(++p) & 0xc0) != 0x80) /* Sixth byte */ |
294 | { |
295 | *erroroffset = (int)(p - string) - 5; |
296 | return PCRE2_ERROR_UTF8_ERR10; |
297 | } |
298 | if (c == 0xfc && (d & 0x3c) == 0) |
299 | { |
300 | *erroroffset = (int)(p - string) - 5; |
301 | return PCRE2_ERROR_UTF8_ERR19; |
302 | } |
303 | break; |
304 | } |
305 | |
306 | /* Character is valid under RFC 2279, but 4-byte and 5-byte characters are |
307 | excluded by RFC 3629. The pointer p is currently at the last byte of the |
308 | character. */ |
309 | |
310 | if (ab > 3) |
311 | { |
312 | *erroroffset = (int)(p - string) - ab; |
313 | return (ab == 4)? PCRE2_ERROR_UTF8_ERR11 : PCRE2_ERROR_UTF8_ERR12; |
314 | } |
315 | } |
316 | return 0; |
317 | |
318 | |
319 | /* ----------------- Check a UTF-16 string ----------------- */ |
320 | |
321 | #elif PCRE2_CODE_UNIT_WIDTH == 16 |
322 | |
323 | /* There's not so much work, nor so many errors, for UTF-16. |
324 | PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at the end of the string |
325 | PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate |
326 | PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate |
327 | */ |
328 | |
329 | for (p = string; length > 0; p++) |
330 | { |
331 | c = *p; |
332 | length--; |
333 | |
334 | if ((c & 0xf800) != 0xd800) |
335 | { |
336 | /* Normal UTF-16 code point. Neither high nor low surrogate. */ |
337 | } |
338 | else if ((c & 0x0400) == 0) |
339 | { |
340 | /* High surrogate. Must be a followed by a low surrogate. */ |
341 | if (length == 0) |
342 | { |
343 | *erroroffset = p - string; |
344 | return PCRE2_ERROR_UTF16_ERR1; |
345 | } |
346 | p++; |
347 | length--; |
348 | if ((*p & 0xfc00) != 0xdc00) |
349 | { |
350 | *erroroffset = p - string - 1; |
351 | return PCRE2_ERROR_UTF16_ERR2; |
352 | } |
353 | } |
354 | else |
355 | { |
356 | /* Isolated low surrogate. Always an error. */ |
357 | *erroroffset = p - string; |
358 | return PCRE2_ERROR_UTF16_ERR3; |
359 | } |
360 | } |
361 | return 0; |
362 | |
363 | |
364 | |
365 | /* ----------------- Check a UTF-32 string ----------------- */ |
366 | |
367 | #else |
368 | |
369 | /* There is very little to do for a UTF-32 string. |
370 | PCRE2_ERROR_UTF32_ERR1 Surrogate character |
371 | PCRE2_ERROR_UTF32_ERR2 Character > 0x10ffff |
372 | */ |
373 | |
374 | for (p = string; length > 0; length--, p++) |
375 | { |
376 | c = *p; |
377 | if ((c & 0xfffff800u) != 0xd800u) |
378 | { |
379 | /* Normal UTF-32 code point. Neither high nor low surrogate. */ |
380 | if (c > 0x10ffffu) |
381 | { |
382 | *erroroffset = p - string; |
383 | return PCRE2_ERROR_UTF32_ERR2; |
384 | } |
385 | } |
386 | else |
387 | { |
388 | /* A surrogate */ |
389 | *erroroffset = p - string; |
390 | return PCRE2_ERROR_UTF32_ERR1; |
391 | } |
392 | } |
393 | return 0; |
394 | #endif /* CODE_UNIT_WIDTH */ |
395 | } |
396 | #endif /* SUPPORT_UNICODE */ |
397 | |
398 | /* End of pcre2_valid_utf.c */ |
399 | |