1 | /************************************************* |
2 | * Perl-Compatible Regular Expressions * |
3 | *************************************************/ |
4 | |
5 | /* PCRE is a library of functions to support regular expressions whose syntax |
6 | and semantics are as close as possible to those of the Perl 5 language. |
7 | |
8 | Written by Philip Hazel |
9 | Copyright (c) 1997-2013 University of Cambridge |
10 | |
11 | ----------------------------------------------------------------------------- |
12 | Redistribution and use in source and binary forms, with or without |
13 | modification, are permitted provided that the following conditions are met: |
14 | |
15 | * Redistributions of source code must retain the above copyright notice, |
16 | this list of conditions and the following disclaimer. |
17 | |
18 | * Redistributions in binary form must reproduce the above copyright |
19 | notice, this list of conditions and the following disclaimer in the |
20 | documentation and/or other materials provided with the distribution. |
21 | |
22 | * Neither the name of the University of Cambridge nor the names of its |
23 | contributors may be used to endorse or promote products derived from |
24 | this software without specific prior written permission. |
25 | |
26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
27 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
28 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
29 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
30 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
31 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
32 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
33 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
34 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
35 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
36 | POSSIBILITY OF SUCH DAMAGE. |
37 | ----------------------------------------------------------------------------- |
38 | */ |
39 | |
40 | |
41 | /* This module contains an internal function for validating UTF-8 character |
42 | strings. */ |
43 | |
44 | |
45 | #include "pcre_config.h" |
46 | #include "pcre_internal.h" |
47 | |
48 | |
49 | /************************************************* |
50 | * Validate a UTF-8 string * |
51 | *************************************************/ |
52 | |
53 | /* This function is called (optionally) at the start of compile or match, to |
54 | check that a supposed UTF-8 string is actually valid. The early check means |
55 | that subsequent code can assume it is dealing with a valid string. The check |
56 | can be turned off for maximum performance, but the consequences of supplying an |
57 | invalid string are then undefined. |
58 | |
59 | Originally, this function checked according to RFC 2279, allowing for values in |
60 | the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were in |
61 | the canonical format. Once somebody had pointed out RFC 3629 to me (it |
62 | obsoletes 2279), additional restrictions were applied. The values are now |
63 | limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the |
64 | subrange 0xd800 to 0xdfff is excluded. However, the format of 5-byte and 6-byte |
65 | characters is still checked. |
66 | |
67 | From release 8.13 more information about the details of the error is passed |
68 | back in the returned value: |
69 | |
70 | PCRE_UTF8_ERR0 No error |
71 | PCRE_UTF8_ERR1 Missing 1 byte at the end of the string |
72 | PCRE_UTF8_ERR2 Missing 2 bytes at the end of the string |
73 | PCRE_UTF8_ERR3 Missing 3 bytes at the end of the string |
74 | PCRE_UTF8_ERR4 Missing 4 bytes at the end of the string |
75 | PCRE_UTF8_ERR5 Missing 5 bytes at the end of the string |
76 | PCRE_UTF8_ERR6 2nd-byte's two top bits are not 0x80 |
77 | PCRE_UTF8_ERR7 3rd-byte's two top bits are not 0x80 |
78 | PCRE_UTF8_ERR8 4th-byte's two top bits are not 0x80 |
79 | PCRE_UTF8_ERR9 5th-byte's two top bits are not 0x80 |
80 | PCRE_UTF8_ERR10 6th-byte's two top bits are not 0x80 |
81 | PCRE_UTF8_ERR11 5-byte character is not permitted by RFC 3629 |
82 | PCRE_UTF8_ERR12 6-byte character is not permitted by RFC 3629 |
83 | PCRE_UTF8_ERR13 4-byte character with value > 0x10ffff is not permitted |
84 | PCRE_UTF8_ERR14 3-byte character with value 0xd800-0xdfff is not permitted |
85 | PCRE_UTF8_ERR15 Overlong 2-byte sequence |
86 | PCRE_UTF8_ERR16 Overlong 3-byte sequence |
87 | PCRE_UTF8_ERR17 Overlong 4-byte sequence |
88 | PCRE_UTF8_ERR18 Overlong 5-byte sequence (won't ever occur) |
89 | PCRE_UTF8_ERR19 Overlong 6-byte sequence (won't ever occur) |
90 | PCRE_UTF8_ERR20 Isolated 0x80 byte (not within UTF-8 character) |
91 | PCRE_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff |
92 | PCRE_UTF8_ERR22 Unused (was non-character) |
93 | |
94 | Arguments: |
95 | string points to the string |
96 | length length of string, or -1 if the string is zero-terminated |
97 | erroroffset pointer to an error position offset variable |
98 | |
99 | Returns: = 0 if the string is a valid UTF-8 string |
100 | > 0 otherwise, setting the offset of the bad character |
101 | */ |
102 | |
103 | int |
104 | PRIV(valid_utf)(PCRE_PUCHAR string, int length, int *erroroffset) |
105 | { |
106 | #ifdef SUPPORT_UTF |
107 | register PCRE_PUCHAR p; |
108 | |
109 | if (length < 0) |
110 | { |
111 | for (p = string; *p != 0; p++); |
112 | length = (int)(p - string); |
113 | } |
114 | |
115 | for (p = string; length-- > 0; p++) |
116 | { |
117 | register pcre_uchar ab, c, d; |
118 | |
119 | c = *p; |
120 | if (c < 128) continue; /* ASCII character */ |
121 | |
122 | if (c < 0xc0) /* Isolated 10xx xxxx byte */ |
123 | { |
124 | *erroroffset = (int)(p - string); |
125 | return PCRE_UTF8_ERR20; |
126 | } |
127 | |
128 | if (c >= 0xfe) /* Invalid 0xfe or 0xff bytes */ |
129 | { |
130 | *erroroffset = (int)(p - string); |
131 | return PCRE_UTF8_ERR21; |
132 | } |
133 | |
134 | ab = PRIV(utf8_table4)[c & 0x3f]; /* Number of additional bytes */ |
135 | if (length < ab) |
136 | { |
137 | *erroroffset = (int)(p - string); /* Missing bytes */ |
138 | return ab - length; /* Codes ERR1 to ERR5 */ |
139 | } |
140 | length -= ab; /* Length remaining */ |
141 | |
142 | /* Check top bits in the second byte */ |
143 | |
144 | if (((d = *(++p)) & 0xc0) != 0x80) |
145 | { |
146 | *erroroffset = (int)(p - string) - 1; |
147 | return PCRE_UTF8_ERR6; |
148 | } |
149 | |
150 | /* For each length, check that the remaining bytes start with the 0x80 bit |
151 | set and not the 0x40 bit. Then check for an overlong sequence, and for the |
152 | excluded range 0xd800 to 0xdfff. */ |
153 | |
154 | switch (ab) |
155 | { |
156 | /* 2-byte character. No further bytes to check for 0x80. Check first byte |
157 | for for xx00 000x (overlong sequence). */ |
158 | |
159 | case 1: if ((c & 0x3e) == 0) |
160 | { |
161 | *erroroffset = (int)(p - string) - 1; |
162 | return PCRE_UTF8_ERR15; |
163 | } |
164 | break; |
165 | |
166 | /* 3-byte character. Check third byte for 0x80. Then check first 2 bytes |
167 | for 1110 0000, xx0x xxxx (overlong sequence) or |
168 | 1110 1101, 1010 xxxx (0xd800 - 0xdfff) */ |
169 | |
170 | case 2: |
171 | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ |
172 | { |
173 | *erroroffset = (int)(p - string) - 2; |
174 | return PCRE_UTF8_ERR7; |
175 | } |
176 | if (c == 0xe0 && (d & 0x20) == 0) |
177 | { |
178 | *erroroffset = (int)(p - string) - 2; |
179 | return PCRE_UTF8_ERR16; |
180 | } |
181 | if (c == 0xed && d >= 0xa0) |
182 | { |
183 | *erroroffset = (int)(p - string) - 2; |
184 | return PCRE_UTF8_ERR14; |
185 | } |
186 | break; |
187 | |
188 | /* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2 |
189 | bytes for for 1111 0000, xx00 xxxx (overlong sequence), then check for a |
190 | character greater than 0x0010ffff (f4 8f bf bf) */ |
191 | |
192 | case 3: |
193 | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ |
194 | { |
195 | *erroroffset = (int)(p - string) - 2; |
196 | return PCRE_UTF8_ERR7; |
197 | } |
198 | if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ |
199 | { |
200 | *erroroffset = (int)(p - string) - 3; |
201 | return PCRE_UTF8_ERR8; |
202 | } |
203 | if (c == 0xf0 && (d & 0x30) == 0) |
204 | { |
205 | *erroroffset = (int)(p - string) - 3; |
206 | return PCRE_UTF8_ERR17; |
207 | } |
208 | if (c > 0xf4 || (c == 0xf4 && d > 0x8f)) |
209 | { |
210 | *erroroffset = (int)(p - string) - 3; |
211 | return PCRE_UTF8_ERR13; |
212 | } |
213 | break; |
214 | |
215 | /* 5-byte and 6-byte characters are not allowed by RFC 3629, and will be |
216 | rejected by the length test below. However, we do the appropriate tests |
217 | here so that overlong sequences get diagnosed, and also in case there is |
218 | ever an option for handling these larger code points. */ |
219 | |
220 | /* 5-byte character. Check 3rd, 4th, and 5th bytes for 0x80. Then check for |
221 | 1111 1000, xx00 0xxx */ |
222 | |
223 | case 4: |
224 | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ |
225 | { |
226 | *erroroffset = (int)(p - string) - 2; |
227 | return PCRE_UTF8_ERR7; |
228 | } |
229 | if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ |
230 | { |
231 | *erroroffset = (int)(p - string) - 3; |
232 | return PCRE_UTF8_ERR8; |
233 | } |
234 | if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */ |
235 | { |
236 | *erroroffset = (int)(p - string) - 4; |
237 | return PCRE_UTF8_ERR9; |
238 | } |
239 | if (c == 0xf8 && (d & 0x38) == 0) |
240 | { |
241 | *erroroffset = (int)(p - string) - 4; |
242 | return PCRE_UTF8_ERR18; |
243 | } |
244 | break; |
245 | |
246 | /* 6-byte character. Check 3rd-6th bytes for 0x80. Then check for |
247 | 1111 1100, xx00 00xx. */ |
248 | |
249 | case 5: |
250 | if ((*(++p) & 0xc0) != 0x80) /* Third byte */ |
251 | { |
252 | *erroroffset = (int)(p - string) - 2; |
253 | return PCRE_UTF8_ERR7; |
254 | } |
255 | if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */ |
256 | { |
257 | *erroroffset = (int)(p - string) - 3; |
258 | return PCRE_UTF8_ERR8; |
259 | } |
260 | if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */ |
261 | { |
262 | *erroroffset = (int)(p - string) - 4; |
263 | return PCRE_UTF8_ERR9; |
264 | } |
265 | if ((*(++p) & 0xc0) != 0x80) /* Sixth byte */ |
266 | { |
267 | *erroroffset = (int)(p - string) - 5; |
268 | return PCRE_UTF8_ERR10; |
269 | } |
270 | if (c == 0xfc && (d & 0x3c) == 0) |
271 | { |
272 | *erroroffset = (int)(p - string) - 5; |
273 | return PCRE_UTF8_ERR19; |
274 | } |
275 | break; |
276 | } |
277 | |
278 | /* Character is valid under RFC 2279, but 4-byte and 5-byte characters are |
279 | excluded by RFC 3629. The pointer p is currently at the last byte of the |
280 | character. */ |
281 | |
282 | if (ab > 3) |
283 | { |
284 | *erroroffset = (int)(p - string) - ab; |
285 | return (ab == 4)? PCRE_UTF8_ERR11 : PCRE_UTF8_ERR12; |
286 | } |
287 | } |
288 | |
289 | #else /* Not SUPPORT_UTF */ |
290 | (void)(string); /* Keep picky compilers happy */ |
291 | (void)(length); |
292 | (void)(erroroffset); |
293 | #endif |
294 | |
295 | return PCRE_UTF8_ERR0; /* This indicates success */ |
296 | } |
297 | |
298 | /* End of pcre_valid_utf8.c */ |
299 | |