1 | /* Copyright JS Foundation and other contributors, http://js.foundation |
2 | * |
3 | * Licensed under the Apache License, Version 2.0 (the "License"); |
4 | * you may not use this file except in compliance with the License. |
5 | * You may obtain a copy of the License at |
6 | * |
7 | * http://www.apache.org/licenses/LICENSE-2.0 |
8 | * |
9 | * Unless required by applicable law or agreed to in writing, software |
10 | * distributed under the License is distributed on an "AS IS" BASIS |
11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | * See the License for the specific language governing permissions and |
13 | * limitations under the License. |
14 | */ |
15 | |
16 | #include "lit-strings.h" |
17 | |
18 | #include "jrt-libc-includes.h" |
19 | |
20 | /** |
21 | * Validate utf-8 string |
22 | * |
23 | * NOTE: |
24 | * Isolated surrogates are allowed. |
25 | * Correct pair of surrogates is not allowed, it should be represented as 4-byte utf-8 character. |
26 | * |
27 | * @return true if utf-8 string is well-formed |
28 | * false otherwise |
29 | */ |
30 | bool |
31 | lit_is_valid_utf8_string (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */ |
32 | lit_utf8_size_t buf_size) /**< string size */ |
33 | { |
34 | lit_utf8_size_t idx = 0; |
35 | |
36 | bool is_prev_code_point_high_surrogate = false; |
37 | while (idx < buf_size) |
38 | { |
39 | lit_utf8_byte_t c = utf8_buf_p[idx++]; |
40 | if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER) |
41 | { |
42 | is_prev_code_point_high_surrogate = false; |
43 | continue; |
44 | } |
45 | |
46 | lit_code_point_t code_point = 0; |
47 | lit_code_point_t min_code_point = 0; |
48 | lit_utf8_size_t ; |
49 | if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER) |
50 | { |
51 | extra_bytes_count = 1; |
52 | min_code_point = LIT_UTF8_2_BYTE_CODE_POINT_MIN; |
53 | code_point = ((uint32_t) (c & LIT_UTF8_LAST_5_BITS_MASK)); |
54 | } |
55 | else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER) |
56 | { |
57 | extra_bytes_count = 2; |
58 | min_code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN; |
59 | code_point = ((uint32_t) (c & LIT_UTF8_LAST_4_BITS_MASK)); |
60 | } |
61 | else if ((c & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER) |
62 | { |
63 | extra_bytes_count = 3; |
64 | min_code_point = LIT_UTF8_4_BYTE_CODE_POINT_MIN; |
65 | code_point = ((uint32_t) (c & LIT_UTF8_LAST_3_BITS_MASK)); |
66 | } |
67 | else |
68 | { |
69 | /* utf-8 string could not contain 5- and 6-byte sequences. */ |
70 | return false; |
71 | } |
72 | |
73 | if (idx + extra_bytes_count > buf_size) |
74 | { |
75 | /* utf-8 string breaks in the middle */ |
76 | return false; |
77 | } |
78 | |
79 | for (lit_utf8_size_t offset = 0; offset < extra_bytes_count; ++offset) |
80 | { |
81 | c = utf8_buf_p[idx + offset]; |
82 | if ((c & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER) |
83 | { |
84 | /* invalid continuation byte */ |
85 | return false; |
86 | } |
87 | code_point <<= LIT_UTF8_BITS_IN_EXTRA_BYTES; |
88 | code_point |= (c & LIT_UTF8_LAST_6_BITS_MASK); |
89 | } |
90 | |
91 | if (code_point < min_code_point |
92 | || code_point > LIT_UNICODE_CODE_POINT_MAX) |
93 | { |
94 | /* utf-8 string doesn't encode valid unicode code point */ |
95 | return false; |
96 | } |
97 | |
98 | if (code_point >= LIT_UTF16_HIGH_SURROGATE_MIN |
99 | && code_point <= LIT_UTF16_HIGH_SURROGATE_MAX) |
100 | { |
101 | is_prev_code_point_high_surrogate = true; |
102 | } |
103 | else if (code_point >= LIT_UTF16_LOW_SURROGATE_MIN |
104 | && code_point <= LIT_UTF16_LOW_SURROGATE_MAX |
105 | && is_prev_code_point_high_surrogate) |
106 | { |
107 | /* sequence of high and low surrogate is not allowed */ |
108 | return false; |
109 | } |
110 | else |
111 | { |
112 | is_prev_code_point_high_surrogate = false; |
113 | } |
114 | |
115 | idx += extra_bytes_count; |
116 | } |
117 | |
118 | return true; |
119 | } /* lit_is_valid_utf8_string */ |
120 | |
121 | /** |
122 | * Validate cesu-8 string |
123 | * |
124 | * @return true if cesu-8 string is well-formed |
125 | * false otherwise |
126 | */ |
127 | bool |
128 | lit_is_valid_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */ |
129 | lit_utf8_size_t buf_size) /**< string size */ |
130 | { |
131 | lit_utf8_size_t idx = 0; |
132 | |
133 | while (idx < buf_size) |
134 | { |
135 | lit_utf8_byte_t c = cesu8_buf_p[idx++]; |
136 | if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER) |
137 | { |
138 | continue; |
139 | } |
140 | |
141 | lit_code_point_t code_point = 0; |
142 | lit_code_point_t min_code_point = 0; |
143 | lit_utf8_size_t ; |
144 | if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER) |
145 | { |
146 | extra_bytes_count = 1; |
147 | min_code_point = LIT_UTF8_2_BYTE_CODE_POINT_MIN; |
148 | code_point = ((uint32_t) (c & LIT_UTF8_LAST_5_BITS_MASK)); |
149 | } |
150 | else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER) |
151 | { |
152 | extra_bytes_count = 2; |
153 | min_code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN; |
154 | code_point = ((uint32_t) (c & LIT_UTF8_LAST_4_BITS_MASK)); |
155 | } |
156 | else |
157 | { |
158 | return false; |
159 | } |
160 | |
161 | if (idx + extra_bytes_count > buf_size) |
162 | { |
163 | /* cesu-8 string breaks in the middle */ |
164 | return false; |
165 | } |
166 | |
167 | for (lit_utf8_size_t offset = 0; offset < extra_bytes_count; ++offset) |
168 | { |
169 | c = cesu8_buf_p[idx + offset]; |
170 | if ((c & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER) |
171 | { |
172 | /* invalid continuation byte */ |
173 | return false; |
174 | } |
175 | code_point <<= LIT_UTF8_BITS_IN_EXTRA_BYTES; |
176 | code_point |= (c & LIT_UTF8_LAST_6_BITS_MASK); |
177 | } |
178 | |
179 | if (code_point < min_code_point) |
180 | { |
181 | /* cesu-8 string doesn't encode valid unicode code point */ |
182 | return false; |
183 | } |
184 | |
185 | idx += extra_bytes_count; |
186 | } |
187 | |
188 | return true; |
189 | } /* lit_is_valid_cesu8_string */ |
190 | |
191 | /** |
192 | * Check if the code point is UTF-16 low surrogate |
193 | * |
194 | * @return true / false |
195 | */ |
196 | bool |
197 | lit_is_code_point_utf16_low_surrogate (lit_code_point_t code_point) /**< code point */ |
198 | { |
199 | return LIT_UTF16_LOW_SURROGATE_MIN <= code_point && code_point <= LIT_UTF16_LOW_SURROGATE_MAX; |
200 | } /* lit_is_code_point_utf16_low_surrogate */ |
201 | |
202 | /** |
203 | * Check if the code point is UTF-16 high surrogate |
204 | * |
205 | * @return true / false |
206 | */ |
207 | bool |
208 | lit_is_code_point_utf16_high_surrogate (lit_code_point_t code_point) /**< code point */ |
209 | { |
210 | return LIT_UTF16_HIGH_SURROGATE_MIN <= code_point && code_point <= LIT_UTF16_HIGH_SURROGATE_MAX; |
211 | } /* lit_is_code_point_utf16_high_surrogate */ |
212 | |
213 | /** |
214 | * Represents code point (>0xFFFF) as surrogate pair and returns its lower part |
215 | * |
216 | * @return lower code_unit of the surrogate pair |
217 | */ |
218 | static ecma_char_t |
219 | convert_code_point_to_low_surrogate (lit_code_point_t code_point) /**< code point, should be > 0xFFFF */ |
220 | { |
221 | JERRY_ASSERT (code_point > LIT_UTF16_CODE_UNIT_MAX); |
222 | |
223 | ecma_char_t code_unit_bits; |
224 | code_unit_bits = (ecma_char_t) (code_point & LIT_UTF16_LAST_10_BITS_MASK); |
225 | |
226 | return (ecma_char_t) (LIT_UTF16_LOW_SURROGATE_MARKER | code_unit_bits); |
227 | } /* convert_code_point_to_low_surrogate */ |
228 | |
229 | /** |
230 | * Represents code point (>0xFFFF) as surrogate pair and returns its higher part |
231 | * |
232 | * @return higher code_unit of the surrogate pair |
233 | */ |
234 | static ecma_char_t |
235 | convert_code_point_to_high_surrogate (lit_code_point_t code_point) /**< code point, should be > 0xFFFF */ |
236 | { |
237 | JERRY_ASSERT (code_point > LIT_UTF16_CODE_UNIT_MAX); |
238 | JERRY_ASSERT (code_point <= LIT_UNICODE_CODE_POINT_MAX); |
239 | |
240 | ecma_char_t code_unit_bits; |
241 | code_unit_bits = (ecma_char_t) ((code_point - LIT_UTF16_FIRST_SURROGATE_CODE_POINT) >> LIT_UTF16_BITS_IN_SURROGATE); |
242 | |
243 | return (LIT_UTF16_HIGH_SURROGATE_MARKER | code_unit_bits); |
244 | } /* convert_code_point_to_high_surrogate */ |
245 | |
246 | /** |
247 | * UTF16 Encoding method for a code point |
248 | * |
249 | * See also: |
250 | * ECMA-262 v6, 10.1.1 |
251 | * |
252 | * @return uint8_t, the number of returning code points |
253 | */ |
254 | uint8_t |
255 | lit_utf16_encode_code_point (lit_code_point_t cp, /**< the code point we encode */ |
256 | ecma_char_t *cu_p) /**< result of the encoding */ |
257 | { |
258 | if (cp <= LIT_UTF16_CODE_UNIT_MAX) |
259 | { |
260 | cu_p[0] = (ecma_char_t) cp; |
261 | return 1; |
262 | } |
263 | |
264 | cu_p[0] = convert_code_point_to_high_surrogate (cp); |
265 | cu_p[1] = convert_code_point_to_low_surrogate (cp); |
266 | return 2; |
267 | } /* lit_utf16_encode_code_point */ |
268 | |
269 | /** |
270 | * Calculate size of a zero-terminated utf-8 string |
271 | * |
272 | * NOTE: |
273 | * - string cannot be NULL |
274 | * - string should not contain zero characters in the middle |
275 | * |
276 | * @return size of a string |
277 | */ |
278 | lit_utf8_size_t |
279 | lit_zt_utf8_string_size (const lit_utf8_byte_t *utf8_str_p) /**< zero-terminated utf-8 string */ |
280 | { |
281 | JERRY_ASSERT (utf8_str_p != NULL); |
282 | return (lit_utf8_size_t) strlen ((const char *) utf8_str_p); |
283 | } /* lit_zt_utf8_string_size */ |
284 | |
285 | /** |
286 | * Calculate length of a cesu-8 encoded string |
287 | * |
288 | * @return UTF-16 code units count |
289 | */ |
290 | lit_utf8_size_t |
291 | lit_utf8_string_length (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */ |
292 | lit_utf8_size_t utf8_buf_size) /**< string size */ |
293 | { |
294 | lit_utf8_size_t length = 0; |
295 | lit_utf8_size_t size = 0; |
296 | |
297 | while (size < utf8_buf_size) |
298 | { |
299 | size += lit_get_unicode_char_size_by_utf8_first_byte (*(utf8_buf_p + size)); |
300 | length++; |
301 | } |
302 | |
303 | JERRY_ASSERT (size == utf8_buf_size); |
304 | |
305 | return length; |
306 | } /* lit_utf8_string_length */ |
307 | |
308 | /** |
309 | * Calculate the required size of an utf-8 encoded string from cesu-8 encoded string |
310 | * |
311 | * @return size of an utf-8 encoded string |
312 | */ |
313 | lit_utf8_size_t |
314 | lit_get_utf8_size_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */ |
315 | lit_utf8_size_t cesu8_buf_size) /**< string size */ |
316 | { |
317 | lit_utf8_size_t offset = 0; |
318 | lit_utf8_size_t utf8_buf_size = cesu8_buf_size; |
319 | ecma_char_t prev_ch = 0; |
320 | |
321 | while (offset < cesu8_buf_size) |
322 | { |
323 | ecma_char_t ch; |
324 | offset += lit_read_code_unit_from_utf8 (cesu8_buf_p + offset, &ch); |
325 | |
326 | if (lit_is_code_point_utf16_low_surrogate (ch) && lit_is_code_point_utf16_high_surrogate (prev_ch)) |
327 | { |
328 | utf8_buf_size -= 2; |
329 | } |
330 | |
331 | prev_ch = ch; |
332 | } |
333 | |
334 | JERRY_ASSERT (offset == cesu8_buf_size); |
335 | |
336 | return utf8_buf_size; |
337 | } /* lit_get_utf8_size_of_cesu8_string */ |
338 | |
339 | /** |
340 | * Calculate length of an utf-8 encoded string from cesu-8 encoded string |
341 | * |
342 | * @return length of an utf-8 encoded string |
343 | */ |
344 | lit_utf8_size_t |
345 | lit_get_utf8_length_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */ |
346 | lit_utf8_size_t cesu8_buf_size) /**< string size */ |
347 | { |
348 | lit_utf8_size_t offset = 0; |
349 | lit_utf8_size_t utf8_length = 0; |
350 | ecma_char_t prev_ch = 0; |
351 | |
352 | while (offset < cesu8_buf_size) |
353 | { |
354 | ecma_char_t ch; |
355 | offset += lit_read_code_unit_from_utf8 (cesu8_buf_p + offset, &ch); |
356 | |
357 | if (!lit_is_code_point_utf16_low_surrogate (ch) || !lit_is_code_point_utf16_high_surrogate (prev_ch)) |
358 | { |
359 | utf8_length++; |
360 | } |
361 | |
362 | prev_ch = ch; |
363 | } |
364 | |
365 | JERRY_ASSERT (offset == cesu8_buf_size); |
366 | |
367 | return utf8_length; |
368 | } /* lit_get_utf8_length_of_cesu8_string */ |
369 | |
370 | /** |
371 | * Decodes a unicode code point from non-empty utf-8-encoded buffer |
372 | * |
373 | * @return number of bytes occupied by code point in the string |
374 | */ |
375 | lit_utf8_size_t |
376 | lit_read_code_point_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */ |
377 | lit_utf8_size_t buf_size, /**< size of the buffer in bytes */ |
378 | lit_code_point_t *code_point) /**< [out] code point */ |
379 | { |
380 | JERRY_ASSERT (buf_p && buf_size); |
381 | |
382 | lit_utf8_byte_t c = buf_p[0]; |
383 | if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER) |
384 | { |
385 | *code_point = (lit_code_point_t) (c & LIT_UTF8_LAST_7_BITS_MASK); |
386 | return 1; |
387 | } |
388 | |
389 | lit_code_point_t ret = LIT_UNICODE_CODE_POINT_NULL; |
390 | lit_utf8_size_t bytes_count = 0; |
391 | if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER) |
392 | { |
393 | bytes_count = 2; |
394 | ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_5_BITS_MASK)); |
395 | } |
396 | else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER) |
397 | { |
398 | bytes_count = 3; |
399 | ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_4_BITS_MASK)); |
400 | } |
401 | else |
402 | { |
403 | JERRY_ASSERT ((c & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER); |
404 | bytes_count = 4; |
405 | ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_3_BITS_MASK)); |
406 | } |
407 | |
408 | JERRY_ASSERT (buf_size >= bytes_count); |
409 | |
410 | for (uint32_t i = 1; i < bytes_count; ++i) |
411 | { |
412 | ret <<= LIT_UTF8_BITS_IN_EXTRA_BYTES; |
413 | ret |= (buf_p[i] & LIT_UTF8_LAST_6_BITS_MASK); |
414 | } |
415 | |
416 | *code_point = ret; |
417 | return bytes_count; |
418 | } /* lit_read_code_point_from_utf8 */ |
419 | |
420 | /** |
421 | * Decodes a unicode code unit from non-empty cesu-8-encoded buffer |
422 | * |
423 | * @return number of bytes occupied by code point in the string |
424 | */ |
425 | lit_utf8_size_t |
426 | lit_read_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */ |
427 | ecma_char_t *code_point) /**< [out] code point */ |
428 | { |
429 | JERRY_ASSERT (buf_p); |
430 | |
431 | lit_utf8_byte_t c = buf_p[0]; |
432 | if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER) |
433 | { |
434 | *code_point = (ecma_char_t) (c & LIT_UTF8_LAST_7_BITS_MASK); |
435 | return 1; |
436 | } |
437 | |
438 | lit_code_point_t ret = LIT_UNICODE_CODE_POINT_NULL; |
439 | lit_utf8_size_t bytes_count; |
440 | if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER) |
441 | { |
442 | bytes_count = 2; |
443 | ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_5_BITS_MASK)); |
444 | } |
445 | else |
446 | { |
447 | JERRY_ASSERT ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER); |
448 | bytes_count = 3; |
449 | ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_4_BITS_MASK)); |
450 | } |
451 | |
452 | for (uint32_t i = 1; i < bytes_count; ++i) |
453 | { |
454 | ret <<= LIT_UTF8_BITS_IN_EXTRA_BYTES; |
455 | ret |= (buf_p[i] & LIT_UTF8_LAST_6_BITS_MASK); |
456 | } |
457 | |
458 | JERRY_ASSERT (ret <= LIT_UTF16_CODE_UNIT_MAX); |
459 | *code_point = (ecma_char_t) ret; |
460 | return bytes_count; |
461 | } /* lit_read_code_unit_from_utf8 */ |
462 | |
463 | /** |
464 | * Decodes a unicode code unit from non-empty cesu-8-encoded buffer |
465 | * |
466 | * @return number of bytes occupied by code point in the string |
467 | */ |
468 | lit_utf8_size_t |
469 | lit_read_prev_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */ |
470 | ecma_char_t *code_point) /**< [out] code point */ |
471 | { |
472 | JERRY_ASSERT (buf_p); |
473 | |
474 | lit_utf8_decr (&buf_p); |
475 | return lit_read_code_unit_from_utf8 (buf_p, code_point); |
476 | } /* lit_read_prev_code_unit_from_utf8 */ |
477 | |
478 | /** |
479 | * Decodes a unicode code unit from non-empty cesu-8-encoded buffer |
480 | * |
481 | * @return next code unit |
482 | */ |
483 | ecma_char_t |
484 | lit_cesu8_read_next (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */ |
485 | { |
486 | JERRY_ASSERT (*buf_p); |
487 | ecma_char_t ch; |
488 | |
489 | *buf_p += lit_read_code_unit_from_utf8 (*buf_p, &ch); |
490 | |
491 | return ch; |
492 | } /* lit_cesu8_read_next */ |
493 | |
494 | /** |
495 | * Decodes a unicode code unit from non-empty cesu-8-encoded buffer |
496 | * |
497 | * @return previous code unit |
498 | */ |
499 | ecma_char_t |
500 | lit_cesu8_read_prev (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */ |
501 | { |
502 | JERRY_ASSERT (*buf_p); |
503 | ecma_char_t ch; |
504 | |
505 | lit_utf8_decr (buf_p); |
506 | lit_read_code_unit_from_utf8 (*buf_p, &ch); |
507 | |
508 | return ch; |
509 | } /* lit_cesu8_read_prev */ |
510 | |
511 | /** |
512 | * Decodes a unicode code unit from non-empty cesu-8-encoded buffer |
513 | * |
514 | * @return next code unit |
515 | */ |
516 | ecma_char_t JERRY_ATTR_NOINLINE |
517 | lit_cesu8_peek_next (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */ |
518 | { |
519 | JERRY_ASSERT (buf_p != NULL); |
520 | ecma_char_t ch; |
521 | |
522 | lit_read_code_unit_from_utf8 (buf_p, &ch); |
523 | |
524 | return ch; |
525 | } /* lit_cesu8_peek_next */ |
526 | |
527 | /** |
528 | * Decodes a unicode code unit from non-empty cesu-8-encoded buffer |
529 | * |
530 | * @return previous code unit |
531 | */ |
532 | ecma_char_t JERRY_ATTR_NOINLINE |
533 | lit_cesu8_peek_prev (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */ |
534 | { |
535 | JERRY_ASSERT (buf_p != NULL); |
536 | ecma_char_t ch; |
537 | |
538 | lit_read_prev_code_unit_from_utf8 (buf_p, &ch); |
539 | |
540 | return ch; |
541 | } /* lit_cesu8_peek_prev */ |
542 | |
543 | /** |
544 | * Increase cesu-8 encoded string pointer by one code unit. |
545 | */ |
546 | extern inline void JERRY_ATTR_ALWAYS_INLINE |
547 | lit_utf8_incr (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */ |
548 | { |
549 | JERRY_ASSERT (*buf_p); |
550 | |
551 | *buf_p += lit_get_unicode_char_size_by_utf8_first_byte (**buf_p); |
552 | } /* lit_utf8_incr */ |
553 | |
554 | /** |
555 | * Decrease cesu-8 encoded string pointer by one code unit. |
556 | */ |
557 | void |
558 | lit_utf8_decr (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */ |
559 | { |
560 | JERRY_ASSERT (*buf_p); |
561 | const lit_utf8_byte_t *current_p = *buf_p; |
562 | |
563 | do |
564 | { |
565 | current_p--; |
566 | } |
567 | while ((*(current_p) & LIT_UTF8_EXTRA_BYTE_MASK) == LIT_UTF8_EXTRA_BYTE_MARKER); |
568 | |
569 | *buf_p = current_p; |
570 | } /* lit_utf8_decr */ |
571 | |
572 | /** |
573 | * Calc hash using the specified hash_basis. |
574 | * |
575 | * NOTE: |
576 | * This is implementation of FNV-1a hash function, which is released into public domain. |
577 | * Constants used, are carefully picked primes by the authors. |
578 | * More info: http://www.isthe.com/chongo/tech/comp/fnv/ |
579 | * |
580 | * @return ecma-string's hash |
581 | */ |
582 | extern inline lit_string_hash_t JERRY_ATTR_ALWAYS_INLINE |
583 | lit_utf8_string_hash_combine (lit_string_hash_t hash_basis, /**< hash to be combined with */ |
584 | const lit_utf8_byte_t *utf8_buf_p, /**< characters buffer */ |
585 | lit_utf8_size_t utf8_buf_size) /**< number of characters in the buffer */ |
586 | { |
587 | JERRY_ASSERT (utf8_buf_p != NULL || utf8_buf_size == 0); |
588 | |
589 | uint32_t hash = hash_basis; |
590 | |
591 | for (uint32_t i = 0; i < utf8_buf_size; i++) |
592 | { |
593 | /* 16777619 is 32 bit FNV_prime = 2^24 + 2^8 + 0x93 = 16777619 */ |
594 | hash = (hash ^ utf8_buf_p[i]) * 16777619; |
595 | } |
596 | |
597 | return (lit_string_hash_t) hash; |
598 | } /* lit_utf8_string_hash_combine */ |
599 | |
600 | /** |
601 | * Calculate hash from the buffer. |
602 | * |
603 | * @return ecma-string's hash |
604 | */ |
605 | extern inline lit_string_hash_t JERRY_ATTR_ALWAYS_INLINE |
606 | lit_utf8_string_calc_hash (const lit_utf8_byte_t *utf8_buf_p, /**< characters buffer */ |
607 | lit_utf8_size_t utf8_buf_size) /**< number of characters in the buffer */ |
608 | { |
609 | JERRY_ASSERT (utf8_buf_p != NULL || utf8_buf_size == 0); |
610 | |
611 | /* 32 bit offset_basis for FNV = 2166136261 */ |
612 | return lit_utf8_string_hash_combine ((lit_string_hash_t) 2166136261, utf8_buf_p, utf8_buf_size); |
613 | } /* lit_utf8_string_calc_hash */ |
614 | |
615 | /** |
616 | * Return code unit at the specified position in string |
617 | * |
618 | * NOTE: |
619 | * code_unit_offset should be less then string's length |
620 | * |
621 | * @return code unit value |
622 | */ |
623 | ecma_char_t |
624 | lit_utf8_string_code_unit_at (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */ |
625 | lit_utf8_size_t utf8_buf_size, /**< string size in bytes */ |
626 | lit_utf8_size_t code_unit_offset) /**< ofset of a code_unit */ |
627 | { |
628 | lit_utf8_byte_t *current_p = (lit_utf8_byte_t *) utf8_buf_p; |
629 | ecma_char_t code_unit; |
630 | |
631 | do |
632 | { |
633 | JERRY_ASSERT (current_p < utf8_buf_p + utf8_buf_size); |
634 | current_p += lit_read_code_unit_from_utf8 (current_p, &code_unit); |
635 | } |
636 | while (code_unit_offset--); |
637 | |
638 | return code_unit; |
639 | } /* lit_utf8_string_code_unit_at */ |
640 | |
641 | /** |
642 | * Get CESU-8 encoded size of character |
643 | * |
644 | * @return number of bytes occupied in CESU-8 |
645 | */ |
646 | extern inline lit_utf8_size_t JERRY_ATTR_ALWAYS_INLINE |
647 | lit_get_unicode_char_size_by_utf8_first_byte (const lit_utf8_byte_t first_byte) /**< buffer with characters */ |
648 | { |
649 | if ((first_byte & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER) |
650 | { |
651 | return 1; |
652 | } |
653 | else if ((first_byte & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER) |
654 | { |
655 | return 2; |
656 | } |
657 | else |
658 | { |
659 | JERRY_ASSERT ((first_byte & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER); |
660 | return 3; |
661 | } |
662 | } /* lit_get_unicode_char_size_by_utf8_first_byte */ |
663 | |
664 | /** |
665 | * Convert code unit to cesu-8 representation |
666 | * |
667 | * @return byte count required to represent the code unit |
668 | */ |
669 | lit_utf8_size_t |
670 | lit_code_unit_to_utf8 (ecma_char_t code_unit, /**< code unit */ |
671 | lit_utf8_byte_t *buf_p) /**< buffer where to store the result and its size |
672 | * should be at least LIT_UTF8_MAX_BYTES_IN_CODE_UNIT */ |
673 | { |
674 | if (code_unit <= LIT_UTF8_1_BYTE_CODE_POINT_MAX) |
675 | { |
676 | buf_p[0] = (lit_utf8_byte_t) code_unit; |
677 | return 1; |
678 | } |
679 | else if (code_unit <= LIT_UTF8_2_BYTE_CODE_POINT_MAX) |
680 | { |
681 | uint32_t code_unit_bits = code_unit; |
682 | lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK); |
683 | code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; |
684 | |
685 | lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_5_BITS_MASK); |
686 | JERRY_ASSERT (first_byte_bits == code_unit_bits); |
687 | |
688 | buf_p[0] = LIT_UTF8_2_BYTE_MARKER | first_byte_bits; |
689 | buf_p[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits; |
690 | return 2; |
691 | } |
692 | else |
693 | { |
694 | uint32_t code_unit_bits = code_unit; |
695 | lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK); |
696 | code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; |
697 | |
698 | lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK); |
699 | code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; |
700 | |
701 | lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_4_BITS_MASK); |
702 | JERRY_ASSERT (first_byte_bits == code_unit_bits); |
703 | |
704 | buf_p[0] = LIT_UTF8_3_BYTE_MARKER | first_byte_bits; |
705 | buf_p[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits; |
706 | buf_p[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits; |
707 | return 3; |
708 | } |
709 | } /* lit_code_unit_to_utf8 */ |
710 | |
711 | /** |
712 | * Convert code point to cesu-8 representation |
713 | * |
714 | * @return byte count required to represent the code point |
715 | */ |
716 | lit_utf8_size_t |
717 | lit_code_point_to_cesu8 (lit_code_point_t code_point, /**< code point */ |
718 | lit_utf8_byte_t *buf) /**< buffer where to store the result, |
719 | * its size should be at least 6 bytes */ |
720 | { |
721 | if (code_point <= LIT_UTF16_CODE_UNIT_MAX) |
722 | { |
723 | return lit_code_unit_to_utf8 ((ecma_char_t) code_point, buf); |
724 | } |
725 | else |
726 | { |
727 | lit_utf8_size_t offset = lit_code_unit_to_utf8 (convert_code_point_to_high_surrogate (code_point), buf); |
728 | offset += lit_code_unit_to_utf8 (convert_code_point_to_low_surrogate (code_point), buf + offset); |
729 | return offset; |
730 | } |
731 | } /* lit_code_point_to_cesu8 */ |
732 | |
733 | /** |
734 | * Convert code point to utf-8 representation |
735 | * |
736 | * @return byte count required to represent the code point |
737 | */ |
738 | lit_utf8_size_t |
739 | lit_code_point_to_utf8 (lit_code_point_t code_point, /**< code point */ |
740 | lit_utf8_byte_t *buf) /**< buffer where to store the result, |
741 | * its size should be at least 4 bytes */ |
742 | { |
743 | if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX) |
744 | { |
745 | buf[0] = (lit_utf8_byte_t) code_point; |
746 | return 1; |
747 | } |
748 | else if (code_point <= LIT_UTF8_2_BYTE_CODE_POINT_MAX) |
749 | { |
750 | uint32_t code_point_bits = code_point; |
751 | lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK); |
752 | code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; |
753 | |
754 | lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_5_BITS_MASK); |
755 | JERRY_ASSERT (first_byte_bits == code_point_bits); |
756 | |
757 | buf[0] = LIT_UTF8_2_BYTE_MARKER | first_byte_bits; |
758 | buf[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits; |
759 | return 2; |
760 | } |
761 | else if (code_point <= LIT_UTF8_3_BYTE_CODE_POINT_MAX) |
762 | { |
763 | uint32_t code_point_bits = code_point; |
764 | lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK); |
765 | code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; |
766 | |
767 | lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK); |
768 | code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; |
769 | |
770 | lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_4_BITS_MASK); |
771 | JERRY_ASSERT (first_byte_bits == code_point_bits); |
772 | |
773 | buf[0] = LIT_UTF8_3_BYTE_MARKER | first_byte_bits; |
774 | buf[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits; |
775 | buf[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits; |
776 | return 3; |
777 | } |
778 | else |
779 | { |
780 | JERRY_ASSERT (code_point <= LIT_UTF8_4_BYTE_CODE_POINT_MAX); |
781 | |
782 | uint32_t code_point_bits = code_point; |
783 | lit_utf8_byte_t fourth_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK); |
784 | code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; |
785 | |
786 | lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK); |
787 | code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; |
788 | |
789 | lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK); |
790 | code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES; |
791 | |
792 | lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_3_BITS_MASK); |
793 | JERRY_ASSERT (first_byte_bits == code_point_bits); |
794 | |
795 | buf[0] = LIT_UTF8_4_BYTE_MARKER | first_byte_bits; |
796 | buf[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits; |
797 | buf[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits; |
798 | buf[3] = LIT_UTF8_EXTRA_BYTE_MARKER | fourth_byte_bits; |
799 | return 4; |
800 | } |
801 | } /* lit_code_point_to_utf8 */ |
802 | |
803 | /** |
804 | * Convert cesu-8 string to an utf-8 string and put it into the buffer. |
805 | * It is the caller's responsibility to make sure that the string fits in the buffer. |
806 | * |
807 | * @return number of bytes copied to the buffer. |
808 | */ |
809 | lit_utf8_size_t |
810 | lit_convert_cesu8_string_to_utf8_string (const lit_utf8_byte_t *cesu8_string, /**< cesu-8 string */ |
811 | lit_utf8_size_t cesu8_size, /**< size of cesu-8 string */ |
812 | lit_utf8_byte_t *utf8_string, /**< destination utf-8 buffer pointer |
813 | * (can be NULL if buffer_size == 0) */ |
814 | lit_utf8_size_t utf8_size) /**< size of utf-8 buffer */ |
815 | { |
816 | const lit_utf8_byte_t *cesu8_pos = cesu8_string; |
817 | const lit_utf8_byte_t *cesu8_end_pos = cesu8_string + cesu8_size; |
818 | |
819 | lit_utf8_byte_t *utf8_pos = utf8_string; |
820 | lit_utf8_byte_t *utf8_end_pos = utf8_string + utf8_size; |
821 | |
822 | lit_utf8_size_t size = 0; |
823 | |
824 | ecma_char_t prev_ch = 0; |
825 | lit_utf8_size_t prev_ch_size = 0; |
826 | |
827 | while (cesu8_pos < cesu8_end_pos) |
828 | { |
829 | ecma_char_t ch; |
830 | lit_utf8_size_t code_unit_size = lit_read_code_unit_from_utf8 (cesu8_pos, &ch); |
831 | |
832 | if (lit_is_code_point_utf16_low_surrogate (ch) && lit_is_code_point_utf16_high_surrogate (prev_ch)) |
833 | { |
834 | JERRY_ASSERT (code_unit_size == prev_ch_size); |
835 | utf8_pos -= prev_ch_size; |
836 | lit_code_point_t code_point = lit_convert_surrogate_pair_to_code_point (prev_ch, ch); |
837 | lit_code_point_to_utf8 (code_point, utf8_pos); |
838 | size++; |
839 | } |
840 | else |
841 | { |
842 | memcpy (utf8_pos, cesu8_pos, code_unit_size); |
843 | size += code_unit_size; |
844 | } |
845 | |
846 | utf8_pos = utf8_string + size; |
847 | cesu8_pos += code_unit_size; |
848 | prev_ch = ch; |
849 | prev_ch_size = code_unit_size; |
850 | } |
851 | |
852 | JERRY_ASSERT (cesu8_pos == cesu8_end_pos); |
853 | JERRY_ASSERT (utf8_pos <= utf8_end_pos); |
854 | |
855 | return size; |
856 | } /* lit_convert_cesu8_string_to_utf8_string */ |
857 | |
858 | /** |
859 | * Convert surrogate pair to code point |
860 | * |
861 | * @return code point |
862 | */ |
863 | lit_code_point_t |
864 | lit_convert_surrogate_pair_to_code_point (ecma_char_t high_surrogate, /**< high surrogate code point */ |
865 | ecma_char_t low_surrogate) /**< low surrogate code point */ |
866 | { |
867 | JERRY_ASSERT (lit_is_code_point_utf16_high_surrogate (high_surrogate)); |
868 | JERRY_ASSERT (lit_is_code_point_utf16_low_surrogate (low_surrogate)); |
869 | |
870 | lit_code_point_t code_point; |
871 | code_point = (uint16_t) (high_surrogate - LIT_UTF16_HIGH_SURROGATE_MIN); |
872 | code_point <<= LIT_UTF16_BITS_IN_SURROGATE; |
873 | |
874 | code_point += LIT_UTF16_FIRST_SURROGATE_CODE_POINT; |
875 | |
876 | code_point |= (uint16_t) (low_surrogate - LIT_UTF16_LOW_SURROGATE_MIN); |
877 | return code_point; |
878 | } /* lit_convert_surrogate_pair_to_code_point */ |
879 | |
880 | /** |
881 | * Relational compare of cesu-8 strings |
882 | * |
883 | * First string is less than second string if: |
884 | * - strings are not equal; |
885 | * - first string is prefix of second or is lexicographically less than second. |
886 | * |
887 | * @return true - if first string is less than second string, |
888 | * false - otherwise |
889 | */ |
890 | bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, /**< utf-8 string */ |
891 | lit_utf8_size_t string1_size, /**< string size */ |
892 | const lit_utf8_byte_t *string2_p, /**< utf-8 string */ |
893 | lit_utf8_size_t string2_size) /**< string size */ |
894 | { |
895 | lit_utf8_byte_t *string1_pos = (lit_utf8_byte_t *) string1_p; |
896 | lit_utf8_byte_t *string2_pos = (lit_utf8_byte_t *) string2_p; |
897 | const lit_utf8_byte_t *string1_end_p = string1_p + string1_size; |
898 | const lit_utf8_byte_t *string2_end_p = string2_p + string2_size; |
899 | |
900 | while (string1_pos < string1_end_p && string2_pos < string2_end_p) |
901 | { |
902 | ecma_char_t ch1, ch2; |
903 | string1_pos += lit_read_code_unit_from_utf8 (string1_pos, &ch1); |
904 | string2_pos += lit_read_code_unit_from_utf8 (string2_pos, &ch2); |
905 | |
906 | if (ch1 < ch2) |
907 | { |
908 | return true; |
909 | } |
910 | else if (ch1 > ch2) |
911 | { |
912 | return false; |
913 | } |
914 | } |
915 | |
916 | return (string1_pos >= string1_end_p && string2_pos < string2_end_p); |
917 | } /* lit_compare_utf8_strings_relational */ |
918 | |