1 | /* Copyright JS Foundation and other contributors, http://js.foundation |
2 | * |
3 | * Licensed under the Apache License, Version 2.0 (the "License"); |
4 | * you may not use this file except in compliance with the License. |
5 | * You may obtain a copy of the License at |
6 | * |
7 | * http://www.apache.org/licenses/LICENSE-2.0 |
8 | * |
9 | * Unless required by applicable law or agreed to in writing, software |
10 | * distributed under the License is distributed on an "AS IS" BASIS |
11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | * See the License for the specific language governing permissions and |
13 | * limitations under the License. |
14 | */ |
15 | |
16 | #include "config.h" |
17 | #include "ecma-helpers.h" |
18 | #include "lit-char-helpers.h" |
19 | #include "lit-strings.h" |
20 | #include "lit-unicode-ranges.inc.h" |
21 | #if ENABLED (JERRY_ESNEXT) |
22 | #include "lit-unicode-ranges-sup.inc.h" |
23 | #endif /* ENABLED (JERRY_ESNEXT) */ |
24 | |
25 | #if ENABLED (JERRY_UNICODE_CASE_CONVERSION) |
26 | #include "lit-unicode-conversions.inc.h" |
27 | #if ENABLED (JERRY_ESNEXT) |
28 | #include "lit-unicode-conversions-sup.inc.h" |
29 | #include "lit-unicode-folding.inc.h" |
30 | #endif /* ENABLED (JERRY_ESNEXT) */ |
31 | #endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */ |
32 | |
/**
 * Number of elements of a statically sized array.
 *
 * Note: the argument must be an actual array, the result is meaningless for a pointer.
 */
#define NUM_OF_ELEMENTS(array) (sizeof (array) / sizeof (*(array)))
34 | |
/**
 * Generator of binary search functions that look up a character
 * in the given char array.
 *
 * Note: the lookup halves the search window by value comparison,
 *       so the array is expected to be ordered ascending.
 *
 * The generated function:
 * @return true - if the character is in the given array
 *         false - otherwise
 */
#define LIT_SEARCH_CHAR_IN_ARRAY_FN(function_name, char_type, array_type) \
static bool \
function_name (char_type c, /**< code unit */ \
               const array_type *array, /**< array */ \
               int size_of_array) /**< length of the array */ \
{ \
  int low = 0; \
  int high = size_of_array - 1; \
  \
  while (low <= high) \
  { \
    const int mid = (low + high) / 2; \
    const char_type probe = array[mid]; \
    \
    if (c < probe) \
    { \
      high = mid - 1; \
    } \
    else if (probe == c) \
    { \
      return true; \
    } \
    else \
    { \
      low = mid + 1; \
    } \
  } \
  \
  return false; \
} /* function_name */
73 | |
/* Binary search over uint16_t tables, the searched value is an ecma_char_t. */
LIT_SEARCH_CHAR_IN_ARRAY_FN (lit_search_char_in_array, ecma_char_t, uint16_t)

#if ENABLED (JERRY_ESNEXT)
/* Binary search over uint32_t tables (used with the "_sup" tables), the searched value is a lit_code_point_t. */
LIT_SEARCH_CHAR_IN_ARRAY_FN (lit_search_codepoint_in_array, lit_code_point_t, uint32_t)
#endif /* ENABLED (JERRY_ESNEXT) */
79 | |
/**
 * Generator of binary search functions that look up a character in a set of intervals.
 * The intervals are specified by two parallel arrays: the first one contains the starting
 * points of the intervals, the second one contains their lengths. An interval covers
 * every value from its starting point up to starting point + length (both inclusive).
 *
 * The generated function:
 * @return true - if the character is included in one of the intervals
 *         false - otherwise
 */
#define LIT_SEARCH_CHAR_IN_INTERVAL_ARRAY_FN(function_name, char_type, array_type, interval_type) \
static bool \
function_name (char_type c, /**< code unit */ \
               const array_type *array_sp, /**< array of interval starting points */ \
               const interval_type *lengths, /**< array of interval lengths */ \
               int size_of_array) /**< length of the array */ \
{ \
  int low = 0; \
  int high = size_of_array - 1; \
  \
  while (low <= high) \
  { \
    const int mid = (low + high) / 2; \
    const char_type first = array_sp[mid]; \
    const bool is_inside = (first <= c) && (c <= first + lengths[mid]); \
    \
    if (is_inside) \
    { \
      return true; \
    } \
    \
    if (c <= first) \
    { \
      high = mid - 1; \
    } \
    else \
    { \
      low = mid + 1; \
    } \
  } \
  \
  return false; \
} /* function_name */
120 | |
/* Interval search over uint16_t starting points with uint8_t lengths. */
LIT_SEARCH_CHAR_IN_INTERVAL_ARRAY_FN (lit_search_char_in_interval_array, ecma_char_t, uint16_t, uint8_t)

#if ENABLED (JERRY_ESNEXT)
/* Interval search over uint32_t starting points with uint16_t lengths (used with the "_sup" tables). */
LIT_SEARCH_CHAR_IN_INTERVAL_ARRAY_FN (lit_search_codepoint_in_interval_array, lit_code_point_t, uint32_t, uint16_t)
#endif /* ENABLED (JERRY_ESNEXT) */
126 | |
127 | /** |
128 | * Check if specified character is one of the Whitespace characters including those that fall into |
129 | * "Space, Separator" ("Zs") Unicode character category or one of the Line Terminator characters. |
130 | * |
131 | * @return true - if the character is one of characters, listed in ECMA-262 v5, Table 2, |
132 | * false - otherwise |
133 | */ |
134 | bool |
135 | lit_char_is_white_space (lit_code_point_t c) /**< code point */ |
136 | { |
137 | if (c <= LIT_UTF8_1_BYTE_CODE_POINT_MAX) |
138 | { |
139 | return (c == LIT_CHAR_SP || (c >= LIT_CHAR_TAB && c <= LIT_CHAR_CR)); |
140 | } |
141 | |
142 | if (c == LIT_CHAR_BOM |
143 | #if !ENABLED (JERRY_ESNEXT) |
144 | /* Mongolian Vowel Separator (u180e) used to be a whitespace character. */ |
145 | || c == LIT_CHAR_MVS |
146 | #endif /* !ENABLED (JERRY_ESNEXT) */ |
147 | || c == LIT_CHAR_LS |
148 | || c == LIT_CHAR_PS) |
149 | { |
150 | return true; |
151 | } |
152 | |
153 | return (c <= LIT_UTF16_CODE_UNIT_MAX |
154 | && ((c >= lit_unicode_white_space_interval_starts[0] |
155 | && c <= lit_unicode_white_space_interval_starts[0] + lit_unicode_white_space_interval_lengths[0]) |
156 | || lit_search_char_in_array ((ecma_char_t) c, |
157 | lit_unicode_white_space_chars, |
158 | NUM_OF_ELEMENTS (lit_unicode_white_space_chars)))); |
159 | } /* lit_char_is_white_space */ |
160 | |
161 | /** |
162 | * Check if specified character is one of LineTerminator characters |
163 | * |
164 | * @return true - if the character is one of characters, listed in ECMA-262 v5, Table 3, |
165 | * false - otherwise |
166 | */ |
167 | bool |
168 | lit_char_is_line_terminator (ecma_char_t c) /**< code unit */ |
169 | { |
170 | return (c == LIT_CHAR_LF |
171 | || c == LIT_CHAR_CR |
172 | || c == LIT_CHAR_LS |
173 | || c == LIT_CHAR_PS); |
174 | } /* lit_char_is_line_terminator */ |
175 | |
176 | /** |
177 | * Check if specified character is a Unicode ID_Start |
178 | * |
179 | * See also: |
180 | * ECMA-262 v1, 11.6: UnicodeIDStart |
181 | * |
182 | * @return true - if the codepoint has Unicode property "ID_Start" |
183 | * false - otherwise |
184 | */ |
185 | static bool |
186 | lit_char_is_unicode_id_start (lit_code_point_t code_point) /**< code unit */ |
187 | { |
188 | #if ENABLED (JERRY_ESNEXT) |
189 | if (JERRY_UNLIKELY (code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN)) |
190 | { |
191 | return (lit_search_codepoint_in_interval_array (code_point, |
192 | lit_unicode_id_start_interval_starts_sup, |
193 | lit_unicode_id_start_interval_lengths_sup, |
194 | NUM_OF_ELEMENTS (lit_unicode_id_start_interval_starts_sup)) |
195 | || lit_search_codepoint_in_array (code_point, |
196 | lit_unicode_id_start_chars_sup, |
197 | NUM_OF_ELEMENTS (lit_unicode_id_start_chars_sup))); |
198 | } |
199 | #else /* !ENABLED (JERRY_ESNEXT) */ |
200 | JERRY_ASSERT (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN); |
201 | #endif /* ENABLED (JERRY_ESNEXT) */ |
202 | |
203 | ecma_char_t c = (ecma_char_t) code_point; |
204 | |
205 | return (lit_search_char_in_interval_array (c, |
206 | lit_unicode_id_start_interval_starts, |
207 | lit_unicode_id_start_interval_lengths, |
208 | NUM_OF_ELEMENTS (lit_unicode_id_start_interval_starts)) |
209 | || lit_search_char_in_array (c, lit_unicode_id_start_chars, NUM_OF_ELEMENTS (lit_unicode_id_start_chars))); |
210 | } /* lit_char_is_unicode_id_start */ |
211 | |
212 | /** |
213 | * Check if specified character is a Unicode ID_Continue |
214 | * |
215 | * See also: |
216 | * ECMA-262 v1, 11.6: UnicodeIDContinue |
217 | * |
218 | * @return true - if the codepoint has Unicode property "ID_Continue" |
219 | * false - otherwise |
220 | */ |
221 | static bool |
222 | lit_char_is_unicode_id_continue (lit_code_point_t code_point) /**< code unit */ |
223 | { |
224 | /* Each ID_Start codepoint is ID_Continue as well. */ |
225 | if (lit_char_is_unicode_id_start (code_point)) |
226 | { |
227 | return true; |
228 | } |
229 | |
230 | #if ENABLED (JERRY_ESNEXT) |
231 | if (JERRY_UNLIKELY (code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN)) |
232 | { |
233 | return (lit_search_codepoint_in_interval_array (code_point, |
234 | lit_unicode_id_continue_interval_starts_sup, |
235 | lit_unicode_id_continue_interval_lengths_sup, |
236 | NUM_OF_ELEMENTS (lit_unicode_id_continue_interval_starts_sup)) |
237 | || lit_search_codepoint_in_array (code_point, |
238 | lit_unicode_id_continue_chars_sup, |
239 | NUM_OF_ELEMENTS (lit_unicode_id_continue_chars_sup))); |
240 | } |
241 | #else /* !ENABLED (JERRY_ESNEXT) */ |
242 | JERRY_ASSERT (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN); |
243 | #endif /* ENABLED (JERRY_ESNEXT) */ |
244 | |
245 | ecma_char_t c = (ecma_char_t) code_point; |
246 | |
247 | return (lit_search_char_in_interval_array (c, |
248 | lit_unicode_id_continue_interval_starts, |
249 | lit_unicode_id_continue_interval_lengths, |
250 | NUM_OF_ELEMENTS (lit_unicode_id_continue_interval_starts)) |
251 | || lit_search_char_in_array (c, |
252 | lit_unicode_id_continue_chars, |
253 | NUM_OF_ELEMENTS (lit_unicode_id_continue_chars))); |
254 | } /* lit_char_is_unicode_id_continue */ |
255 | |
256 | /** |
257 | * Checks whether the character is a valid identifier start. |
258 | * |
259 | * @return true if it is. |
260 | */ |
261 | bool |
262 | lit_code_point_is_identifier_start (lit_code_point_t code_point) /**< code point */ |
263 | { |
264 | /* Fast path for ASCII-defined letters. */ |
265 | if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX) |
266 | { |
267 | return ((LEXER_TO_ASCII_LOWERCASE (code_point) >= LIT_CHAR_LOWERCASE_A |
268 | && LEXER_TO_ASCII_LOWERCASE (code_point) <= LIT_CHAR_LOWERCASE_Z) |
269 | || code_point == LIT_CHAR_DOLLAR_SIGN |
270 | || code_point == LIT_CHAR_UNDERSCORE); |
271 | } |
272 | |
273 | return lit_char_is_unicode_id_start (code_point); |
274 | } /* lit_code_point_is_identifier_start */ |
275 | |
276 | /** |
277 | * Checks whether the character is a valid identifier part. |
278 | * |
279 | * @return true if it is. |
280 | */ |
281 | bool |
282 | lit_code_point_is_identifier_part (lit_code_point_t code_point) /**< code point */ |
283 | { |
284 | /* Fast path for ASCII-defined letters. */ |
285 | if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX) |
286 | { |
287 | return ((LEXER_TO_ASCII_LOWERCASE (code_point) >= LIT_CHAR_LOWERCASE_A |
288 | && LEXER_TO_ASCII_LOWERCASE (code_point) <= LIT_CHAR_LOWERCASE_Z) |
289 | || (code_point >= LIT_CHAR_0 && code_point <= LIT_CHAR_9) |
290 | || code_point == LIT_CHAR_DOLLAR_SIGN |
291 | || code_point == LIT_CHAR_UNDERSCORE); |
292 | } |
293 | |
294 | return lit_char_is_unicode_id_continue (code_point); |
295 | } /* lit_code_point_is_identifier_part */ |
296 | |
297 | /** |
298 | * Check if specified character is one of OctalDigit characters (ECMA-262 v5, B.1.2) |
299 | * |
300 | * @return true / false |
301 | */ |
302 | bool |
303 | lit_char_is_octal_digit (ecma_char_t c) /**< code unit */ |
304 | { |
305 | return (c >= LIT_CHAR_ASCII_OCTAL_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_OCTAL_DIGITS_END); |
306 | } /* lit_char_is_octal_digit */ |
307 | |
308 | /** |
309 | * Check if specified character is one of DecimalDigit characters (ECMA-262 v5, 7.8.3) |
310 | * |
311 | * @return true / false |
312 | */ |
313 | bool |
314 | lit_char_is_decimal_digit (ecma_char_t c) /**< code unit */ |
315 | { |
316 | return (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END); |
317 | } /* lit_char_is_decimal_digit */ |
318 | |
319 | /** |
320 | * Check if specified character is one of HexDigit characters (ECMA-262 v5, 7.8.3) |
321 | * |
322 | * @return true / false |
323 | */ |
324 | bool |
325 | lit_char_is_hex_digit (ecma_char_t c) /**< code unit */ |
326 | { |
327 | return ((c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END) |
328 | || (LEXER_TO_ASCII_LOWERCASE (c) >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN |
329 | && LEXER_TO_ASCII_LOWERCASE (c) <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_END)); |
330 | } /* lit_char_is_hex_digit */ |
331 | |
332 | #if ENABLED (JERRY_ESNEXT) |
333 | /** |
334 | * Check if specified character is one of BinaryDigits characters (ECMA-262 v6, 11.8.3) |
335 | * |
336 | * @return true / false |
337 | */ |
338 | bool |
339 | lit_char_is_binary_digit (ecma_char_t c) /** code unit */ |
340 | { |
341 | return (c == LIT_CHAR_0 || c == LIT_CHAR_1); |
342 | } /* lit_char_is_binary_digit */ |
343 | #endif /* ENABLED (JERRY_ESNEXT) */ |
344 | |
345 | /** |
346 | * UnicodeEscape abstract method |
347 | * |
348 | * See also: ECMA-262 v10, 24.5.2.3 |
349 | */ |
350 | void |
351 | lit_char_unicode_escape (ecma_stringbuilder_t *builder_p, /**< stringbuilder to append */ |
352 | ecma_char_t c) /**< code unit to convert */ |
353 | { |
354 | ecma_stringbuilder_append_raw (builder_p, (lit_utf8_byte_t *) "\\u" , 2); |
355 | |
356 | for (int8_t i = 3; i >= 0; i--) |
357 | { |
358 | int32_t result_char = (c >> (i * 4)) & 0xF; |
359 | ecma_stringbuilder_append_byte (builder_p, (lit_utf8_byte_t) (result_char + (result_char <= 9 |
360 | ? LIT_CHAR_0 |
361 | : (LIT_CHAR_LOWERCASE_A - 10)))); |
362 | } |
363 | } /* lit_char_unicode_escape */ |
364 | |
365 | /** |
366 | * Convert a HexDigit character to its numeric value, as defined in ECMA-262 v5, 7.8.3 |
367 | * |
368 | * @return digit value, corresponding to the hex char |
369 | */ |
370 | uint32_t |
371 | lit_char_hex_to_int (ecma_char_t c) /**< code unit, corresponding to |
372 | * one of HexDigit characters */ |
373 | { |
374 | JERRY_ASSERT (lit_char_is_hex_digit (c)); |
375 | |
376 | if (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END) |
377 | { |
378 | return (uint32_t) (c - LIT_CHAR_ASCII_DIGITS_BEGIN); |
379 | } |
380 | else if (c >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN && c <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_END) |
381 | { |
382 | return (uint32_t) (c - LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN + 10); |
383 | } |
384 | else |
385 | { |
386 | return (uint32_t) (c - LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN + 10); |
387 | } |
388 | } /* lit_char_hex_to_int */ |
389 | |
390 | /** |
391 | * Converts a character to UTF8 bytes. |
392 | * |
393 | * @return length of the UTF8 representation. |
394 | */ |
395 | size_t |
396 | lit_code_point_to_cesu8_bytes (uint8_t *dst_p, /**< destination buffer */ |
397 | lit_code_point_t code_point) /**< code point */ |
398 | { |
399 | if (code_point < LIT_UTF8_2_BYTE_CODE_POINT_MIN) |
400 | { |
401 | /* 00000000 0xxxxxxx -> 0xxxxxxx */ |
402 | dst_p[0] = (uint8_t) code_point; |
403 | return 1; |
404 | } |
405 | |
406 | if (code_point < LIT_UTF8_3_BYTE_CODE_POINT_MIN) |
407 | { |
408 | /* 00000yyy yyxxxxxx -> 110yyyyy 10xxxxxx */ |
409 | dst_p[0] = (uint8_t) (LIT_UTF8_2_BYTE_MARKER | ((code_point >> 6) & LIT_UTF8_LAST_5_BITS_MASK)); |
410 | dst_p[1] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (code_point & LIT_UTF8_LAST_6_BITS_MASK)); |
411 | return 2; |
412 | } |
413 | |
414 | if (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN) |
415 | { |
416 | /* zzzzyyyy yyxxxxxx -> 1110zzzz 10yyyyyy 10xxxxxx */ |
417 | dst_p[0] = (uint8_t) (LIT_UTF8_3_BYTE_MARKER | ((code_point >> 12) & LIT_UTF8_LAST_4_BITS_MASK)); |
418 | dst_p[1] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | ((code_point >> 6) & LIT_UTF8_LAST_6_BITS_MASK)); |
419 | dst_p[2] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (code_point & LIT_UTF8_LAST_6_BITS_MASK)); |
420 | return 3; |
421 | } |
422 | |
423 | JERRY_ASSERT (code_point <= LIT_UNICODE_CODE_POINT_MAX); |
424 | |
425 | code_point -= LIT_UTF8_4_BYTE_CODE_POINT_MIN; |
426 | |
427 | dst_p[0] = (uint8_t) (LIT_UTF8_3_BYTE_MARKER | 0xd); |
428 | dst_p[1] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | 0x20 | ((code_point >> 16) & LIT_UTF8_LAST_4_BITS_MASK)); |
429 | dst_p[2] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | ((code_point >> 10) & LIT_UTF8_LAST_6_BITS_MASK)); |
430 | |
431 | dst_p[3] = (uint8_t) (LIT_UTF8_3_BYTE_MARKER | 0xd); |
432 | dst_p[4] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | 0x30 | ((code_point >> 6) & LIT_UTF8_LAST_4_BITS_MASK)); |
433 | dst_p[5] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (code_point & LIT_UTF8_LAST_6_BITS_MASK)); |
434 | |
435 | return 3 * 2; |
436 | } /* lit_code_point_to_cesu8_bytes */ |
437 | |
438 | /** |
439 | * Returns the length of the UTF8 representation of a character. |
440 | * |
441 | * @return length of the UTF8 representation. |
442 | */ |
443 | size_t |
444 | lit_code_point_get_cesu8_length (lit_code_point_t code_point) /**< code point */ |
445 | { |
446 | if (code_point < LIT_UTF8_2_BYTE_CODE_POINT_MIN) |
447 | { |
448 | /* 00000000 0xxxxxxx */ |
449 | return 1; |
450 | } |
451 | |
452 | if (code_point < LIT_UTF8_3_BYTE_CODE_POINT_MIN) |
453 | { |
454 | /* 00000yyy yyxxxxxx */ |
455 | return 2; |
456 | } |
457 | |
458 | if (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN) |
459 | { |
460 | /* zzzzyyyy yyxxxxxx */ |
461 | return 3; |
462 | } |
463 | |
464 | /* high + low surrogate */ |
465 | return 2 * 3; |
466 | } /* lit_code_point_get_cesu8_length */ |
467 | |
468 | /** |
469 | * Convert a four byte long utf8 character to two three byte long cesu8 characters |
470 | */ |
471 | void |
472 | lit_four_byte_utf8_char_to_cesu8 (uint8_t *dst_p, /**< destination buffer */ |
473 | const uint8_t *source_p) /**< source buffer */ |
474 | { |
475 | lit_code_point_t code_point = ((((uint32_t) source_p[0]) & LIT_UTF8_LAST_3_BITS_MASK) << 18); |
476 | code_point |= ((((uint32_t) source_p[1]) & LIT_UTF8_LAST_6_BITS_MASK) << 12); |
477 | code_point |= ((((uint32_t) source_p[2]) & LIT_UTF8_LAST_6_BITS_MASK) << 6); |
478 | code_point |= (((uint32_t) source_p[3]) & LIT_UTF8_LAST_6_BITS_MASK); |
479 | |
480 | lit_code_point_to_cesu8_bytes (dst_p, code_point); |
481 | } /* lit_four_byte_utf8_char_to_cesu8 */ |
482 | |
483 | /** |
484 | * Lookup hex digits in a buffer |
485 | * |
486 | * @return UINT32_MAX - if next 'lookup' number of characters do not form a valid hex number |
487 | * value of hex number, otherwise |
488 | */ |
489 | uint32_t |
490 | lit_char_hex_lookup (const lit_utf8_byte_t *buf_p, /**< buffer */ |
491 | const lit_utf8_byte_t *const buf_end_p, /**< buffer end */ |
492 | uint32_t lookup) /**< size of lookup */ |
493 | { |
494 | JERRY_ASSERT (lookup <= 4); |
495 | |
496 | if (JERRY_UNLIKELY (buf_p + lookup > buf_end_p)) |
497 | { |
498 | return UINT32_MAX; |
499 | } |
500 | |
501 | uint32_t value = 0; |
502 | |
503 | while (lookup--) |
504 | { |
505 | lit_utf8_byte_t ch = *buf_p++; |
506 | if (!lit_char_is_hex_digit (ch)) |
507 | { |
508 | return UINT32_MAX; |
509 | } |
510 | |
511 | value <<= 4; |
512 | value += lit_char_hex_to_int (ch); |
513 | } |
514 | |
515 | JERRY_ASSERT (value <= LIT_UTF16_CODE_UNIT_MAX); |
516 | return value; |
517 | } /* lit_char_hex_lookup */ |
518 | |
519 | /** |
520 | * Parse a decimal number with the value clamped to UINT32_MAX. |
521 | * |
522 | * @returns uint32_t number |
523 | */ |
524 | uint32_t |
525 | lit_parse_decimal (const lit_utf8_byte_t **buffer_p, /**< [in/out] character buffer */ |
526 | const lit_utf8_byte_t *buffer_end_p) /**< buffer end */ |
527 | { |
528 | const lit_utf8_byte_t *current_p = *buffer_p; |
529 | JERRY_ASSERT (lit_char_is_decimal_digit (*current_p)); |
530 | |
531 | uint32_t value = (uint32_t) (*current_p++ - LIT_CHAR_0); |
532 | |
533 | while (current_p < buffer_end_p && lit_char_is_decimal_digit (*current_p)) |
534 | { |
535 | const uint32_t digit = (uint32_t) (*current_p++ - LIT_CHAR_0); |
536 | uint32_t new_value = value * 10 + digit; |
537 | |
538 | if (JERRY_UNLIKELY (value > UINT32_MAX / 10) || JERRY_UNLIKELY (new_value < value)) |
539 | { |
540 | value = UINT32_MAX; |
541 | continue; |
542 | } |
543 | |
544 | value = new_value; |
545 | } |
546 | |
547 | *buffer_p = current_p; |
548 | return value; |
549 | } /* lit_parse_decimal */ |
550 | |
551 | /** |
552 | * Check if specified character is a word character (part of IsWordChar abstract operation) |
553 | * |
554 | * See also: ECMA-262 v5, 15.10.2.6 (IsWordChar) |
555 | * |
556 | * @return true - if the character is a word character |
557 | * false - otherwise |
558 | */ |
559 | bool |
560 | lit_char_is_word_char (lit_code_point_t c) /**< code point */ |
561 | { |
562 | return ((c >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && c <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END) |
563 | || (c >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && c <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END) |
564 | || (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END) |
565 | || c == LIT_CHAR_UNDERSCORE); |
566 | } /* lit_char_is_word_char */ |
567 | |
568 | #if ENABLED (JERRY_UNICODE_CASE_CONVERSION) |
569 | |
/**
 * Check if the specified character is in one of those tables which contain bidirectional conversions.
 *
 * Three tables are consulted in this order: the case ranges (with a separate "_sup" variant
 * for code points above LIT_UTF16_CODE_UNIT_MAX when JERRY_ESNEXT is enabled),
 * the character pair ranges and the individual character pairs.
 *
 * @return codepoint of the converted character if it is found in the tables
 *         (the input itself is returned when the matching entry needs no change for the
 *         requested direction, and for supplementary code points outside the case ranges)
 *         LIT_INVALID_CP - otherwise.
 */
static lit_code_point_t
lit_search_in_bidirectional_conversion_tables (lit_code_point_t cp, /**< code point */
                                               bool is_lowercase) /**< is lowercase conversion */
{
  /* 1, Check if the specified character is part of the lit_unicode_character_case_ranges_{sup} table. */
  int number_of_case_ranges;
#if ENABLED (JERRY_ESNEXT)
  bool is_supplementary = cp > LIT_UTF16_CODE_UNIT_MAX;
  if (is_supplementary)
  {
    number_of_case_ranges = NUM_OF_ELEMENTS (lit_unicode_character_case_ranges_sup);
  }
  else
#endif /* ENABLED (JERRY_ESNEXT) */
  {
    number_of_case_ranges = NUM_OF_ELEMENTS (lit_unicode_character_case_ranges);
  }

  /* Index into the range length table: two consecutive range starts share one length (i / 2). */
  int conv_counter = 0;

  for (int i = 0; i < number_of_case_ranges; i++)
  {
    if (i % 2 == 0 && i > 0)
    {
      conv_counter++;
    }

    size_t range_length;
    lit_code_point_t start_point;
#if ENABLED (JERRY_ESNEXT)
    if (is_supplementary)
    {
      range_length = lit_unicode_character_case_range_lengths_sup[conv_counter];
      start_point = lit_unicode_character_case_ranges_sup[i];
    }
    else
#endif /* ENABLED (JERRY_ESNEXT) */
    {
      range_length = lit_unicode_character_case_range_lengths[conv_counter];
      start_point = lit_unicode_character_case_ranges[i];
    }

    /* The range is half-open here: [start_point, start_point + range_length). */
    if (start_point > cp || cp >= start_point + range_length)
    {
      continue;
    }

    /* Distance from the start of the matching range, it is kept in the counterpart range. */
    uint32_t char_dist = (uint32_t) cp - start_point;
    int offset;
    if (i % 2 == 0)
    {
      /* Even entry: only the lowercase conversion changes the character (maps to the next entry). */
      if (!is_lowercase)
      {
        return cp;
      }

      offset = i + 1;
    }
    else
    {
      /* Odd entry: only the uppercase conversion changes the character (maps to the previous entry). */
      if (is_lowercase)
      {
        return cp;
      }

      offset = i - 1;
    }

#if ENABLED (JERRY_ESNEXT)
    if (is_supplementary)
    {
      start_point = lit_unicode_character_case_ranges_sup[offset];
    }
    else
#endif /* ENABLED (JERRY_ESNEXT) */
    {
      start_point = lit_unicode_character_case_ranges[offset];
    }

    return (lit_code_point_t) (start_point + char_dist);
  }

  /* Note: After this point based on the latest unicode standard(13.0.0.6) no conversion characters are
     defined for supplementary planes, so such code points are returned unchanged. */
#if ENABLED (JERRY_ESNEXT)
  if (is_supplementary)
  {
    return cp;
  }
#endif /* ENABLED (JERRY_ESNEXT) */

  /* 2, Check if the specified character is part of the character_pair_ranges table
        (binary search by the starting point of the ranges). */
  int bottom = 0;
  int top = NUM_OF_ELEMENTS (lit_unicode_character_pair_ranges) - 1;

  while (bottom <= top)
  {
    int middle = (bottom + top) / 2;
    lit_code_point_t current_sp = lit_unicode_character_pair_ranges[middle];

    if (current_sp <= cp && cp < current_sp + lit_unicode_character_pair_range_lengths[middle])
    {
      uint32_t char_dist = (uint32_t) (cp - current_sp);

      /* Even distance from the range start: the lowercase conversion yields the next code point. */
      if ((cp - current_sp) % 2 == 0)
      {
        return is_lowercase ? (lit_code_point_t) (current_sp + char_dist + 1) : cp;
      }

      /* Odd distance from the range start: the uppercase conversion yields the previous code point. */
      return is_lowercase ? cp : (lit_code_point_t) (current_sp + char_dist - 1);
    }

    if (cp > current_sp)
    {
      bottom = middle + 1;
    }
    else
    {
      top = middle - 1;
    }
  }

  /* 3, Check if the specified character is part of the character_pairs table
        (linear search, the two members of a pair are stored next to each other). */
  int number_of_character_pairs = NUM_OF_ELEMENTS (lit_unicode_character_pairs);

  for (int i = 0; i < number_of_character_pairs; i++)
  {
    if (cp != lit_unicode_character_pairs[i])
    {
      continue;
    }

    /* Even index: the lowercase conversion yields the following element. */
    if (i % 2 == 0)
    {
      return is_lowercase ? lit_unicode_character_pairs[i + 1] : cp;
    }

    /* Odd index: the uppercase conversion yields the preceding element. */
    return is_lowercase ? cp : lit_unicode_character_pairs[i - 1];
  }

  return LIT_INVALID_CP;
} /* lit_search_in_bidirectional_conversion_tables */
718 | |
/**
 * Check if the specified character is in the given conversion table.
 *
 * The table is processed as three consecutive sections. In section i (0, 1, 2) each record
 * consists of a source code unit followed by i + 1 converted code units, and counters[i]
 * holds the number of records of that section. Each section is binary searched
 * by the source code unit of the records.
 *
 * When a string builder is given, the converted code units are appended to it
 * (or the original code unit, if no record matches).
 *
 * @return LIT_MULTIPLE_CU if the converted character consist more than a single code unit
 *         converted code point - otherwise (the original character, if it is not in the table)
 */
static lit_code_point_t
lit_search_in_conversion_table (ecma_char_t character, /**< code unit */
                                ecma_stringbuilder_t *builder_p, /**< string builder */
                                const ecma_char_t *array, /**< array */
                                const uint8_t *counters) /**< case_values counter */
{
  /* Index of the first element after the current section. */
  int end_point = 0;

  for (int i = 0; i < 3; i++)
  {
    int start_point = end_point;
    int size_of_case_value = i + 1;
    /* A record takes one element for the source code unit plus the converted code units. */
    end_point += counters[i] * (size_of_case_value + 1);

    int bottom = start_point;
    int top = end_point - size_of_case_value;

    while (bottom <= top)
    {
      int middle = (bottom + top) / 2;

      /* Move back to the first element of the record which contains the middle index. */
      middle -= ((middle - bottom) % (size_of_case_value + 1));

      ecma_char_t current = array[middle];

      if (current == character)
      {
        if (builder_p != NULL)
        {
          ecma_stringbuilder_append_char (builder_p, array[middle + 1]);

          if (size_of_case_value > 1)
          {
            ecma_stringbuilder_append_char (builder_p, array[middle + 2]);
          }
          if (size_of_case_value > 2)
          {
            ecma_stringbuilder_append_char (builder_p, array[middle + 3]);
          }
        }

        return size_of_case_value == 1 ? array[middle + 1]: LIT_MULTIPLE_CU;
      }

      /* Continue with the records before or after the inspected one. */
      if (character < current)
      {
        top = middle - (size_of_case_value + 1);
      }
      else
      {
        bottom = middle + (size_of_case_value + 1);
      }
    }
  }

  /* No record matches: the character is kept as is. */
  if (builder_p != NULL)
  {
    ecma_stringbuilder_append_char (builder_p, character);
  }

  return (lit_code_point_t) character;
} /* lit_search_in_conversion_table */
787 | #endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */ |
788 | |
789 | /** |
790 | * Append the converted lowercase codeunit sequence of an a given codepoint into the stringbuilder if it is present. |
791 | * |
792 | * @return LIT_MULTIPLE_CU if the converted codepoint consist more than a single code unit |
793 | * converted code point - otherwise |
794 | */ |
795 | lit_code_point_t |
796 | lit_char_to_lower_case (lit_code_point_t cp, /**< code point */ |
797 | ecma_stringbuilder_t *builder_p) /**< string builder */ |
798 | { |
799 | if (cp <= LIT_UTF8_1_BYTE_CODE_POINT_MAX) |
800 | { |
801 | if (cp >= LIT_CHAR_UPPERCASE_A && cp <= LIT_CHAR_UPPERCASE_Z) |
802 | { |
803 | cp = (lit_utf8_byte_t) (cp + (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A)); |
804 | } |
805 | |
806 | if (builder_p != NULL) |
807 | { |
808 | ecma_stringbuilder_append_byte (builder_p, (lit_utf8_byte_t) cp); |
809 | } |
810 | |
811 | return cp; |
812 | } |
813 | |
814 | #if ENABLED (JERRY_UNICODE_CASE_CONVERSION) |
815 | lit_code_point_t lowercase_cp = lit_search_in_bidirectional_conversion_tables (cp, true); |
816 | |
817 | if (lowercase_cp != LIT_INVALID_CP) |
818 | { |
819 | if (builder_p != NULL) |
820 | { |
821 | ecma_stringbuilder_append_codepoint (builder_p, lowercase_cp); |
822 | } |
823 | |
824 | return lowercase_cp; |
825 | } |
826 | |
827 | JERRY_ASSERT (cp < LIT_UTF8_4_BYTE_CODE_POINT_MIN); |
828 | |
829 | int num_of_lowercase_ranges = NUM_OF_ELEMENTS (lit_unicode_lower_case_ranges); |
830 | |
831 | for (int i = 0, j = 0; i < num_of_lowercase_ranges; i += 2, j++) |
832 | { |
833 | JERRY_ASSERT (lit_unicode_lower_case_range_lengths[j] > 0); |
834 | uint32_t range_length = (uint32_t) (lit_unicode_lower_case_range_lengths[j] - 1); |
835 | lit_code_point_t start_point = lit_unicode_lower_case_ranges[i]; |
836 | |
837 | if (start_point <= cp && cp <= start_point + range_length) |
838 | { |
839 | lowercase_cp = lit_unicode_lower_case_ranges[i + 1] + (cp - start_point); |
840 | if (builder_p != NULL) |
841 | { |
842 | ecma_stringbuilder_append_codepoint (builder_p, lowercase_cp); |
843 | } |
844 | |
845 | return lowercase_cp; |
846 | } |
847 | } |
848 | |
849 | return lit_search_in_conversion_table ((ecma_char_t) cp, |
850 | builder_p, |
851 | lit_unicode_lower_case_conversions, |
852 | lit_unicode_lower_case_conversion_counters); |
853 | #else /* !ENABLED (JERRY_UNICODE_CASE_CONVERSION) */ |
854 | if (builder_p != NULL) |
855 | { |
856 | ecma_stringbuilder_append_codepoint (builder_p, cp); |
857 | } |
858 | |
859 | return cp; |
860 | #endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */ |
861 | } /* lit_char_to_lower_case */ |
862 | |
863 | /** |
864 | * Append the converted uppercase codeunit sequence of an a given codepoint into the stringbuilder if it is present. |
865 | * |
866 | * @return LIT_MULTIPLE_CU if the converted codepoint consist more than a single code unit |
867 | * converted code point - otherwise |
868 | */ |
869 | lit_code_point_t |
870 | lit_char_to_upper_case (lit_code_point_t cp, /**< code point */ |
871 | ecma_stringbuilder_t *builder_p) /**< string builder */ |
872 | { |
873 | if (cp <= LIT_UTF8_1_BYTE_CODE_POINT_MAX) |
874 | { |
875 | if (cp >= LIT_CHAR_LOWERCASE_A && cp <= LIT_CHAR_LOWERCASE_Z) |
876 | { |
877 | cp = (lit_utf8_byte_t) (cp - (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A)); |
878 | } |
879 | |
880 | if (builder_p != NULL) |
881 | { |
882 | ecma_stringbuilder_append_byte (builder_p, (lit_utf8_byte_t) cp); |
883 | } |
884 | |
885 | return cp; |
886 | } |
887 | |
888 | #if ENABLED (JERRY_UNICODE_CASE_CONVERSION) |
889 | lit_code_point_t uppercase_cp = lit_search_in_bidirectional_conversion_tables (cp, false); |
890 | |
891 | if (uppercase_cp != LIT_INVALID_CP) |
892 | { |
893 | if (builder_p != NULL) |
894 | { |
895 | ecma_stringbuilder_append_codepoint (builder_p, uppercase_cp); |
896 | } |
897 | |
898 | return uppercase_cp; |
899 | } |
900 | |
901 | int num_of_upper_case_special_ranges = NUM_OF_ELEMENTS (lit_unicode_upper_case_special_ranges); |
902 | |
903 | for (int i = 0, j = 0; i < num_of_upper_case_special_ranges; i += 3, j++) |
904 | { |
905 | uint32_t range_length = lit_unicode_upper_case_special_range_lengths[j]; |
906 | ecma_char_t start_point = lit_unicode_upper_case_special_ranges[i]; |
907 | |
908 | if (start_point <= cp && cp <= start_point + range_length) |
909 | { |
910 | if (builder_p != NULL) |
911 | { |
912 | uppercase_cp = lit_unicode_upper_case_special_ranges[i + 1] + (cp - start_point); |
913 | ecma_stringbuilder_append_codepoint (builder_p, uppercase_cp); |
914 | ecma_stringbuilder_append_codepoint (builder_p, lit_unicode_upper_case_special_ranges[i + 2]); |
915 | } |
916 | |
917 | return LIT_MULTIPLE_CU; |
918 | } |
919 | } |
920 | |
921 | return lit_search_in_conversion_table ((ecma_char_t) cp, |
922 | builder_p, |
923 | lit_unicode_upper_case_conversions, |
924 | lit_unicode_upper_case_conversion_counters); |
925 | #else /* !ENABLED (JERRY_UNICODE_CASE_CONVERSION) */ |
926 | if (builder_p != NULL) |
927 | { |
928 | ecma_stringbuilder_append_codepoint (builder_p, cp); |
929 | } |
930 | |
931 | return cp; |
932 | #endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */ |
933 | } /* lit_char_to_upper_case */ |
934 | |
935 | #if ENABLED (JERRY_ESNEXT) |
936 | /* |
937 | * Look up whether the character should be folded to the lowercase variant. |
938 | * |
939 | * @return true, if character should be lowercased |
940 | * false, otherwise |
941 | */ |
942 | bool |
943 | lit_char_fold_to_lower (lit_code_point_t cp) /**< code point */ |
944 | { |
945 | #if ENABLED (JERRY_UNICODE_CASE_CONVERSION) |
946 | return (cp <= LIT_UTF8_1_BYTE_CODE_POINT_MAX |
947 | || cp > LIT_UTF16_CODE_UNIT_MAX |
948 | || (!lit_search_char_in_interval_array ((ecma_char_t) cp, |
949 | lit_unicode_folding_skip_to_lower_interval_starts, |
950 | lit_unicode_folding_skip_to_lower_interval_lengths, |
951 | NUM_OF_ELEMENTS (lit_unicode_folding_skip_to_lower_interval_starts)) |
952 | && !lit_search_char_in_array ((ecma_char_t) cp, |
953 | lit_unicode_folding_skip_to_lower_chars, |
954 | NUM_OF_ELEMENTS (lit_unicode_folding_skip_to_lower_chars)))); |
955 | #else /* !ENABLED (JERRY_UNICODE_CASE_CONVERSION) */ |
956 | return true; |
957 | #endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */ |
958 | } /* lit_char_fold_to_lower */ |
959 | |
960 | /* |
961 | * Look up whether the character should be folded to the uppercase variant. |
962 | * |
963 | * @return true, if character should be uppercased |
964 | * false, otherwise |
965 | */ |
966 | bool |
967 | lit_char_fold_to_upper (lit_code_point_t cp) /**< code point */ |
968 | { |
969 | #if ENABLED (JERRY_UNICODE_CASE_CONVERSION) |
970 | return (cp > LIT_UTF8_1_BYTE_CODE_POINT_MAX |
971 | && cp <= LIT_UTF16_CODE_UNIT_MAX |
972 | && (lit_search_char_in_interval_array ((ecma_char_t) cp, |
973 | lit_unicode_folding_to_upper_interval_starts, |
974 | lit_unicode_folding_to_upper_interval_lengths, |
975 | NUM_OF_ELEMENTS (lit_unicode_folding_to_upper_interval_starts)) |
976 | || lit_search_char_in_array ((ecma_char_t) cp, |
977 | lit_unicode_folding_to_upper_chars, |
978 | NUM_OF_ELEMENTS (lit_unicode_folding_to_upper_chars)))); |
979 | #else /* !ENABLED (JERRY_UNICODE_CASE_CONVERSION) */ |
980 | return false; |
981 | #endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */ |
982 | } /* lit_char_fold_to_upper */ |
983 | #endif /* ENABLED (JERRY_ESNEXT) */ |
984 | |
985 | /** |
986 | * Helper method to find a specific character in a string |
987 | * |
988 | * Used by: |
989 | * ecma_builtin_string_prototype_object_replace_helper |
990 | * |
991 | * @return true - if the given character is in the string |
992 | * false - otherwise |
993 | */ |
994 | bool |
995 | lit_find_char_in_string (ecma_string_t *str_p, /**< source string */ |
996 | lit_utf8_byte_t c) /**< character to find*/ |
997 | { |
998 | ECMA_STRING_TO_UTF8_STRING (str_p, start_p, start_size); |
999 | |
1000 | const lit_utf8_byte_t *str_curr_p = start_p; |
1001 | const lit_utf8_byte_t *str_end_p = start_p + start_size; |
1002 | bool have_char = false; |
1003 | |
1004 | while (str_curr_p < str_end_p) |
1005 | { |
1006 | if (*str_curr_p++ == c) |
1007 | { |
1008 | have_char = true; |
1009 | break; |
1010 | } |
1011 | } |
1012 | |
1013 | ECMA_FINALIZE_UTF8_STRING (start_p, start_size); |
1014 | |
1015 | return have_char; |
1016 | } /* lit_find_char_in_string */ |
1017 | |