1/* Copyright JS Foundation and other contributors, http://js.foundation
2 *
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16#include "config.h"
17#include "ecma-helpers.h"
18#include "lit-char-helpers.h"
19#include "lit-strings.h"
20#include "lit-unicode-ranges.inc.h"
21#if ENABLED (JERRY_ESNEXT)
22#include "lit-unicode-ranges-sup.inc.h"
23#endif /* ENABLED (JERRY_ESNEXT) */
24
25#if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
26#include "lit-unicode-conversions.inc.h"
27#if ENABLED (JERRY_ESNEXT)
28#include "lit-unicode-conversions-sup.inc.h"
29#include "lit-unicode-folding.inc.h"
30#endif /* ENABLED (JERRY_ESNEXT) */
31#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
32
/**
 * Number of elements of a statically sized array.
 *
 * Note: the argument must be a real array, the result is meaningless for pointers.
 */
#define NUM_OF_ELEMENTS(array) (sizeof (array) / sizeof (*(array)))
34
35/**
36 * Binary search algorithm that searches the a
37 * character in the given char array.
38 *
39 * @return true - if the character is in the given array
40 * false - otherwise
41 */
42#define LIT_SEARCH_CHAR_IN_ARRAY_FN(function_name, char_type, array_type) \
43static bool \
44function_name (char_type c, /**< code unit */ \
45 const array_type *array, /**< array */ \
46 int size_of_array) /**< length of the array */\
47{ \
48 int bottom = 0; \
49 int top = size_of_array - 1; \
50 \
51 while (bottom <= top) \
52 { \
53 int middle = (bottom + top) / 2; \
54 char_type current = array[middle]; \
55 \
56 if (current == c) \
57 { \
58 return true; \
59 } \
60 \
61 if (c < current) \
62 { \
63 top = middle - 1; \
64 } \
65 else \
66 { \
67 bottom = middle + 1; \
68 } \
69 } \
70 \
71 return false; \
72} /* __function_name */
73
74LIT_SEARCH_CHAR_IN_ARRAY_FN (lit_search_char_in_array, ecma_char_t, uint16_t)
75
76#if ENABLED (JERRY_ESNEXT)
77LIT_SEARCH_CHAR_IN_ARRAY_FN (lit_search_codepoint_in_array, lit_code_point_t, uint32_t)
78#endif /* ENABLED (JERRY_ESNEXT) */
79
80/**
81 * Binary search algorithm that searches a character in the given intervals.
82 * Intervals specifed by two arrays. The first one contains the starting points
83 * of the intervals, the second one contains the length of them.
84 *
85 * @return true - if the the character is included (inclusively) in one of the intervals in the given array
86 * false - otherwise
87 */
88#define LIT_SEARCH_CHAR_IN_INTERVAL_ARRAY_FN(function_name, char_type, array_type, interval_type) \
89static bool \
90function_name (char_type c, /**< code unit */ \
91 const array_type *array_sp, /**< array of interval starting points */ \
92 const interval_type *lengths, /**< array of interval lengths */ \
93 int size_of_array) /**< length of the array */ \
94{ \
95 int bottom = 0; \
96 int top = size_of_array - 1; \
97 \
98 while (bottom <= top) \
99 { \
100 int middle = (bottom + top) / 2; \
101 char_type current_sp = array_sp[middle]; \
102 \
103 if (current_sp <= c && c <= current_sp + lengths[middle]) \
104 { \
105 return true; \
106 } \
107 \
108 if (c > current_sp) \
109 { \
110 bottom = middle + 1; \
111 } \
112 else \
113 { \
114 top = middle - 1; \
115 } \
116 } \
117 \
118 return false; \
119} /* function_name */
120
121LIT_SEARCH_CHAR_IN_INTERVAL_ARRAY_FN (lit_search_char_in_interval_array, ecma_char_t, uint16_t, uint8_t)
122
123#if ENABLED (JERRY_ESNEXT)
124LIT_SEARCH_CHAR_IN_INTERVAL_ARRAY_FN (lit_search_codepoint_in_interval_array, lit_code_point_t, uint32_t, uint16_t)
125#endif /* ENABLED (JERRY_ESNEXT) */
126
127/**
128 * Check if specified character is one of the Whitespace characters including those that fall into
129 * "Space, Separator" ("Zs") Unicode character category or one of the Line Terminator characters.
130 *
131 * @return true - if the character is one of characters, listed in ECMA-262 v5, Table 2,
132 * false - otherwise
133 */
134bool
135lit_char_is_white_space (lit_code_point_t c) /**< code point */
136{
137 if (c <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
138 {
139 return (c == LIT_CHAR_SP || (c >= LIT_CHAR_TAB && c <= LIT_CHAR_CR));
140 }
141
142 if (c == LIT_CHAR_BOM
143#if !ENABLED (JERRY_ESNEXT)
144 /* Mongolian Vowel Separator (u180e) used to be a whitespace character. */
145 || c == LIT_CHAR_MVS
146#endif /* !ENABLED (JERRY_ESNEXT) */
147 || c == LIT_CHAR_LS
148 || c == LIT_CHAR_PS)
149 {
150 return true;
151 }
152
153 return (c <= LIT_UTF16_CODE_UNIT_MAX
154 && ((c >= lit_unicode_white_space_interval_starts[0]
155 && c <= lit_unicode_white_space_interval_starts[0] + lit_unicode_white_space_interval_lengths[0])
156 || lit_search_char_in_array ((ecma_char_t) c,
157 lit_unicode_white_space_chars,
158 NUM_OF_ELEMENTS (lit_unicode_white_space_chars))));
159} /* lit_char_is_white_space */
160
161/**
162 * Check if specified character is one of LineTerminator characters
163 *
164 * @return true - if the character is one of characters, listed in ECMA-262 v5, Table 3,
165 * false - otherwise
166 */
167bool
168lit_char_is_line_terminator (ecma_char_t c) /**< code unit */
169{
170 return (c == LIT_CHAR_LF
171 || c == LIT_CHAR_CR
172 || c == LIT_CHAR_LS
173 || c == LIT_CHAR_PS);
174} /* lit_char_is_line_terminator */
175
176/**
177 * Check if specified character is a Unicode ID_Start
178 *
179 * See also:
180 * ECMA-262 v1, 11.6: UnicodeIDStart
181 *
182 * @return true - if the codepoint has Unicode property "ID_Start"
183 * false - otherwise
184 */
185static bool
186lit_char_is_unicode_id_start (lit_code_point_t code_point) /**< code unit */
187{
188#if ENABLED (JERRY_ESNEXT)
189 if (JERRY_UNLIKELY (code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN))
190 {
191 return (lit_search_codepoint_in_interval_array (code_point,
192 lit_unicode_id_start_interval_starts_sup,
193 lit_unicode_id_start_interval_lengths_sup,
194 NUM_OF_ELEMENTS (lit_unicode_id_start_interval_starts_sup))
195 || lit_search_codepoint_in_array (code_point,
196 lit_unicode_id_start_chars_sup,
197 NUM_OF_ELEMENTS (lit_unicode_id_start_chars_sup)));
198 }
199#else /* !ENABLED (JERRY_ESNEXT) */
200 JERRY_ASSERT (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN);
201#endif /* ENABLED (JERRY_ESNEXT) */
202
203 ecma_char_t c = (ecma_char_t) code_point;
204
205 return (lit_search_char_in_interval_array (c,
206 lit_unicode_id_start_interval_starts,
207 lit_unicode_id_start_interval_lengths,
208 NUM_OF_ELEMENTS (lit_unicode_id_start_interval_starts))
209 || lit_search_char_in_array (c, lit_unicode_id_start_chars, NUM_OF_ELEMENTS (lit_unicode_id_start_chars)));
210} /* lit_char_is_unicode_id_start */
211
212/**
213 * Check if specified character is a Unicode ID_Continue
214 *
215 * See also:
216 * ECMA-262 v1, 11.6: UnicodeIDContinue
217 *
218 * @return true - if the codepoint has Unicode property "ID_Continue"
219 * false - otherwise
220 */
221static bool
222lit_char_is_unicode_id_continue (lit_code_point_t code_point) /**< code unit */
223{
224 /* Each ID_Start codepoint is ID_Continue as well. */
225 if (lit_char_is_unicode_id_start (code_point))
226 {
227 return true;
228 }
229
230#if ENABLED (JERRY_ESNEXT)
231 if (JERRY_UNLIKELY (code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN))
232 {
233 return (lit_search_codepoint_in_interval_array (code_point,
234 lit_unicode_id_continue_interval_starts_sup,
235 lit_unicode_id_continue_interval_lengths_sup,
236 NUM_OF_ELEMENTS (lit_unicode_id_continue_interval_starts_sup))
237 || lit_search_codepoint_in_array (code_point,
238 lit_unicode_id_continue_chars_sup,
239 NUM_OF_ELEMENTS (lit_unicode_id_continue_chars_sup)));
240 }
241#else /* !ENABLED (JERRY_ESNEXT) */
242 JERRY_ASSERT (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN);
243#endif /* ENABLED (JERRY_ESNEXT) */
244
245 ecma_char_t c = (ecma_char_t) code_point;
246
247 return (lit_search_char_in_interval_array (c,
248 lit_unicode_id_continue_interval_starts,
249 lit_unicode_id_continue_interval_lengths,
250 NUM_OF_ELEMENTS (lit_unicode_id_continue_interval_starts))
251 || lit_search_char_in_array (c,
252 lit_unicode_id_continue_chars,
253 NUM_OF_ELEMENTS (lit_unicode_id_continue_chars)));
254} /* lit_char_is_unicode_id_continue */
255
256/**
257 * Checks whether the character is a valid identifier start.
258 *
259 * @return true if it is.
260 */
261bool
262lit_code_point_is_identifier_start (lit_code_point_t code_point) /**< code point */
263{
264 /* Fast path for ASCII-defined letters. */
265 if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
266 {
267 return ((LEXER_TO_ASCII_LOWERCASE (code_point) >= LIT_CHAR_LOWERCASE_A
268 && LEXER_TO_ASCII_LOWERCASE (code_point) <= LIT_CHAR_LOWERCASE_Z)
269 || code_point == LIT_CHAR_DOLLAR_SIGN
270 || code_point == LIT_CHAR_UNDERSCORE);
271 }
272
273 return lit_char_is_unicode_id_start (code_point);
274} /* lit_code_point_is_identifier_start */
275
276/**
277 * Checks whether the character is a valid identifier part.
278 *
279 * @return true if it is.
280 */
281bool
282lit_code_point_is_identifier_part (lit_code_point_t code_point) /**< code point */
283{
284 /* Fast path for ASCII-defined letters. */
285 if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
286 {
287 return ((LEXER_TO_ASCII_LOWERCASE (code_point) >= LIT_CHAR_LOWERCASE_A
288 && LEXER_TO_ASCII_LOWERCASE (code_point) <= LIT_CHAR_LOWERCASE_Z)
289 || (code_point >= LIT_CHAR_0 && code_point <= LIT_CHAR_9)
290 || code_point == LIT_CHAR_DOLLAR_SIGN
291 || code_point == LIT_CHAR_UNDERSCORE);
292 }
293
294 return lit_char_is_unicode_id_continue (code_point);
295} /* lit_code_point_is_identifier_part */
296
297/**
298 * Check if specified character is one of OctalDigit characters (ECMA-262 v5, B.1.2)
299 *
300 * @return true / false
301 */
302bool
303lit_char_is_octal_digit (ecma_char_t c) /**< code unit */
304{
305 return (c >= LIT_CHAR_ASCII_OCTAL_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_OCTAL_DIGITS_END);
306} /* lit_char_is_octal_digit */
307
308/**
309 * Check if specified character is one of DecimalDigit characters (ECMA-262 v5, 7.8.3)
310 *
311 * @return true / false
312 */
313bool
314lit_char_is_decimal_digit (ecma_char_t c) /**< code unit */
315{
316 return (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END);
317} /* lit_char_is_decimal_digit */
318
319/**
320 * Check if specified character is one of HexDigit characters (ECMA-262 v5, 7.8.3)
321 *
322 * @return true / false
323 */
324bool
325lit_char_is_hex_digit (ecma_char_t c) /**< code unit */
326{
327 return ((c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END)
328 || (LEXER_TO_ASCII_LOWERCASE (c) >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN
329 && LEXER_TO_ASCII_LOWERCASE (c) <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_END));
330} /* lit_char_is_hex_digit */
331
332#if ENABLED (JERRY_ESNEXT)
333/**
334 * Check if specified character is one of BinaryDigits characters (ECMA-262 v6, 11.8.3)
335 *
336 * @return true / false
337 */
338bool
339lit_char_is_binary_digit (ecma_char_t c) /** code unit */
340{
341 return (c == LIT_CHAR_0 || c == LIT_CHAR_1);
342} /* lit_char_is_binary_digit */
343#endif /* ENABLED (JERRY_ESNEXT) */
344
345/**
346 * UnicodeEscape abstract method
347 *
348 * See also: ECMA-262 v10, 24.5.2.3
349 */
350void
351lit_char_unicode_escape (ecma_stringbuilder_t *builder_p, /**< stringbuilder to append */
352 ecma_char_t c) /**< code unit to convert */
353{
354 ecma_stringbuilder_append_raw (builder_p, (lit_utf8_byte_t *) "\\u", 2);
355
356 for (int8_t i = 3; i >= 0; i--)
357 {
358 int32_t result_char = (c >> (i * 4)) & 0xF;
359 ecma_stringbuilder_append_byte (builder_p, (lit_utf8_byte_t) (result_char + (result_char <= 9
360 ? LIT_CHAR_0
361 : (LIT_CHAR_LOWERCASE_A - 10))));
362 }
363} /* lit_char_unicode_escape */
364
365/**
366 * Convert a HexDigit character to its numeric value, as defined in ECMA-262 v5, 7.8.3
367 *
368 * @return digit value, corresponding to the hex char
369 */
370uint32_t
371lit_char_hex_to_int (ecma_char_t c) /**< code unit, corresponding to
372 * one of HexDigit characters */
373{
374 JERRY_ASSERT (lit_char_is_hex_digit (c));
375
376 if (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END)
377 {
378 return (uint32_t) (c - LIT_CHAR_ASCII_DIGITS_BEGIN);
379 }
380 else if (c >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN && c <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_END)
381 {
382 return (uint32_t) (c - LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN + 10);
383 }
384 else
385 {
386 return (uint32_t) (c - LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN + 10);
387 }
388} /* lit_char_hex_to_int */
389
390/**
391 * Converts a character to UTF8 bytes.
392 *
393 * @return length of the UTF8 representation.
394 */
395size_t
396lit_code_point_to_cesu8_bytes (uint8_t *dst_p, /**< destination buffer */
397 lit_code_point_t code_point) /**< code point */
398{
399 if (code_point < LIT_UTF8_2_BYTE_CODE_POINT_MIN)
400 {
401 /* 00000000 0xxxxxxx -> 0xxxxxxx */
402 dst_p[0] = (uint8_t) code_point;
403 return 1;
404 }
405
406 if (code_point < LIT_UTF8_3_BYTE_CODE_POINT_MIN)
407 {
408 /* 00000yyy yyxxxxxx -> 110yyyyy 10xxxxxx */
409 dst_p[0] = (uint8_t) (LIT_UTF8_2_BYTE_MARKER | ((code_point >> 6) & LIT_UTF8_LAST_5_BITS_MASK));
410 dst_p[1] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (code_point & LIT_UTF8_LAST_6_BITS_MASK));
411 return 2;
412 }
413
414 if (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN)
415 {
416 /* zzzzyyyy yyxxxxxx -> 1110zzzz 10yyyyyy 10xxxxxx */
417 dst_p[0] = (uint8_t) (LIT_UTF8_3_BYTE_MARKER | ((code_point >> 12) & LIT_UTF8_LAST_4_BITS_MASK));
418 dst_p[1] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | ((code_point >> 6) & LIT_UTF8_LAST_6_BITS_MASK));
419 dst_p[2] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (code_point & LIT_UTF8_LAST_6_BITS_MASK));
420 return 3;
421 }
422
423 JERRY_ASSERT (code_point <= LIT_UNICODE_CODE_POINT_MAX);
424
425 code_point -= LIT_UTF8_4_BYTE_CODE_POINT_MIN;
426
427 dst_p[0] = (uint8_t) (LIT_UTF8_3_BYTE_MARKER | 0xd);
428 dst_p[1] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | 0x20 | ((code_point >> 16) & LIT_UTF8_LAST_4_BITS_MASK));
429 dst_p[2] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | ((code_point >> 10) & LIT_UTF8_LAST_6_BITS_MASK));
430
431 dst_p[3] = (uint8_t) (LIT_UTF8_3_BYTE_MARKER | 0xd);
432 dst_p[4] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | 0x30 | ((code_point >> 6) & LIT_UTF8_LAST_4_BITS_MASK));
433 dst_p[5] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (code_point & LIT_UTF8_LAST_6_BITS_MASK));
434
435 return 3 * 2;
436} /* lit_code_point_to_cesu8_bytes */
437
438/**
439 * Returns the length of the UTF8 representation of a character.
440 *
441 * @return length of the UTF8 representation.
442 */
443size_t
444lit_code_point_get_cesu8_length (lit_code_point_t code_point) /**< code point */
445{
446 if (code_point < LIT_UTF8_2_BYTE_CODE_POINT_MIN)
447 {
448 /* 00000000 0xxxxxxx */
449 return 1;
450 }
451
452 if (code_point < LIT_UTF8_3_BYTE_CODE_POINT_MIN)
453 {
454 /* 00000yyy yyxxxxxx */
455 return 2;
456 }
457
458 if (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN)
459 {
460 /* zzzzyyyy yyxxxxxx */
461 return 3;
462 }
463
464 /* high + low surrogate */
465 return 2 * 3;
466} /* lit_code_point_get_cesu8_length */
467
468/**
469 * Convert a four byte long utf8 character to two three byte long cesu8 characters
470 */
471void
472lit_four_byte_utf8_char_to_cesu8 (uint8_t *dst_p, /**< destination buffer */
473 const uint8_t *source_p) /**< source buffer */
474{
475 lit_code_point_t code_point = ((((uint32_t) source_p[0]) & LIT_UTF8_LAST_3_BITS_MASK) << 18);
476 code_point |= ((((uint32_t) source_p[1]) & LIT_UTF8_LAST_6_BITS_MASK) << 12);
477 code_point |= ((((uint32_t) source_p[2]) & LIT_UTF8_LAST_6_BITS_MASK) << 6);
478 code_point |= (((uint32_t) source_p[3]) & LIT_UTF8_LAST_6_BITS_MASK);
479
480 lit_code_point_to_cesu8_bytes (dst_p, code_point);
481} /* lit_four_byte_utf8_char_to_cesu8 */
482
483/**
484 * Lookup hex digits in a buffer
485 *
486 * @return UINT32_MAX - if next 'lookup' number of characters do not form a valid hex number
487 * value of hex number, otherwise
488 */
489uint32_t
490lit_char_hex_lookup (const lit_utf8_byte_t *buf_p, /**< buffer */
491 const lit_utf8_byte_t *const buf_end_p, /**< buffer end */
492 uint32_t lookup) /**< size of lookup */
493{
494 JERRY_ASSERT (lookup <= 4);
495
496 if (JERRY_UNLIKELY (buf_p + lookup > buf_end_p))
497 {
498 return UINT32_MAX;
499 }
500
501 uint32_t value = 0;
502
503 while (lookup--)
504 {
505 lit_utf8_byte_t ch = *buf_p++;
506 if (!lit_char_is_hex_digit (ch))
507 {
508 return UINT32_MAX;
509 }
510
511 value <<= 4;
512 value += lit_char_hex_to_int (ch);
513 }
514
515 JERRY_ASSERT (value <= LIT_UTF16_CODE_UNIT_MAX);
516 return value;
517} /* lit_char_hex_lookup */
518
519/**
520 * Parse a decimal number with the value clamped to UINT32_MAX.
521 *
522 * @returns uint32_t number
523 */
524uint32_t
525lit_parse_decimal (const lit_utf8_byte_t **buffer_p, /**< [in/out] character buffer */
526 const lit_utf8_byte_t *buffer_end_p) /**< buffer end */
527{
528 const lit_utf8_byte_t *current_p = *buffer_p;
529 JERRY_ASSERT (lit_char_is_decimal_digit (*current_p));
530
531 uint32_t value = (uint32_t) (*current_p++ - LIT_CHAR_0);
532
533 while (current_p < buffer_end_p && lit_char_is_decimal_digit (*current_p))
534 {
535 const uint32_t digit = (uint32_t) (*current_p++ - LIT_CHAR_0);
536 uint32_t new_value = value * 10 + digit;
537
538 if (JERRY_UNLIKELY (value > UINT32_MAX / 10) || JERRY_UNLIKELY (new_value < value))
539 {
540 value = UINT32_MAX;
541 continue;
542 }
543
544 value = new_value;
545 }
546
547 *buffer_p = current_p;
548 return value;
549} /* lit_parse_decimal */
550
551/**
552 * Check if specified character is a word character (part of IsWordChar abstract operation)
553 *
554 * See also: ECMA-262 v5, 15.10.2.6 (IsWordChar)
555 *
556 * @return true - if the character is a word character
557 * false - otherwise
558 */
559bool
560lit_char_is_word_char (lit_code_point_t c) /**< code point */
561{
562 return ((c >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && c <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END)
563 || (c >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && c <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END)
564 || (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END)
565 || c == LIT_CHAR_UNDERSCORE);
566} /* lit_char_is_word_char */
567
568#if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
569
570/**
571 * Check if the specified character is in one of those tables which contain bidirectional conversions.
572 *
573 * @return codepoint of the converted character if it is found the the tables
574 * LIT_INVALID_CP - otherwise.
575 */
576static lit_code_point_t
577lit_search_in_bidirectional_conversion_tables (lit_code_point_t cp, /**< code point */
578 bool is_lowercase) /**< is lowercase conversion */
579{
580 /* 1, Check if the specified character is part of the lit_unicode_character_case_ranges_{sup} table. */
581 int number_of_case_ranges;
582#if ENABLED (JERRY_ESNEXT)
583 bool is_supplementary = cp > LIT_UTF16_CODE_UNIT_MAX;
584 if (is_supplementary)
585 {
586 number_of_case_ranges = NUM_OF_ELEMENTS (lit_unicode_character_case_ranges_sup);
587 }
588 else
589#endif /* ENABLED (JERRY_ESNEXT) */
590 {
591 number_of_case_ranges = NUM_OF_ELEMENTS (lit_unicode_character_case_ranges);
592 }
593
594 int conv_counter = 0;
595
596 for (int i = 0; i < number_of_case_ranges; i++)
597 {
598 if (i % 2 == 0 && i > 0)
599 {
600 conv_counter++;
601 }
602
603 size_t range_length;
604 lit_code_point_t start_point;
605#if ENABLED (JERRY_ESNEXT)
606 if (is_supplementary)
607 {
608 range_length = lit_unicode_character_case_range_lengths_sup[conv_counter];
609 start_point = lit_unicode_character_case_ranges_sup[i];
610 }
611 else
612#endif /* ENABLED (JERRY_ESNEXT) */
613 {
614 range_length = lit_unicode_character_case_range_lengths[conv_counter];
615 start_point = lit_unicode_character_case_ranges[i];
616 }
617
618 if (start_point > cp || cp >= start_point + range_length)
619 {
620 continue;
621 }
622
623 uint32_t char_dist = (uint32_t) cp - start_point;
624 int offset;
625 if (i % 2 == 0)
626 {
627 if (!is_lowercase)
628 {
629 return cp;
630 }
631
632 offset = i + 1;
633 }
634 else
635 {
636 if (is_lowercase)
637 {
638 return cp;
639 }
640
641 offset = i - 1;
642 }
643
644#if ENABLED (JERRY_ESNEXT)
645 if (is_supplementary)
646 {
647 start_point = lit_unicode_character_case_ranges_sup[offset];
648 }
649 else
650#endif /* ENABLED (JERRY_ESNEXT) */
651 {
652 start_point = lit_unicode_character_case_ranges[offset];
653 }
654
655 return (lit_code_point_t) (start_point + char_dist);
656 }
657
658 /* Note: After this point based on the latest unicode standard(13.0.0.6) no conversion characters are
659 defined for supplementary planes */
660#if ENABLED (JERRY_ESNEXT)
661 if (is_supplementary)
662 {
663 return cp;
664 }
665#endif /* ENABLED (JERRY_ESNEXT) */
666
667 /* 2, Check if the specified character is part of the character_pair_ranges table. */
668 int bottom = 0;
669 int top = NUM_OF_ELEMENTS (lit_unicode_character_pair_ranges) - 1;
670
671 while (bottom <= top)
672 {
673 int middle = (bottom + top) / 2;
674 lit_code_point_t current_sp = lit_unicode_character_pair_ranges[middle];
675
676 if (current_sp <= cp && cp < current_sp + lit_unicode_character_pair_range_lengths[middle])
677 {
678 uint32_t char_dist = (uint32_t) (cp - current_sp);
679
680 if ((cp - current_sp) % 2 == 0)
681 {
682 return is_lowercase ? (lit_code_point_t) (current_sp + char_dist + 1) : cp;
683 }
684
685 return is_lowercase ? cp : (lit_code_point_t) (current_sp + char_dist - 1);
686 }
687
688 if (cp > current_sp)
689 {
690 bottom = middle + 1;
691 }
692 else
693 {
694 top = middle - 1;
695 }
696 }
697
698 /* 3, Check if the specified character is part of the character_pairs table. */
699 int number_of_character_pairs = NUM_OF_ELEMENTS (lit_unicode_character_pairs);
700
701 for (int i = 0; i < number_of_character_pairs; i++)
702 {
703 if (cp != lit_unicode_character_pairs[i])
704 {
705 continue;
706 }
707
708 if (i % 2 == 0)
709 {
710 return is_lowercase ? lit_unicode_character_pairs[i + 1] : cp;
711 }
712
713 return is_lowercase ? cp : lit_unicode_character_pairs[i - 1];
714 }
715
716 return LIT_INVALID_CP;
717} /* lit_search_in_bidirectional_conversion_tables */
718
719/**
720 * Check if the specified character is in the given conversion table.
721 *
722 * @return LIT_MULTIPLE_CU if the converted character consist more than a single code unit
723 * converted code point - otherwise
724 */
725static lit_code_point_t
726lit_search_in_conversion_table (ecma_char_t character, /**< code unit */
727 ecma_stringbuilder_t *builder_p, /**< string builder */
728 const ecma_char_t *array, /**< array */
729 const uint8_t *counters) /**< case_values counter */
730{
731 int end_point = 0;
732
733 for (int i = 0; i < 3; i++)
734 {
735 int start_point = end_point;
736 int size_of_case_value = i + 1;
737 end_point += counters[i] * (size_of_case_value + 1);
738
739 int bottom = start_point;
740 int top = end_point - size_of_case_value;
741
742 while (bottom <= top)
743 {
744 int middle = (bottom + top) / 2;
745
746 middle -= ((middle - bottom) % (size_of_case_value + 1));
747
748 ecma_char_t current = array[middle];
749
750 if (current == character)
751 {
752 if (builder_p != NULL)
753 {
754 ecma_stringbuilder_append_char (builder_p, array[middle + 1]);
755
756 if (size_of_case_value > 1)
757 {
758 ecma_stringbuilder_append_char (builder_p, array[middle + 2]);
759 }
760 if (size_of_case_value > 2)
761 {
762 ecma_stringbuilder_append_char (builder_p, array[middle + 3]);
763 }
764 }
765
766 return size_of_case_value == 1 ? array[middle + 1]: LIT_MULTIPLE_CU;
767 }
768
769 if (character < current)
770 {
771 top = middle - (size_of_case_value + 1);
772 }
773 else
774 {
775 bottom = middle + (size_of_case_value + 1);
776 }
777 }
778 }
779
780 if (builder_p != NULL)
781 {
782 ecma_stringbuilder_append_char (builder_p, character);
783 }
784
785 return (lit_code_point_t) character;
786} /* lit_search_in_conversion_table */
787#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
788
789/**
790 * Append the converted lowercase codeunit sequence of an a given codepoint into the stringbuilder if it is present.
791 *
792 * @return LIT_MULTIPLE_CU if the converted codepoint consist more than a single code unit
793 * converted code point - otherwise
794 */
795lit_code_point_t
796lit_char_to_lower_case (lit_code_point_t cp, /**< code point */
797 ecma_stringbuilder_t *builder_p) /**< string builder */
798{
799 if (cp <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
800 {
801 if (cp >= LIT_CHAR_UPPERCASE_A && cp <= LIT_CHAR_UPPERCASE_Z)
802 {
803 cp = (lit_utf8_byte_t) (cp + (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A));
804 }
805
806 if (builder_p != NULL)
807 {
808 ecma_stringbuilder_append_byte (builder_p, (lit_utf8_byte_t) cp);
809 }
810
811 return cp;
812 }
813
814#if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
815 lit_code_point_t lowercase_cp = lit_search_in_bidirectional_conversion_tables (cp, true);
816
817 if (lowercase_cp != LIT_INVALID_CP)
818 {
819 if (builder_p != NULL)
820 {
821 ecma_stringbuilder_append_codepoint (builder_p, lowercase_cp);
822 }
823
824 return lowercase_cp;
825 }
826
827 JERRY_ASSERT (cp < LIT_UTF8_4_BYTE_CODE_POINT_MIN);
828
829 int num_of_lowercase_ranges = NUM_OF_ELEMENTS (lit_unicode_lower_case_ranges);
830
831 for (int i = 0, j = 0; i < num_of_lowercase_ranges; i += 2, j++)
832 {
833 JERRY_ASSERT (lit_unicode_lower_case_range_lengths[j] > 0);
834 uint32_t range_length = (uint32_t) (lit_unicode_lower_case_range_lengths[j] - 1);
835 lit_code_point_t start_point = lit_unicode_lower_case_ranges[i];
836
837 if (start_point <= cp && cp <= start_point + range_length)
838 {
839 lowercase_cp = lit_unicode_lower_case_ranges[i + 1] + (cp - start_point);
840 if (builder_p != NULL)
841 {
842 ecma_stringbuilder_append_codepoint (builder_p, lowercase_cp);
843 }
844
845 return lowercase_cp;
846 }
847 }
848
849 return lit_search_in_conversion_table ((ecma_char_t) cp,
850 builder_p,
851 lit_unicode_lower_case_conversions,
852 lit_unicode_lower_case_conversion_counters);
853#else /* !ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
854 if (builder_p != NULL)
855 {
856 ecma_stringbuilder_append_codepoint (builder_p, cp);
857 }
858
859 return cp;
860#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
861} /* lit_char_to_lower_case */
862
863/**
864 * Append the converted uppercase codeunit sequence of an a given codepoint into the stringbuilder if it is present.
865 *
866 * @return LIT_MULTIPLE_CU if the converted codepoint consist more than a single code unit
867 * converted code point - otherwise
868 */
869lit_code_point_t
870lit_char_to_upper_case (lit_code_point_t cp, /**< code point */
871 ecma_stringbuilder_t *builder_p) /**< string builder */
872{
873 if (cp <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
874 {
875 if (cp >= LIT_CHAR_LOWERCASE_A && cp <= LIT_CHAR_LOWERCASE_Z)
876 {
877 cp = (lit_utf8_byte_t) (cp - (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A));
878 }
879
880 if (builder_p != NULL)
881 {
882 ecma_stringbuilder_append_byte (builder_p, (lit_utf8_byte_t) cp);
883 }
884
885 return cp;
886 }
887
888#if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
889 lit_code_point_t uppercase_cp = lit_search_in_bidirectional_conversion_tables (cp, false);
890
891 if (uppercase_cp != LIT_INVALID_CP)
892 {
893 if (builder_p != NULL)
894 {
895 ecma_stringbuilder_append_codepoint (builder_p, uppercase_cp);
896 }
897
898 return uppercase_cp;
899 }
900
901 int num_of_upper_case_special_ranges = NUM_OF_ELEMENTS (lit_unicode_upper_case_special_ranges);
902
903 for (int i = 0, j = 0; i < num_of_upper_case_special_ranges; i += 3, j++)
904 {
905 uint32_t range_length = lit_unicode_upper_case_special_range_lengths[j];
906 ecma_char_t start_point = lit_unicode_upper_case_special_ranges[i];
907
908 if (start_point <= cp && cp <= start_point + range_length)
909 {
910 if (builder_p != NULL)
911 {
912 uppercase_cp = lit_unicode_upper_case_special_ranges[i + 1] + (cp - start_point);
913 ecma_stringbuilder_append_codepoint (builder_p, uppercase_cp);
914 ecma_stringbuilder_append_codepoint (builder_p, lit_unicode_upper_case_special_ranges[i + 2]);
915 }
916
917 return LIT_MULTIPLE_CU;
918 }
919 }
920
921 return lit_search_in_conversion_table ((ecma_char_t) cp,
922 builder_p,
923 lit_unicode_upper_case_conversions,
924 lit_unicode_upper_case_conversion_counters);
925#else /* !ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
926 if (builder_p != NULL)
927 {
928 ecma_stringbuilder_append_codepoint (builder_p, cp);
929 }
930
931 return cp;
932#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
933} /* lit_char_to_upper_case */
934
935#if ENABLED (JERRY_ESNEXT)
936/*
937 * Look up whether the character should be folded to the lowercase variant.
938 *
939 * @return true, if character should be lowercased
940 * false, otherwise
941 */
942bool
943lit_char_fold_to_lower (lit_code_point_t cp) /**< code point */
944{
945#if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
946 return (cp <= LIT_UTF8_1_BYTE_CODE_POINT_MAX
947 || cp > LIT_UTF16_CODE_UNIT_MAX
948 || (!lit_search_char_in_interval_array ((ecma_char_t) cp,
949 lit_unicode_folding_skip_to_lower_interval_starts,
950 lit_unicode_folding_skip_to_lower_interval_lengths,
951 NUM_OF_ELEMENTS (lit_unicode_folding_skip_to_lower_interval_starts))
952 && !lit_search_char_in_array ((ecma_char_t) cp,
953 lit_unicode_folding_skip_to_lower_chars,
954 NUM_OF_ELEMENTS (lit_unicode_folding_skip_to_lower_chars))));
955#else /* !ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
956 return true;
957#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
958} /* lit_char_fold_to_lower */
959
960/*
961 * Look up whether the character should be folded to the uppercase variant.
962 *
963 * @return true, if character should be uppercased
964 * false, otherwise
965 */
966bool
967lit_char_fold_to_upper (lit_code_point_t cp) /**< code point */
968{
969#if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
970 return (cp > LIT_UTF8_1_BYTE_CODE_POINT_MAX
971 && cp <= LIT_UTF16_CODE_UNIT_MAX
972 && (lit_search_char_in_interval_array ((ecma_char_t) cp,
973 lit_unicode_folding_to_upper_interval_starts,
974 lit_unicode_folding_to_upper_interval_lengths,
975 NUM_OF_ELEMENTS (lit_unicode_folding_to_upper_interval_starts))
976 || lit_search_char_in_array ((ecma_char_t) cp,
977 lit_unicode_folding_to_upper_chars,
978 NUM_OF_ELEMENTS (lit_unicode_folding_to_upper_chars))));
979#else /* !ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
980 return false;
981#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
982} /* lit_char_fold_to_upper */
983#endif /* ENABLED (JERRY_ESNEXT) */
984
985/**
986 * Helper method to find a specific character in a string
987 *
988 * Used by:
989 * ecma_builtin_string_prototype_object_replace_helper
990 *
991 * @return true - if the given character is in the string
992 * false - otherwise
993 */
994bool
995lit_find_char_in_string (ecma_string_t *str_p, /**< source string */
996 lit_utf8_byte_t c) /**< character to find*/
997{
998 ECMA_STRING_TO_UTF8_STRING (str_p, start_p, start_size);
999
1000 const lit_utf8_byte_t *str_curr_p = start_p;
1001 const lit_utf8_byte_t *str_end_p = start_p + start_size;
1002 bool have_char = false;
1003
1004 while (str_curr_p < str_end_p)
1005 {
1006 if (*str_curr_p++ == c)
1007 {
1008 have_char = true;
1009 break;
1010 }
1011 }
1012
1013 ECMA_FINALIZE_UTF8_STRING (start_p, start_size);
1014
1015 return have_char;
1016} /* lit_find_char_in_string */
1017