lit-char-helpers.c source code [jerryscript/jerry-core/lit/lit-char-helpers.c]

/* Copyright JS Foundation and other contributors, http://js.foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15
16	#include "config.h"
17	#include "ecma-helpers.h"
18	#include "lit-char-helpers.h"
19	#include "lit-strings.h"
20	#include "lit-unicode-ranges.inc.h"
21	#if ENABLED (JERRY_ESNEXT)
22	#include "lit-unicode-ranges-sup.inc.h"
23	#endif /* ENABLED (JERRY_ESNEXT) */
24
25	#if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
26	#include "lit-unicode-conversions.inc.h"
27	#if ENABLED (JERRY_ESNEXT)
28	#include "lit-unicode-conversions-sup.inc.h"
29	#include "lit-unicode-folding.inc.h"
30	#endif /* ENABLED (JERRY_ESNEXT) */
31	#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
32
/**
 * Number of elements of a true array object.
 *
 * Note: the argument must be an array, not a pointer (e.g. not a decayed
 *       function parameter), otherwise the result is meaningless.
 */
#define NUM_OF_ELEMENTS(array) (sizeof (array) / sizeof (*(array)))
34
35	/**
36	* Binary search algorithm that searches the a
37	* character in the given char array.
38	*
39	* @return true - if the character is in the given array
40	* false - otherwise
41	*/
42	#define LIT_SEARCH_CHAR_IN_ARRAY_FN(function_name, char_type, array_type) \
43	static bool \
44	function_name (char_type c, /*< code unit / \
45	const array_type array, /< array / \
46	int size_of_array) /*< length of the array /\
47	{ \
48	int bottom = 0; \
49	int top = size_of_array - 1; \
50	\
51	while (bottom <= top) \
52	{ \
53	int middle = (bottom + top) / 2; \
54	char_type current = array[middle]; \
55	\
56	if (current == c) \
57	{ \
58	return true; \
59	} \
60	\
61	if (c < current) \
62	{ \
63	top = middle - 1; \
64	} \
65	else \
66	{ \
67	bottom = middle + 1; \
68	} \
69	} \
70	\
71	return false; \
72	} /* __function_name */
73
74	LIT_SEARCH_CHAR_IN_ARRAY_FN (lit_search_char_in_array, ecma_char_t, uint16_t)
75
76	#if ENABLED (JERRY_ESNEXT)
77	LIT_SEARCH_CHAR_IN_ARRAY_FN (lit_search_codepoint_in_array, lit_code_point_t, uint32_t)
78	#endif /* ENABLED (JERRY_ESNEXT) */
79
80	/**
81	* Binary search algorithm that searches a character in the given intervals.
82	* Intervals specifed by two arrays. The first one contains the starting points
83	* of the intervals, the second one contains the length of them.
84	*
85	* @return true - if the the character is included (inclusively) in one of the intervals in the given array
86	* false - otherwise
87	*/
88	#define LIT_SEARCH_CHAR_IN_INTERVAL_ARRAY_FN(function_name, char_type, array_type, interval_type) \
89	static bool \
90	function_name (char_type c, /*< code unit / \
91	const array_type array_sp, /< array of interval starting points / \
92	const interval_type lengths, /< array of interval lengths / \
93	int size_of_array) /*< length of the array / \
94	{ \
95	int bottom = 0; \
96	int top = size_of_array - 1; \
97	\
98	while (bottom <= top) \
99	{ \
100	int middle = (bottom + top) / 2; \
101	char_type current_sp = array_sp[middle]; \
102	\
103	if (current_sp <= c && c <= current_sp + lengths[middle]) \
104	{ \
105	return true; \
106	} \
107	\
108	if (c > current_sp) \
109	{ \
110	bottom = middle + 1; \
111	} \
112	else \
113	{ \
114	top = middle - 1; \
115	} \
116	} \
117	\
118	return false; \
119	} /* function_name */
120
121	LIT_SEARCH_CHAR_IN_INTERVAL_ARRAY_FN (lit_search_char_in_interval_array, ecma_char_t, uint16_t, uint8_t)
122
123	#if ENABLED (JERRY_ESNEXT)
124	LIT_SEARCH_CHAR_IN_INTERVAL_ARRAY_FN (lit_search_codepoint_in_interval_array, lit_code_point_t, uint32_t, uint16_t)
125	#endif /* ENABLED (JERRY_ESNEXT) */
126
127	/**
128	* Check if specified character is one of the Whitespace characters including those that fall into
129	* "Space, Separator" ("Zs") Unicode character category or one of the Line Terminator characters.
130	*
131	* @return true - if the character is one of characters, listed in ECMA-262 v5, Table 2,
132	* false - otherwise
133	*/
134	bool
135	lit_char_is_white_space (lit_code_point_t c) /< code point /*
136	{
137	if (c <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
138	{
139	return (c == LIT_CHAR_SP \|\| (c >= LIT_CHAR_TAB && c <= LIT_CHAR_CR));
140	}
141
142	if (c == LIT_CHAR_BOM
143	#if !ENABLED (JERRY_ESNEXT)
144	/ Mongolian Vowel Separator (u180e) used to be a whitespace character. /
145	\|\| c == LIT_CHAR_MVS
146	#endif /* !ENABLED (JERRY_ESNEXT) */
147	\|\| c == LIT_CHAR_LS
148	\|\| c == LIT_CHAR_PS)
149	{
150	return true;
151	}
152
153	return (c <= LIT_UTF16_CODE_UNIT_MAX
154	&& ((c >= lit_unicode_white_space_interval_starts[`0`]
155	&& c <= lit_unicode_white_space_interval_starts[`0`] + lit_unicode_white_space_interval_lengths[`0`])
156	\|\| lit_search_char_in_array ((ecma_char_t) c,
157	lit_unicode_white_space_chars,
158	NUM_OF_ELEMENTS (lit_unicode_white_space_chars))));
159	} / lit_char_is_white_space /
160
161	/**
162	* Check if specified character is one of LineTerminator characters
163	*
164	* @return true - if the character is one of characters, listed in ECMA-262 v5, Table 3,
165	* false - otherwise
166	*/
167	bool
168	lit_char_is_line_terminator (ecma_char_t c) /< code unit /*
169	{
170	return (c == LIT_CHAR_LF
171	\|\| c == LIT_CHAR_CR
172	\|\| c == LIT_CHAR_LS
173	\|\| c == LIT_CHAR_PS);
174	} / lit_char_is_line_terminator /
175
176	/**
177	* Check if specified character is a Unicode ID_Start
178	*
179	* See also:
180	* ECMA-262 v1, 11.6: UnicodeIDStart
181	*
182	* @return true - if the codepoint has Unicode property "ID_Start"
183	* false - otherwise
184	*/
185	static bool
186	lit_char_is_unicode_id_start (lit_code_point_t code_point) /< code unit /*
187	{
188	#if ENABLED (JERRY_ESNEXT)
189	if (JERRY_UNLIKELY (code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN))
190	{
191	return (lit_search_codepoint_in_interval_array (code_point,
192	lit_unicode_id_start_interval_starts_sup,
193	lit_unicode_id_start_interval_lengths_sup,
194	NUM_OF_ELEMENTS (lit_unicode_id_start_interval_starts_sup))
195	\|\| lit_search_codepoint_in_array (code_point,
196	lit_unicode_id_start_chars_sup,
197	NUM_OF_ELEMENTS (lit_unicode_id_start_chars_sup)));
198	}
199	#else /* !ENABLED (JERRY_ESNEXT) */
200	JERRY_ASSERT (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN);
201	#endif /* ENABLED (JERRY_ESNEXT) */
202
203	ecma_char_t c = (ecma_char_t) code_point;
204
205	return (lit_search_char_in_interval_array (c,
206	lit_unicode_id_start_interval_starts,
207	lit_unicode_id_start_interval_lengths,
208	NUM_OF_ELEMENTS (lit_unicode_id_start_interval_starts))
209	\|\| lit_search_char_in_array (c, lit_unicode_id_start_chars, NUM_OF_ELEMENTS (lit_unicode_id_start_chars)));
210	} / lit_char_is_unicode_id_start /
211
212	/**
213	* Check if specified character is a Unicode ID_Continue
214	*
215	* See also:
216	* ECMA-262 v1, 11.6: UnicodeIDContinue
217	*
218	* @return true - if the codepoint has Unicode property "ID_Continue"
219	* false - otherwise
220	*/
221	static bool
222	lit_char_is_unicode_id_continue (lit_code_point_t code_point) /< code unit /*
223	{
224	/ Each ID_Start codepoint is ID_Continue as well. /
225	if (lit_char_is_unicode_id_start (code_point))
226	{
227	return true;
228	}
229
230	#if ENABLED (JERRY_ESNEXT)
231	if (JERRY_UNLIKELY (code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN))
232	{
233	return (lit_search_codepoint_in_interval_array (code_point,
234	lit_unicode_id_continue_interval_starts_sup,
235	lit_unicode_id_continue_interval_lengths_sup,
236	NUM_OF_ELEMENTS (lit_unicode_id_continue_interval_starts_sup))
237	\|\| lit_search_codepoint_in_array (code_point,
238	lit_unicode_id_continue_chars_sup,
239	NUM_OF_ELEMENTS (lit_unicode_id_continue_chars_sup)));
240	}
241	#else /* !ENABLED (JERRY_ESNEXT) */
242	JERRY_ASSERT (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN);
243	#endif /* ENABLED (JERRY_ESNEXT) */
244
245	ecma_char_t c = (ecma_char_t) code_point;
246
247	return (lit_search_char_in_interval_array (c,
248	lit_unicode_id_continue_interval_starts,
249	lit_unicode_id_continue_interval_lengths,
250	NUM_OF_ELEMENTS (lit_unicode_id_continue_interval_starts))
251	\|\| lit_search_char_in_array (c,
252	lit_unicode_id_continue_chars,
253	NUM_OF_ELEMENTS (lit_unicode_id_continue_chars)));
254	} / lit_char_is_unicode_id_continue /
255
256	/**
257	* Checks whether the character is a valid identifier start.
258	*
259	* @return true if it is.
260	*/
261	bool
262	lit_code_point_is_identifier_start (lit_code_point_t code_point) /< code point /*
263	{
264	/ Fast path for ASCII-defined letters. /
265	if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
266	{
267	return ((LEXER_TO_ASCII_LOWERCASE (code_point) >= LIT_CHAR_LOWERCASE_A
268	&& LEXER_TO_ASCII_LOWERCASE (code_point) <= LIT_CHAR_LOWERCASE_Z)
269	\|\| code_point == LIT_CHAR_DOLLAR_SIGN
270	\|\| code_point == LIT_CHAR_UNDERSCORE);
271	}
272
273	return lit_char_is_unicode_id_start (code_point);
274	} / lit_code_point_is_identifier_start /
275
276	/**
277	* Checks whether the character is a valid identifier part.
278	*
279	* @return true if it is.
280	*/
281	bool
282	lit_code_point_is_identifier_part (lit_code_point_t code_point) /< code point /*
283	{
284	/ Fast path for ASCII-defined letters. /
285	if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
286	{
287	return ((LEXER_TO_ASCII_LOWERCASE (code_point) >= LIT_CHAR_LOWERCASE_A
288	&& LEXER_TO_ASCII_LOWERCASE (code_point) <= LIT_CHAR_LOWERCASE_Z)
289	\|\| (code_point >= LIT_CHAR_0 && code_point <= LIT_CHAR_9)
290	\|\| code_point == LIT_CHAR_DOLLAR_SIGN
291	\|\| code_point == LIT_CHAR_UNDERSCORE);
292	}
293
294	return lit_char_is_unicode_id_continue (code_point);
295	} / lit_code_point_is_identifier_part /
296
297	/**
298	* Check if specified character is one of OctalDigit characters (ECMA-262 v5, B.1.2)
299	*
300	* @return true / false
301	*/
302	bool
303	lit_char_is_octal_digit (ecma_char_t c) /< code unit /*
304	{
305	return (c >= LIT_CHAR_ASCII_OCTAL_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_OCTAL_DIGITS_END);
306	} / lit_char_is_octal_digit /
307
308	/**
309	* Check if specified character is one of DecimalDigit characters (ECMA-262 v5, 7.8.3)
310	*
311	* @return true / false
312	*/
313	bool
314	lit_char_is_decimal_digit (ecma_char_t c) /< code unit /*
315	{
316	return (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END);
317	} / lit_char_is_decimal_digit /
318
319	/**
320	* Check if specified character is one of HexDigit characters (ECMA-262 v5, 7.8.3)
321	*
322	* @return true / false
323	*/
324	bool
325	lit_char_is_hex_digit (ecma_char_t c) /< code unit /*
326	{
327	return ((c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END)
328	\|\| (LEXER_TO_ASCII_LOWERCASE (c) >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN
329	&& LEXER_TO_ASCII_LOWERCASE (c) <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_END));
330	} / lit_char_is_hex_digit /
331
332	#if ENABLED (JERRY_ESNEXT)
333	/**
334	* Check if specified character is one of BinaryDigits characters (ECMA-262 v6, 11.8.3)
335	*
336	* @return true / false
337	*/
338	bool
339	lit_char_is_binary_digit (ecma_char_t c) /* code unit /
340	{
341	return (c == LIT_CHAR_0 \|\| c == LIT_CHAR_1);
342	} / lit_char_is_binary_digit /
343	#endif /* ENABLED (JERRY_ESNEXT) */
344
345	/**
346	* UnicodeEscape abstract method
347	*
348	* See also: ECMA-262 v10, 24.5.2.3
349	*/
350	void
351	lit_char_unicode_escape (ecma_stringbuilder_t builder_p, /*< stringbuilder to append /*
352	ecma_char_t c) /< code unit to convert /*
353	{
354	ecma_stringbuilder_append_raw (builder_p, (lit_utf8_byte_t *) "\\u", `2`);
355
356	for (int8_t i = `3`; i >= `0`; i--)
357	{
358	int32_t result_char = (c >> (i * `4`)) & `0xF`;
359	ecma_stringbuilder_append_byte (builder_p, (lit_utf8_byte_t) (result_char + (result_char <= `9`
360	? LIT_CHAR_0
361	: (LIT_CHAR_LOWERCASE_A - `10`))));
362	}
363	} / lit_char_unicode_escape /
364
365	/**
366	* Convert a HexDigit character to its numeric value, as defined in ECMA-262 v5, 7.8.3
367	*
368	* @return digit value, corresponding to the hex char
369	*/
370	uint32_t
371	lit_char_hex_to_int (ecma_char_t c) /< code unit, corresponding to
372	* one of HexDigit characters */
373	{
374	JERRY_ASSERT (lit_char_is_hex_digit (c));
375
376	if (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END)
377	{
378	return (uint32_t) (c - LIT_CHAR_ASCII_DIGITS_BEGIN);
379	}
380	else if (c >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN && c <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_END)
381	{
382	return (uint32_t) (c - LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN + `10`);
383	}
384	else
385	{
386	return (uint32_t) (c - LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN + `10`);
387	}
388	} / lit_char_hex_to_int /
389
390	/**
391	* Converts a character to UTF8 bytes.
392	*
393	* @return length of the UTF8 representation.
394	*/
395	size_t
396	lit_code_point_to_cesu8_bytes (uint8_t dst_p, /*< destination buffer /*
397	lit_code_point_t code_point) /< code point /*
398	{
399	if (code_point < LIT_UTF8_2_BYTE_CODE_POINT_MIN)
400	{
401	/ 00000000 0xxxxxxx -> 0xxxxxxx /
402	dst_p[`0`] = (uint8_t) code_point;
403	return `1`;
404	}
405
406	if (code_point < LIT_UTF8_3_BYTE_CODE_POINT_MIN)
407	{
408	/ 00000yyy yyxxxxxx -> 110yyyyy 10xxxxxx /
409	dst_p[`0`] = (uint8_t) (LIT_UTF8_2_BYTE_MARKER \| ((code_point >> `6`) & LIT_UTF8_LAST_5_BITS_MASK));
410	dst_p[`1`] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER \| (code_point & LIT_UTF8_LAST_6_BITS_MASK));
411	return `2`;
412	}
413
414	if (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN)
415	{
416	/ zzzzyyyy yyxxxxxx -> 1110zzzz 10yyyyyy 10xxxxxx /
417	dst_p[`0`] = (uint8_t) (LIT_UTF8_3_BYTE_MARKER \| ((code_point >> `12`) & LIT_UTF8_LAST_4_BITS_MASK));
418	dst_p[`1`] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER \| ((code_point >> `6`) & LIT_UTF8_LAST_6_BITS_MASK));
419	dst_p[`2`] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER \| (code_point & LIT_UTF8_LAST_6_BITS_MASK));
420	return `3`;
421	}
422
423	JERRY_ASSERT (code_point <= LIT_UNICODE_CODE_POINT_MAX);
424
425	code_point -= LIT_UTF8_4_BYTE_CODE_POINT_MIN;
426
427	dst_p[`0`] = (uint8_t) (LIT_UTF8_3_BYTE_MARKER \| `0xd`);
428	dst_p[`1`] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER \| `0x20` \| ((code_point >> `16`) & LIT_UTF8_LAST_4_BITS_MASK));
429	dst_p[`2`] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER \| ((code_point >> `10`) & LIT_UTF8_LAST_6_BITS_MASK));
430
431	dst_p[`3`] = (uint8_t) (LIT_UTF8_3_BYTE_MARKER \| `0xd`);
432	dst_p[`4`] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER \| `0x30` \| ((code_point >> `6`) & LIT_UTF8_LAST_4_BITS_MASK));
433	dst_p[`5`] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER \| (code_point & LIT_UTF8_LAST_6_BITS_MASK));
434
435	return `3` * `2`;
436	} / lit_code_point_to_cesu8_bytes /
437
438	/**
439	* Returns the length of the UTF8 representation of a character.
440	*
441	* @return length of the UTF8 representation.
442	*/
443	size_t
444	lit_code_point_get_cesu8_length (lit_code_point_t code_point) /< code point /*
445	{
446	if (code_point < LIT_UTF8_2_BYTE_CODE_POINT_MIN)
447	{
448	/ 00000000 0xxxxxxx /
449	return `1`;
450	}
451
452	if (code_point < LIT_UTF8_3_BYTE_CODE_POINT_MIN)
453	{
454	/ 00000yyy yyxxxxxx /
455	return `2`;
456	}
457
458	if (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN)
459	{
460	/ zzzzyyyy yyxxxxxx /
461	return `3`;
462	}
463
464	/ high + low surrogate /
465	return `2` * `3`;
466	} / lit_code_point_get_cesu8_length /
467
468	/**
469	* Convert a four byte long utf8 character to two three byte long cesu8 characters
470	*/
471	void
472	lit_four_byte_utf8_char_to_cesu8 (uint8_t dst_p, /*< destination buffer /*
473	const uint8_t source_p) /*< source buffer /*
474	{
475	lit_code_point_t code_point = ((((uint32_t) source_p[`0`]) & LIT_UTF8_LAST_3_BITS_MASK) << `18`);
476	code_point \|= ((((uint32_t) source_p[`1`]) & LIT_UTF8_LAST_6_BITS_MASK) << `12`);
477	code_point \|= ((((uint32_t) source_p[`2`]) & LIT_UTF8_LAST_6_BITS_MASK) << `6`);
478	code_point \|= (((uint32_t) source_p[`3`]) & LIT_UTF8_LAST_6_BITS_MASK);
479
480	lit_code_point_to_cesu8_bytes (dst_p, code_point);
481	} / lit_four_byte_utf8_char_to_cesu8 /
482
483	/**
484	* Lookup hex digits in a buffer
485	*
486	* @return UINT32_MAX - if next 'lookup' number of characters do not form a valid hex number
487	* value of hex number, otherwise
488	*/
489	uint32_t
490	lit_char_hex_lookup (const lit_utf8_byte_t buf_p, /*< buffer /*
491	const lit_utf8_byte_t *const buf_end_p, /< buffer end /*
492	uint32_t lookup) /< size of lookup /*
493	{
494	JERRY_ASSERT (lookup <= `4`);
495
496	if (JERRY_UNLIKELY (buf_p + lookup > buf_end_p))
497	{
498	return UINT32_MAX;
499	}
500
501	uint32_t value = `0`;
502
503	while (lookup--)
504	{
505	lit_utf8_byte_t ch = *buf_p++;
506	if (!lit_char_is_hex_digit (ch))
507	{
508	return UINT32_MAX;
509	}
510
511	value <<= `4`;
512	value += lit_char_hex_to_int (ch);
513	}
514
515	JERRY_ASSERT (value <= LIT_UTF16_CODE_UNIT_MAX);
516	return value;
517	} / lit_char_hex_lookup /
518
519	/**
520	* Parse a decimal number with the value clamped to UINT32_MAX.
521	*
522	* @returns uint32_t number
523	*/
524	uint32_t
525	lit_parse_decimal (const lit_utf8_byte_t *buffer_p, /*< [in/out] character buffer /*
526	const lit_utf8_byte_t buffer_end_p) /*< buffer end /*
527	{
528	const lit_utf8_byte_t current_p = buffer_p;
529	JERRY_ASSERT (lit_char_is_decimal_digit (*current_p));
530
531	uint32_t value = (uint32_t) (*current_p++ - LIT_CHAR_0);
532
533	while (current_p < buffer_end_p && lit_char_is_decimal_digit (*current_p))
534	{
535	const uint32_t digit = (uint32_t) (*current_p++ - LIT_CHAR_0);
536	uint32_t new_value = value * `10` + digit;
537
538	if (JERRY_UNLIKELY (value > UINT32_MAX / `10`) \|\| JERRY_UNLIKELY (new_value < value))
539	{
540	value = UINT32_MAX;
541	continue;
542	}
543
544	value = new_value;
545	}
546
547	*buffer_p = current_p;
548	return value;
549	} / lit_parse_decimal /
550
551	/**
552	* Check if specified character is a word character (part of IsWordChar abstract operation)
553	*
554	* See also: ECMA-262 v5, 15.10.2.6 (IsWordChar)
555	*
556	* @return true - if the character is a word character
557	* false - otherwise
558	*/
559	bool
560	lit_char_is_word_char (lit_code_point_t c) /< code point /*
561	{
562	return ((c >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && c <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END)
563	\|\| (c >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && c <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END)
564	\|\| (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END)
565	\|\| c == LIT_CHAR_UNDERSCORE);
566	} / lit_char_is_word_char /
567
568	#if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
569
570	/**
571	* Check if the specified character is in one of those tables which contain bidirectional conversions.
572	*
573	* @return codepoint of the converted character if it is found the the tables
574	* LIT_INVALID_CP - otherwise.
575	*/
576	static lit_code_point_t
577	lit_search_in_bidirectional_conversion_tables (lit_code_point_t cp, /< code point /*
578	bool is_lowercase) /< is lowercase conversion /*
579	{
580	/ 1, Check if the specified character is part of the lit_unicode_character_case_ranges_{sup} table. /
581	int number_of_case_ranges;
582	#if ENABLED (JERRY_ESNEXT)
583	bool is_supplementary = cp > LIT_UTF16_CODE_UNIT_MAX;
584	if (is_supplementary)
585	{
586	number_of_case_ranges = NUM_OF_ELEMENTS (lit_unicode_character_case_ranges_sup);
587	}
588	else
589	#endif /* ENABLED (JERRY_ESNEXT) */
590	{
591	number_of_case_ranges = NUM_OF_ELEMENTS (lit_unicode_character_case_ranges);
592	}
593
594	int conv_counter = `0`;
595
596	for (int i = `0`; i < number_of_case_ranges; i++)
597	{
598	if (i % `2` == `0` && i > `0`)
599	{
600	conv_counter++;
601	}
602
603	size_t range_length;
604	lit_code_point_t start_point;
605	#if ENABLED (JERRY_ESNEXT)
606	if (is_supplementary)
607	{
608	range_length = lit_unicode_character_case_range_lengths_sup[conv_counter];
609	start_point = lit_unicode_character_case_ranges_sup[i];
610	}
611	else
612	#endif /* ENABLED (JERRY_ESNEXT) */
613	{
614	range_length = lit_unicode_character_case_range_lengths[conv_counter];
615	start_point = lit_unicode_character_case_ranges[i];
616	}
617
618	if (start_point > cp \|\| cp >= start_point + range_length)
619	{
620	continue;
621	}
622
623	uint32_t char_dist = (uint32_t) cp - start_point;
624	int offset;
625	if (i % `2` == `0`)
626	{
627	if (!is_lowercase)
628	{
629	return cp;
630	}
631
632	offset = i + `1`;
633	}
634	else
635	{
636	if (is_lowercase)
637	{
638	return cp;
639	}
640
641	offset = i - `1`;
642	}
643
644	#if ENABLED (JERRY_ESNEXT)
645	if (is_supplementary)
646	{
647	start_point = lit_unicode_character_case_ranges_sup[offset];
648	}
649	else
650	#endif /* ENABLED (JERRY_ESNEXT) */
651	{
652	start_point = lit_unicode_character_case_ranges[offset];
653	}
654
655	return (lit_code_point_t) (start_point + char_dist);
656	}
657
658	/ Note: After this point based on the latest unicode standard(13.0.0.6) no conversion characters are*
659	defined for supplementary planes /*
660	#if ENABLED (JERRY_ESNEXT)
661	if (is_supplementary)
662	{
663	return cp;
664	}
665	#endif /* ENABLED (JERRY_ESNEXT) */
666
667	/ 2, Check if the specified character is part of the character_pair_ranges table. /
668	int bottom = `0`;
669	int top = NUM_OF_ELEMENTS (lit_unicode_character_pair_ranges) - `1`;
670
671	while (bottom <= top)
672	{
673	int middle = (bottom + top) / `2`;
674	lit_code_point_t current_sp = lit_unicode_character_pair_ranges[middle];
675
676	if (current_sp <= cp && cp < current_sp + lit_unicode_character_pair_range_lengths[middle])
677	{
678	uint32_t char_dist = (uint32_t) (cp - current_sp);
679
680	if ((cp - current_sp) % `2` == `0`)
681	{
682	return is_lowercase ? (lit_code_point_t) (current_sp + char_dist + `1`) : cp;
683	}
684
685	return is_lowercase ? cp : (lit_code_point_t) (current_sp + char_dist - `1`);
686	}
687
688	if (cp > current_sp)
689	{
690	bottom = middle + `1`;
691	}
692	else
693	{
694	top = middle - `1`;
695	}
696	}
697
698	/ 3, Check if the specified character is part of the character_pairs table. /
699	int number_of_character_pairs = NUM_OF_ELEMENTS (lit_unicode_character_pairs);
700
701	for (int i = `0`; i < number_of_character_pairs; i++)
702	{
703	if (cp != lit_unicode_character_pairs[i])
704	{
705	continue;
706	}
707
708	if (i % `2` == `0`)
709	{
710	return is_lowercase ? lit_unicode_character_pairs[i + `1`] : cp;
711	}
712
713	return is_lowercase ? cp : lit_unicode_character_pairs[i - `1`];
714	}
715
716	return LIT_INVALID_CP;
717	} / lit_search_in_bidirectional_conversion_tables /
718
719	/**
720	* Check if the specified character is in the given conversion table.
721	*
722	* @return LIT_MULTIPLE_CU if the converted character consist more than a single code unit
723	* converted code point - otherwise
724	*/
725	static lit_code_point_t
726	lit_search_in_conversion_table (ecma_char_t character, /< code unit /*
727	ecma_stringbuilder_t builder_p, /*< string builder /*
728	const ecma_char_t array, /*< array /*
729	const uint8_t counters) /*< case_values counter /*
730	{
731	int end_point = `0`;
732
733	for (int i = `0`; i < `3`; i++)
734	{
735	int start_point = end_point;
736	int size_of_case_value = i + `1`;
737	end_point += counters[i] * (size_of_case_value + `1`);
738
739	int bottom = start_point;
740	int top = end_point - size_of_case_value;
741
742	while (bottom <= top)
743	{
744	int middle = (bottom + top) / `2`;
745
746	middle -= ((middle - bottom) % (size_of_case_value + `1`));
747
748	ecma_char_t current = array[middle];
749
750	if (current == character)
751	{
752	if (builder_p != NULL)
753	{
754	ecma_stringbuilder_append_char (builder_p, array[middle + `1`]);
755
756	if (size_of_case_value > `1`)
757	{
758	ecma_stringbuilder_append_char (builder_p, array[middle + `2`]);
759	}
760	if (size_of_case_value > `2`)
761	{
762	ecma_stringbuilder_append_char (builder_p, array[middle + `3`]);
763	}
764	}
765
766	return size_of_case_value == `1` ? array[middle + `1`]: LIT_MULTIPLE_CU;
767	}
768
769	if (character < current)
770	{
771	top = middle - (size_of_case_value + `1`);
772	}
773	else
774	{
775	bottom = middle + (size_of_case_value + `1`);
776	}
777	}
778	}
779
780	if (builder_p != NULL)
781	{
782	ecma_stringbuilder_append_char (builder_p, character);
783	}
784
785	return (lit_code_point_t) character;
786	} / lit_search_in_conversion_table /
787	#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
788
789	/**
790	* Append the converted lowercase codeunit sequence of an a given codepoint into the stringbuilder if it is present.
791	*
792	* @return LIT_MULTIPLE_CU if the converted codepoint consist more than a single code unit
793	* converted code point - otherwise
794	*/
795	lit_code_point_t
796	lit_char_to_lower_case (lit_code_point_t cp, /< code point /*
797	ecma_stringbuilder_t builder_p) /*< string builder /*
798	{
799	if (cp <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
800	{
801	if (cp >= LIT_CHAR_UPPERCASE_A && cp <= LIT_CHAR_UPPERCASE_Z)
802	{
803	cp = (lit_utf8_byte_t) (cp + (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A));
804	}
805
806	if (builder_p != NULL)
807	{
808	ecma_stringbuilder_append_byte (builder_p, (lit_utf8_byte_t) cp);
809	}
810
811	return cp;
812	}
813
814	#if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
815	lit_code_point_t lowercase_cp = lit_search_in_bidirectional_conversion_tables (cp, true);
816
817	if (lowercase_cp != LIT_INVALID_CP)
818	{
819	if (builder_p != NULL)
820	{
821	ecma_stringbuilder_append_codepoint (builder_p, lowercase_cp);
822	}
823
824	return lowercase_cp;
825	}
826
827	JERRY_ASSERT (cp < LIT_UTF8_4_BYTE_CODE_POINT_MIN);
828
829	int num_of_lowercase_ranges = NUM_OF_ELEMENTS (lit_unicode_lower_case_ranges);
830
831	for (int i = `0`, j = `0`; i < num_of_lowercase_ranges; i += `2`, j++)
832	{
833	JERRY_ASSERT (lit_unicode_lower_case_range_lengths[j] > `0`);
834	uint32_t range_length = (uint32_t) (lit_unicode_lower_case_range_lengths[j] - `1`);
835	lit_code_point_t start_point = lit_unicode_lower_case_ranges[i];
836
837	if (start_point <= cp && cp <= start_point + range_length)
838	{
839	lowercase_cp = lit_unicode_lower_case_ranges[i + `1`] + (cp - start_point);
840	if (builder_p != NULL)
841	{
842	ecma_stringbuilder_append_codepoint (builder_p, lowercase_cp);
843	}
844
845	return lowercase_cp;
846	}
847	}
848
849	return lit_search_in_conversion_table ((ecma_char_t) cp,
850	builder_p,
851	lit_unicode_lower_case_conversions,
852	lit_unicode_lower_case_conversion_counters);
853	#else /* !ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
854	if (builder_p != NULL)
855	{
856	ecma_stringbuilder_append_codepoint (builder_p, cp);
857	}
858
859	return cp;
860	#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
861	} / lit_char_to_lower_case /
862
863	/**
864	* Append the converted uppercase codeunit sequence of an a given codepoint into the stringbuilder if it is present.
865	*
866	* @return LIT_MULTIPLE_CU if the converted codepoint consist more than a single code unit
867	* converted code point - otherwise
868	*/
869	lit_code_point_t
870	lit_char_to_upper_case (lit_code_point_t cp, /< code point /*
871	ecma_stringbuilder_t builder_p) /*< string builder /*
872	{
873	if (cp <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
874	{
875	if (cp >= LIT_CHAR_LOWERCASE_A && cp <= LIT_CHAR_LOWERCASE_Z)
876	{
877	cp = (lit_utf8_byte_t) (cp - (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A));
878	}
879
880	if (builder_p != NULL)
881	{
882	ecma_stringbuilder_append_byte (builder_p, (lit_utf8_byte_t) cp);
883	}
884
885	return cp;
886	}
887
888	#if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
889	lit_code_point_t uppercase_cp = lit_search_in_bidirectional_conversion_tables (cp, false);
890
891	if (uppercase_cp != LIT_INVALID_CP)
892	{
893	if (builder_p != NULL)
894	{
895	ecma_stringbuilder_append_codepoint (builder_p, uppercase_cp);
896	}
897
898	return uppercase_cp;
899	}
900
901	int num_of_upper_case_special_ranges = NUM_OF_ELEMENTS (lit_unicode_upper_case_special_ranges);
902
903	for (int i = `0`, j = `0`; i < num_of_upper_case_special_ranges; i += `3`, j++)
904	{
905	uint32_t range_length = lit_unicode_upper_case_special_range_lengths[j];
906	ecma_char_t start_point = lit_unicode_upper_case_special_ranges[i];
907
908	if (start_point <= cp && cp <= start_point + range_length)
909	{
910	if (builder_p != NULL)
911	{
912	uppercase_cp = lit_unicode_upper_case_special_ranges[i + `1`] + (cp - start_point);
913	ecma_stringbuilder_append_codepoint (builder_p, uppercase_cp);
914	ecma_stringbuilder_append_codepoint (builder_p, lit_unicode_upper_case_special_ranges[i + `2`]);
915	}
916
917	return LIT_MULTIPLE_CU;
918	}
919	}
920
921	return lit_search_in_conversion_table ((ecma_char_t) cp,
922	builder_p,
923	lit_unicode_upper_case_conversions,
924	lit_unicode_upper_case_conversion_counters);
925	#else /* !ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
926	if (builder_p != NULL)
927	{
928	ecma_stringbuilder_append_codepoint (builder_p, cp);
929	}
930
931	return cp;
932	#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
933	} / lit_char_to_upper_case /
934
935	#if ENABLED (JERRY_ESNEXT)
936	/*
937	* Look up whether the character should be folded to the lowercase variant.
938	*
939	* @return true, if character should be lowercased
940	* false, otherwise
941	*/
942	bool
943	lit_char_fold_to_lower (lit_code_point_t cp) /< code point /*
944	{
945	#if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
946	return (cp <= LIT_UTF8_1_BYTE_CODE_POINT_MAX
947	\|\| cp > LIT_UTF16_CODE_UNIT_MAX
948	\|\| (!lit_search_char_in_interval_array ((ecma_char_t) cp,
949	lit_unicode_folding_skip_to_lower_interval_starts,
950	lit_unicode_folding_skip_to_lower_interval_lengths,
951	NUM_OF_ELEMENTS (lit_unicode_folding_skip_to_lower_interval_starts))
952	&& !lit_search_char_in_array ((ecma_char_t) cp,
953	lit_unicode_folding_skip_to_lower_chars,
954	NUM_OF_ELEMENTS (lit_unicode_folding_skip_to_lower_chars))));
955	#else /* !ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
956	return true;
957	#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
958	} / lit_char_fold_to_lower /
959
960	/*
961	* Look up whether the character should be folded to the uppercase variant.
962	*
963	* @return true, if character should be uppercased
964	* false, otherwise
965	*/
966	bool
967	lit_char_fold_to_upper (lit_code_point_t cp) /< code point /*
968	{
969	#if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
970	return (cp > LIT_UTF8_1_BYTE_CODE_POINT_MAX
971	&& cp <= LIT_UTF16_CODE_UNIT_MAX
972	&& (lit_search_char_in_interval_array ((ecma_char_t) cp,
973	lit_unicode_folding_to_upper_interval_starts,
974	lit_unicode_folding_to_upper_interval_lengths,
975	NUM_OF_ELEMENTS (lit_unicode_folding_to_upper_interval_starts))
976	\|\| lit_search_char_in_array ((ecma_char_t) cp,
977	lit_unicode_folding_to_upper_chars,
978	NUM_OF_ELEMENTS (lit_unicode_folding_to_upper_chars))));
979	#else /* !ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
980	return false;
981	#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
982	} / lit_char_fold_to_upper /
983	#endif /* ENABLED (JERRY_ESNEXT) */
984
985	/**
986	* Helper method to find a specific character in a string
987	*
988	* Used by:
989	* ecma_builtin_string_prototype_object_replace_helper
990	*
991	* @return true - if the given character is in the string
992	* false - otherwise
993	*/
994	bool
995	lit_find_char_in_string (ecma_string_t str_p, /*< source string /*
996	lit_utf8_byte_t c) /< character to find/*
997	{
998	ECMA_STRING_TO_UTF8_STRING (str_p, start_p, start_size);
999
1000	const lit_utf8_byte_t *str_curr_p = start_p;
1001	const lit_utf8_byte_t *str_end_p = start_p + start_size;
1002	bool have_char = false;
1003
1004	while (str_curr_p < str_end_p)
1005	{
1006	if (*str_curr_p++ == c)
1007	{
1008	have_char = true;
1009	break;
1010	}
1011	}
1012
1013	ECMA_FINALIZE_UTF8_STRING (start_p, start_size);
1014
1015	return have_char;
1016	} / lit_find_char_in_string /
1017

Browse the source code of jerryscript/jerry-core/lit/lit-char-helpers.c