lit-strings.c source code [jerryscript/jerry-core/lit/lit-strings.c]

/* Copyright JS Foundation and other contributors, http://js.foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15
16	#include "lit-strings.h"
17
18	#include "jrt-libc-includes.h"
19
20	/**
21	* Validate utf-8 string
22	*
23	* NOTE:
24	* Isolated surrogates are allowed.
25	* Correct pair of surrogates is not allowed, it should be represented as 4-byte utf-8 character.
26	*
27	* @return true if utf-8 string is well-formed
28	* false otherwise
29	*/
30	bool
31	lit_is_valid_utf8_string (const lit_utf8_byte_t utf8_buf_p, /*< utf-8 string /*
32	lit_utf8_size_t buf_size) /< string size /*
33	{
34	lit_utf8_size_t idx = `0`;
35
36	bool is_prev_code_point_high_surrogate = false;
37	while (idx < buf_size)
38	{
39	lit_utf8_byte_t c = utf8_buf_p[idx++];
40	if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
41	{
42	is_prev_code_point_high_surrogate = false;
43	continue;
44	}
45
46	lit_code_point_t code_point = `0`;
47	lit_code_point_t min_code_point = `0`;
48	lit_utf8_size_t extra_bytes_count;
49	if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
50	{
51	extra_bytes_count = `1`;
52	min_code_point = LIT_UTF8_2_BYTE_CODE_POINT_MIN;
53	code_point = ((uint32_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
54	}
55	else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
56	{
57	extra_bytes_count = `2`;
58	min_code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN;
59	code_point = ((uint32_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
60	}
61	else if ((c & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER)
62	{
63	extra_bytes_count = `3`;
64	min_code_point = LIT_UTF8_4_BYTE_CODE_POINT_MIN;
65	code_point = ((uint32_t) (c & LIT_UTF8_LAST_3_BITS_MASK));
66	}
67	else
68	{
69	/ utf-8 string could not contain 5- and 6-byte sequences. /
70	return false;
71	}
72
73	if (idx + extra_bytes_count > buf_size)
74	{
75	/ utf-8 string breaks in the middle /
76	return false;
77	}
78
79	for (lit_utf8_size_t offset = `0`; offset < extra_bytes_count; ++offset)
80	{
81	c = utf8_buf_p[idx + offset];
82	if ((c & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
83	{
84	/ invalid continuation byte /
85	return false;
86	}
87	code_point <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
88	code_point \|= (c & LIT_UTF8_LAST_6_BITS_MASK);
89	}
90
91	if (code_point < min_code_point
92	\|\| code_point > LIT_UNICODE_CODE_POINT_MAX)
93	{
94	/ utf-8 string doesn't encode valid unicode code point /
95	return false;
96	}
97
98	if (code_point >= LIT_UTF16_HIGH_SURROGATE_MIN
99	&& code_point <= LIT_UTF16_HIGH_SURROGATE_MAX)
100	{
101	is_prev_code_point_high_surrogate = true;
102	}
103	else if (code_point >= LIT_UTF16_LOW_SURROGATE_MIN
104	&& code_point <= LIT_UTF16_LOW_SURROGATE_MAX
105	&& is_prev_code_point_high_surrogate)
106	{
107	/ sequence of high and low surrogate is not allowed /
108	return false;
109	}
110	else
111	{
112	is_prev_code_point_high_surrogate = false;
113	}
114
115	idx += extra_bytes_count;
116	}
117
118	return true;
119	} / lit_is_valid_utf8_string /
120
121	/**
122	* Validate cesu-8 string
123	*
124	* @return true if cesu-8 string is well-formed
125	* false otherwise
126	*/
127	bool
128	lit_is_valid_cesu8_string (const lit_utf8_byte_t cesu8_buf_p, /*< cesu-8 string /*
129	lit_utf8_size_t buf_size) /< string size /*
130	{
131	lit_utf8_size_t idx = `0`;
132
133	while (idx < buf_size)
134	{
135	lit_utf8_byte_t c = cesu8_buf_p[idx++];
136	if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
137	{
138	continue;
139	}
140
141	lit_code_point_t code_point = `0`;
142	lit_code_point_t min_code_point = `0`;
143	lit_utf8_size_t extra_bytes_count;
144	if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
145	{
146	extra_bytes_count = `1`;
147	min_code_point = LIT_UTF8_2_BYTE_CODE_POINT_MIN;
148	code_point = ((uint32_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
149	}
150	else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
151	{
152	extra_bytes_count = `2`;
153	min_code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN;
154	code_point = ((uint32_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
155	}
156	else
157	{
158	return false;
159	}
160
161	if (idx + extra_bytes_count > buf_size)
162	{
163	/ cesu-8 string breaks in the middle /
164	return false;
165	}
166
167	for (lit_utf8_size_t offset = `0`; offset < extra_bytes_count; ++offset)
168	{
169	c = cesu8_buf_p[idx + offset];
170	if ((c & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
171	{
172	/ invalid continuation byte /
173	return false;
174	}
175	code_point <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
176	code_point \|= (c & LIT_UTF8_LAST_6_BITS_MASK);
177	}
178
179	if (code_point < min_code_point)
180	{
181	/ cesu-8 string doesn't encode valid unicode code point /
182	return false;
183	}
184
185	idx += extra_bytes_count;
186	}
187
188	return true;
189	} / lit_is_valid_cesu8_string /
190
191	/**
192	* Check if the code point is UTF-16 low surrogate
193	*
194	* @return true / false
195	*/
196	bool
197	lit_is_code_point_utf16_low_surrogate (lit_code_point_t code_point) /< code point /*
198	{
199	return LIT_UTF16_LOW_SURROGATE_MIN <= code_point && code_point <= LIT_UTF16_LOW_SURROGATE_MAX;
200	} / lit_is_code_point_utf16_low_surrogate /
201
202	/**
203	* Check if the code point is UTF-16 high surrogate
204	*
205	* @return true / false
206	*/
207	bool
208	lit_is_code_point_utf16_high_surrogate (lit_code_point_t code_point) /< code point /*
209	{
210	return LIT_UTF16_HIGH_SURROGATE_MIN <= code_point && code_point <= LIT_UTF16_HIGH_SURROGATE_MAX;
211	} / lit_is_code_point_utf16_high_surrogate /
212
213	/**
214	* Represents code point (>0xFFFF) as surrogate pair and returns its lower part
215	*
216	* @return lower code_unit of the surrogate pair
217	*/
218	static ecma_char_t
219	convert_code_point_to_low_surrogate (lit_code_point_t code_point) /< code point, should be > 0xFFFF /*
220	{
221	JERRY_ASSERT (code_point > LIT_UTF16_CODE_UNIT_MAX);
222
223	ecma_char_t code_unit_bits;
224	code_unit_bits = (ecma_char_t) (code_point & LIT_UTF16_LAST_10_BITS_MASK);
225
226	return (ecma_char_t) (LIT_UTF16_LOW_SURROGATE_MARKER \| code_unit_bits);
227	} / convert_code_point_to_low_surrogate /
228
229	/**
230	* Represents code point (>0xFFFF) as surrogate pair and returns its higher part
231	*
232	* @return higher code_unit of the surrogate pair
233	*/
234	static ecma_char_t
235	convert_code_point_to_high_surrogate (lit_code_point_t code_point) /< code point, should be > 0xFFFF /*
236	{
237	JERRY_ASSERT (code_point > LIT_UTF16_CODE_UNIT_MAX);
238	JERRY_ASSERT (code_point <= LIT_UNICODE_CODE_POINT_MAX);
239
240	ecma_char_t code_unit_bits;
241	code_unit_bits = (ecma_char_t) ((code_point - LIT_UTF16_FIRST_SURROGATE_CODE_POINT) >> LIT_UTF16_BITS_IN_SURROGATE);
242
243	return (LIT_UTF16_HIGH_SURROGATE_MARKER \| code_unit_bits);
244	} / convert_code_point_to_high_surrogate /
245
246	/**
247	* UTF16 Encoding method for a code point
248	*
249	* See also:
250	* ECMA-262 v6, 10.1.1
251	*
252	* @return uint8_t, the number of returning code points
253	*/
254	uint8_t
255	lit_utf16_encode_code_point (lit_code_point_t cp, /< the code point we encode /*
256	ecma_char_t cu_p) /*< result of the encoding /*
257	{
258	if (cp <= LIT_UTF16_CODE_UNIT_MAX)
259	{
260	cu_p[`0`] = (ecma_char_t) cp;
261	return `1`;
262	}
263
264	cu_p[`0`] = convert_code_point_to_high_surrogate (cp);
265	cu_p[`1`] = convert_code_point_to_low_surrogate (cp);
266	return `2`;
267	} / lit_utf16_encode_code_point /
268
269	/**
270	* Calculate size of a zero-terminated utf-8 string
271	*
272	* NOTE:
273	* - string cannot be NULL
274	* - string should not contain zero characters in the middle
275	*
276	* @return size of a string
277	*/
278	lit_utf8_size_t
279	lit_zt_utf8_string_size (const lit_utf8_byte_t utf8_str_p) /*< zero-terminated utf-8 string /*
280	{
281	JERRY_ASSERT (utf8_str_p != NULL);
282	return (lit_utf8_size_t) strlen ((const char *) utf8_str_p);
283	} / lit_zt_utf8_string_size /
284
285	/**
286	* Calculate length of a cesu-8 encoded string
287	*
288	* @return UTF-16 code units count
289	*/
290	lit_utf8_size_t
291	lit_utf8_string_length (const lit_utf8_byte_t utf8_buf_p, /*< utf-8 string /*
292	lit_utf8_size_t utf8_buf_size) /< string size /*
293	{
294	lit_utf8_size_t length = `0`;
295	lit_utf8_size_t size = `0`;
296
297	while (size < utf8_buf_size)
298	{
299	size += lit_get_unicode_char_size_by_utf8_first_byte (*(utf8_buf_p + size));
300	length++;
301	}
302
303	JERRY_ASSERT (size == utf8_buf_size);
304
305	return length;
306	} / lit_utf8_string_length /
307
308	/**
309	* Calculate the required size of an utf-8 encoded string from cesu-8 encoded string
310	*
311	* @return size of an utf-8 encoded string
312	*/
313	lit_utf8_size_t
314	lit_get_utf8_size_of_cesu8_string (const lit_utf8_byte_t cesu8_buf_p, /*< cesu-8 string /*
315	lit_utf8_size_t cesu8_buf_size) /< string size /*
316	{
317	lit_utf8_size_t offset = `0`;
318	lit_utf8_size_t utf8_buf_size = cesu8_buf_size;
319	ecma_char_t prev_ch = `0`;
320
321	while (offset < cesu8_buf_size)
322	{
323	ecma_char_t ch;
324	offset += lit_read_code_unit_from_utf8 (cesu8_buf_p + offset, &ch);
325
326	if (lit_is_code_point_utf16_low_surrogate (ch) && lit_is_code_point_utf16_high_surrogate (prev_ch))
327	{
328	utf8_buf_size -= `2`;
329	}
330
331	prev_ch = ch;
332	}
333
334	JERRY_ASSERT (offset == cesu8_buf_size);
335
336	return utf8_buf_size;
337	} / lit_get_utf8_size_of_cesu8_string /
338
339	/**
340	* Calculate length of an utf-8 encoded string from cesu-8 encoded string
341	*
342	* @return length of an utf-8 encoded string
343	*/
344	lit_utf8_size_t
345	lit_get_utf8_length_of_cesu8_string (const lit_utf8_byte_t cesu8_buf_p, /*< cesu-8 string /*
346	lit_utf8_size_t cesu8_buf_size) /< string size /*
347	{
348	lit_utf8_size_t offset = `0`;
349	lit_utf8_size_t utf8_length = `0`;
350	ecma_char_t prev_ch = `0`;
351
352	while (offset < cesu8_buf_size)
353	{
354	ecma_char_t ch;
355	offset += lit_read_code_unit_from_utf8 (cesu8_buf_p + offset, &ch);
356
357	if (!lit_is_code_point_utf16_low_surrogate (ch) \|\| !lit_is_code_point_utf16_high_surrogate (prev_ch))
358	{
359	utf8_length++;
360	}
361
362	prev_ch = ch;
363	}
364
365	JERRY_ASSERT (offset == cesu8_buf_size);
366
367	return utf8_length;
368	} / lit_get_utf8_length_of_cesu8_string /
369
370	/**
371	* Decodes a unicode code point from non-empty utf-8-encoded buffer
372	*
373	* @return number of bytes occupied by code point in the string
374	*/
375	lit_utf8_size_t
376	lit_read_code_point_from_utf8 (const lit_utf8_byte_t buf_p, /*< buffer with characters /*
377	lit_utf8_size_t buf_size, /< size of the buffer in bytes /*
378	lit_code_point_t code_point) /*< [out] code point /*
379	{
380	JERRY_ASSERT (buf_p && buf_size);
381
382	lit_utf8_byte_t c = buf_p[`0`];
383	if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
384	{
385	*code_point = (lit_code_point_t) (c & LIT_UTF8_LAST_7_BITS_MASK);
386	return `1`;
387	}
388
389	lit_code_point_t ret = LIT_UNICODE_CODE_POINT_NULL;
390	lit_utf8_size_t bytes_count = `0`;
391	if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
392	{
393	bytes_count = `2`;
394	ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
395	}
396	else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
397	{
398	bytes_count = `3`;
399	ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
400	}
401	else
402	{
403	JERRY_ASSERT ((c & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER);
404	bytes_count = `4`;
405	ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_3_BITS_MASK));
406	}
407
408	JERRY_ASSERT (buf_size >= bytes_count);
409
410	for (uint32_t i = `1`; i < bytes_count; ++i)
411	{
412	ret <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
413	ret \|= (buf_p[i] & LIT_UTF8_LAST_6_BITS_MASK);
414	}
415
416	*code_point = ret;
417	return bytes_count;
418	} / lit_read_code_point_from_utf8 /
419
420	/**
421	* Decodes a unicode code unit from non-empty cesu-8-encoded buffer
422	*
423	* @return number of bytes occupied by code point in the string
424	*/
425	lit_utf8_size_t
426	lit_read_code_unit_from_utf8 (const lit_utf8_byte_t buf_p, /*< buffer with characters /*
427	ecma_char_t code_point) /*< [out] code point /*
428	{
429	JERRY_ASSERT (buf_p);
430
431	lit_utf8_byte_t c = buf_p[`0`];
432	if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
433	{
434	*code_point = (ecma_char_t) (c & LIT_UTF8_LAST_7_BITS_MASK);
435	return `1`;
436	}
437
438	lit_code_point_t ret = LIT_UNICODE_CODE_POINT_NULL;
439	lit_utf8_size_t bytes_count;
440	if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
441	{
442	bytes_count = `2`;
443	ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
444	}
445	else
446	{
447	JERRY_ASSERT ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER);
448	bytes_count = `3`;
449	ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
450	}
451
452	for (uint32_t i = `1`; i < bytes_count; ++i)
453	{
454	ret <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
455	ret \|= (buf_p[i] & LIT_UTF8_LAST_6_BITS_MASK);
456	}
457
458	JERRY_ASSERT (ret <= LIT_UTF16_CODE_UNIT_MAX);
459	*code_point = (ecma_char_t) ret;
460	return bytes_count;
461	} / lit_read_code_unit_from_utf8 /
462
463	/**
464	* Decodes a unicode code unit from non-empty cesu-8-encoded buffer
465	*
466	* @return number of bytes occupied by code point in the string
467	*/
468	lit_utf8_size_t
469	lit_read_prev_code_unit_from_utf8 (const lit_utf8_byte_t buf_p, /*< buffer with characters /*
470	ecma_char_t code_point) /*< [out] code point /*
471	{
472	JERRY_ASSERT (buf_p);
473
474	lit_utf8_decr (&buf_p);
475	return lit_read_code_unit_from_utf8 (buf_p, code_point);
476	} / lit_read_prev_code_unit_from_utf8 /
477
478	/**
479	* Decodes a unicode code unit from non-empty cesu-8-encoded buffer
480	*
481	* @return next code unit
482	*/
483	ecma_char_t
484	lit_cesu8_read_next (const lit_utf8_byte_t *buf_p) /*< [in,out] buffer with characters /*
485	{
486	JERRY_ASSERT (*buf_p);
487	ecma_char_t ch;
488
489	buf_p += lit_read_code_unit_from_utf8 (buf_p, &ch);
490
491	return ch;
492	} / lit_cesu8_read_next /
493
494	/**
495	* Decodes a unicode code unit from non-empty cesu-8-encoded buffer
496	*
497	* @return previous code unit
498	*/
499	ecma_char_t
500	lit_cesu8_read_prev (const lit_utf8_byte_t *buf_p) /*< [in,out] buffer with characters /*
501	{
502	JERRY_ASSERT (*buf_p);
503	ecma_char_t ch;
504
505	lit_utf8_decr (buf_p);
506	lit_read_code_unit_from_utf8 (*buf_p, &ch);
507
508	return ch;
509	} / lit_cesu8_read_prev /
510
511	/**
512	* Decodes a unicode code unit from non-empty cesu-8-encoded buffer
513	*
514	* @return next code unit
515	*/
516	ecma_char_t JERRY_ATTR_NOINLINE
517	lit_cesu8_peek_next (const lit_utf8_byte_t buf_p) /*< [in,out] buffer with characters /*
518	{
519	JERRY_ASSERT (buf_p != NULL);
520	ecma_char_t ch;
521
522	lit_read_code_unit_from_utf8 (buf_p, &ch);
523
524	return ch;
525	} / lit_cesu8_peek_next /
526
527	/**
528	* Decodes a unicode code unit from non-empty cesu-8-encoded buffer
529	*
530	* @return previous code unit
531	*/
532	ecma_char_t JERRY_ATTR_NOINLINE
533	lit_cesu8_peek_prev (const lit_utf8_byte_t buf_p) /*< [in,out] buffer with characters /*
534	{
535	JERRY_ASSERT (buf_p != NULL);
536	ecma_char_t ch;
537
538	lit_read_prev_code_unit_from_utf8 (buf_p, &ch);
539
540	return ch;
541	} / lit_cesu8_peek_prev /
542
543	/**
544	* Increase cesu-8 encoded string pointer by one code unit.
545	*/
546	extern inline void JERRY_ATTR_ALWAYS_INLINE
547	lit_utf8_incr (const lit_utf8_byte_t *buf_p) /*< [in,out] buffer with characters /*
548	{
549	JERRY_ASSERT (*buf_p);
550
551	buf_p += lit_get_unicode_char_size_by_utf8_first_byte (*buf_p);
552	} / lit_utf8_incr /
553
554	/**
555	* Decrease cesu-8 encoded string pointer by one code unit.
556	*/
557	void
558	lit_utf8_decr (const lit_utf8_byte_t *buf_p) /*< [in,out] buffer with characters /*
559	{
560	JERRY_ASSERT (*buf_p);
561	const lit_utf8_byte_t current_p = buf_p;
562
563	do
564	{
565	current_p--;
566	}
567	while ((*(current_p) & LIT_UTF8_EXTRA_BYTE_MASK) == LIT_UTF8_EXTRA_BYTE_MARKER);
568
569	*buf_p = current_p;
570	} / lit_utf8_decr /
571
572	/**
573	* Calc hash using the specified hash_basis.
574	*
575	* NOTE:
576	* This is implementation of FNV-1a hash function, which is released into public domain.
577	* Constants used, are carefully picked primes by the authors.
578	* More info: http://www.isthe.com/chongo/tech/comp/fnv/
579	*
580	* @return ecma-string's hash
581	*/
582	extern inline lit_string_hash_t JERRY_ATTR_ALWAYS_INLINE
583	lit_utf8_string_hash_combine (lit_string_hash_t hash_basis, /< hash to be combined with /*
584	const lit_utf8_byte_t utf8_buf_p, /*< characters buffer /*
585	lit_utf8_size_t utf8_buf_size) /< number of characters in the buffer /*
586	{
587	JERRY_ASSERT (utf8_buf_p != NULL \|\| utf8_buf_size == `0`);
588
589	uint32_t hash = hash_basis;
590
591	for (uint32_t i = `0`; i < utf8_buf_size; i++)
592	{
593	/ 16777619 is 32 bit FNV_prime = 2^24 + 2^8 + 0x93 = 16777619 /
594	hash = (hash ^ utf8_buf_p[i]) * `16777619`;
595	}
596
597	return (lit_string_hash_t) hash;
598	} / lit_utf8_string_hash_combine /
599
600	/**
601	* Calculate hash from the buffer.
602	*
603	* @return ecma-string's hash
604	*/
605	extern inline lit_string_hash_t JERRY_ATTR_ALWAYS_INLINE
606	lit_utf8_string_calc_hash (const lit_utf8_byte_t utf8_buf_p, /*< characters buffer /*
607	lit_utf8_size_t utf8_buf_size) /< number of characters in the buffer /*
608	{
609	JERRY_ASSERT (utf8_buf_p != NULL \|\| utf8_buf_size == `0`);
610
611	/ 32 bit offset_basis for FNV = 2166136261 /
612	return lit_utf8_string_hash_combine ((lit_string_hash_t) `2166136261`, utf8_buf_p, utf8_buf_size);
613	} / lit_utf8_string_calc_hash /
614
615	/**
616	* Return code unit at the specified position in string
617	*
618	* NOTE:
619	* code_unit_offset should be less then string's length
620	*
621	* @return code unit value
622	*/
623	ecma_char_t
624	lit_utf8_string_code_unit_at (const lit_utf8_byte_t utf8_buf_p, /*< utf-8 string /*
625	lit_utf8_size_t utf8_buf_size, /< string size in bytes /*
626	lit_utf8_size_t code_unit_offset) /< ofset of a code_unit /*
627	{
628	lit_utf8_byte_t current_p = (lit_utf8_byte_t ) utf8_buf_p;
629	ecma_char_t code_unit;
630
631	do
632	{
633	JERRY_ASSERT (current_p < utf8_buf_p + utf8_buf_size);
634	current_p += lit_read_code_unit_from_utf8 (current_p, &code_unit);
635	}
636	while (code_unit_offset--);
637
638	return code_unit;
639	} / lit_utf8_string_code_unit_at /
640
641	/**
642	* Get CESU-8 encoded size of character
643	*
644	* @return number of bytes occupied in CESU-8
645	*/
646	extern inline lit_utf8_size_t JERRY_ATTR_ALWAYS_INLINE
647	lit_get_unicode_char_size_by_utf8_first_byte (const lit_utf8_byte_t first_byte) /< buffer with characters /*
648	{
649	if ((first_byte & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
650	{
651	return `1`;
652	}
653	else if ((first_byte & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
654	{
655	return `2`;
656	}
657	else
658	{
659	JERRY_ASSERT ((first_byte & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER);
660	return `3`;
661	}
662	} / lit_get_unicode_char_size_by_utf8_first_byte /
663
664	/**
665	* Convert code unit to cesu-8 representation
666	*
667	* @return byte count required to represent the code unit
668	*/
669	lit_utf8_size_t
670	lit_code_unit_to_utf8 (ecma_char_t code_unit, /< code unit /*
671	lit_utf8_byte_t buf_p) /*< buffer where to store the result and its size
672	* should be at least LIT_UTF8_MAX_BYTES_IN_CODE_UNIT */
673	{
674	if (code_unit <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
675	{
676	buf_p[`0`] = (lit_utf8_byte_t) code_unit;
677	return `1`;
678	}
679	else if (code_unit <= LIT_UTF8_2_BYTE_CODE_POINT_MAX)
680	{
681	uint32_t code_unit_bits = code_unit;
682	lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK);
683	code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
684
685	lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_5_BITS_MASK);
686	JERRY_ASSERT (first_byte_bits == code_unit_bits);
687
688	buf_p[`0`] = LIT_UTF8_2_BYTE_MARKER \| first_byte_bits;
689	buf_p[`1`] = LIT_UTF8_EXTRA_BYTE_MARKER \| second_byte_bits;
690	return `2`;
691	}
692	else
693	{
694	uint32_t code_unit_bits = code_unit;
695	lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK);
696	code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
697
698	lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK);
699	code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
700
701	lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_4_BITS_MASK);
702	JERRY_ASSERT (first_byte_bits == code_unit_bits);
703
704	buf_p[`0`] = LIT_UTF8_3_BYTE_MARKER \| first_byte_bits;
705	buf_p[`1`] = LIT_UTF8_EXTRA_BYTE_MARKER \| second_byte_bits;
706	buf_p[`2`] = LIT_UTF8_EXTRA_BYTE_MARKER \| third_byte_bits;
707	return `3`;
708	}
709	} / lit_code_unit_to_utf8 /
710
711	/**
712	* Convert code point to cesu-8 representation
713	*
714	* @return byte count required to represent the code point
715	*/
716	lit_utf8_size_t
717	lit_code_point_to_cesu8 (lit_code_point_t code_point, /< code point /*
718	lit_utf8_byte_t buf) /*< buffer where to store the result,
719	* its size should be at least 6 bytes */
720	{
721	if (code_point <= LIT_UTF16_CODE_UNIT_MAX)
722	{
723	return lit_code_unit_to_utf8 ((ecma_char_t) code_point, buf);
724	}
725	else
726	{
727	lit_utf8_size_t offset = lit_code_unit_to_utf8 (convert_code_point_to_high_surrogate (code_point), buf);
728	offset += lit_code_unit_to_utf8 (convert_code_point_to_low_surrogate (code_point), buf + offset);
729	return offset;
730	}
731	} / lit_code_point_to_cesu8 /
732
733	/**
734	* Convert code point to utf-8 representation
735	*
736	* @return byte count required to represent the code point
737	*/
738	lit_utf8_size_t
739	lit_code_point_to_utf8 (lit_code_point_t code_point, /< code point /*
740	lit_utf8_byte_t buf) /*< buffer where to store the result,
741	* its size should be at least 4 bytes */
742	{
743	if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
744	{
745	buf[`0`] = (lit_utf8_byte_t) code_point;
746	return `1`;
747	}
748	else if (code_point <= LIT_UTF8_2_BYTE_CODE_POINT_MAX)
749	{
750	uint32_t code_point_bits = code_point;
751	lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
752	code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
753
754	lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_5_BITS_MASK);
755	JERRY_ASSERT (first_byte_bits == code_point_bits);
756
757	buf[`0`] = LIT_UTF8_2_BYTE_MARKER \| first_byte_bits;
758	buf[`1`] = LIT_UTF8_EXTRA_BYTE_MARKER \| second_byte_bits;
759	return `2`;
760	}
761	else if (code_point <= LIT_UTF8_3_BYTE_CODE_POINT_MAX)
762	{
763	uint32_t code_point_bits = code_point;
764	lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
765	code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
766
767	lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
768	code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
769
770	lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_4_BITS_MASK);
771	JERRY_ASSERT (first_byte_bits == code_point_bits);
772
773	buf[`0`] = LIT_UTF8_3_BYTE_MARKER \| first_byte_bits;
774	buf[`1`] = LIT_UTF8_EXTRA_BYTE_MARKER \| second_byte_bits;
775	buf[`2`] = LIT_UTF8_EXTRA_BYTE_MARKER \| third_byte_bits;
776	return `3`;
777	}
778	else
779	{
780	JERRY_ASSERT (code_point <= LIT_UTF8_4_BYTE_CODE_POINT_MAX);
781
782	uint32_t code_point_bits = code_point;
783	lit_utf8_byte_t fourth_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
784	code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
785
786	lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
787	code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
788
789	lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
790	code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
791
792	lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_3_BITS_MASK);
793	JERRY_ASSERT (first_byte_bits == code_point_bits);
794
795	buf[`0`] = LIT_UTF8_4_BYTE_MARKER \| first_byte_bits;
796	buf[`1`] = LIT_UTF8_EXTRA_BYTE_MARKER \| second_byte_bits;
797	buf[`2`] = LIT_UTF8_EXTRA_BYTE_MARKER \| third_byte_bits;
798	buf[`3`] = LIT_UTF8_EXTRA_BYTE_MARKER \| fourth_byte_bits;
799	return `4`;
800	}
801	} / lit_code_point_to_utf8 /
802
803	/**
804	* Convert cesu-8 string to an utf-8 string and put it into the buffer.
805	* It is the caller's responsibility to make sure that the string fits in the buffer.
806	*
807	* @return number of bytes copied to the buffer.
808	*/
809	lit_utf8_size_t
810	lit_convert_cesu8_string_to_utf8_string (const lit_utf8_byte_t cesu8_string, /*< cesu-8 string /*
811	lit_utf8_size_t cesu8_size, /< size of cesu-8 string /*
812	lit_utf8_byte_t utf8_string, /*< destination utf-8 buffer pointer
813	* (can be NULL if buffer_size == 0) */
814	lit_utf8_size_t utf8_size) /< size of utf-8 buffer /*
815	{
816	const lit_utf8_byte_t *cesu8_pos = cesu8_string;
817	const lit_utf8_byte_t *cesu8_end_pos = cesu8_string + cesu8_size;
818
819	lit_utf8_byte_t *utf8_pos = utf8_string;
820	lit_utf8_byte_t *utf8_end_pos = utf8_string + utf8_size;
821
822	lit_utf8_size_t size = `0`;
823
824	ecma_char_t prev_ch = `0`;
825	lit_utf8_size_t prev_ch_size = `0`;
826
827	while (cesu8_pos < cesu8_end_pos)
828	{
829	ecma_char_t ch;
830	lit_utf8_size_t code_unit_size = lit_read_code_unit_from_utf8 (cesu8_pos, &ch);
831
832	if (lit_is_code_point_utf16_low_surrogate (ch) && lit_is_code_point_utf16_high_surrogate (prev_ch))
833	{
834	JERRY_ASSERT (code_unit_size == prev_ch_size);
835	utf8_pos -= prev_ch_size;
836	lit_code_point_t code_point = lit_convert_surrogate_pair_to_code_point (prev_ch, ch);
837	lit_code_point_to_utf8 (code_point, utf8_pos);
838	size++;
839	}
840	else
841	{
842	memcpy (utf8_pos, cesu8_pos, code_unit_size);
843	size += code_unit_size;
844	}
845
846	utf8_pos = utf8_string + size;
847	cesu8_pos += code_unit_size;
848	prev_ch = ch;
849	prev_ch_size = code_unit_size;
850	}
851
852	JERRY_ASSERT (cesu8_pos == cesu8_end_pos);
853	JERRY_ASSERT (utf8_pos <= utf8_end_pos);
854
855	return size;
856	} / lit_convert_cesu8_string_to_utf8_string /
857
858	/**
859	* Convert surrogate pair to code point
860	*
861	* @return code point
862	*/
863	lit_code_point_t
864	lit_convert_surrogate_pair_to_code_point (ecma_char_t high_surrogate, /< high surrogate code point /*
865	ecma_char_t low_surrogate) /< low surrogate code point /*
866	{
867	JERRY_ASSERT (lit_is_code_point_utf16_high_surrogate (high_surrogate));
868	JERRY_ASSERT (lit_is_code_point_utf16_low_surrogate (low_surrogate));
869
870	lit_code_point_t code_point;
871	code_point = (uint16_t) (high_surrogate - LIT_UTF16_HIGH_SURROGATE_MIN);
872	code_point <<= LIT_UTF16_BITS_IN_SURROGATE;
873
874	code_point += LIT_UTF16_FIRST_SURROGATE_CODE_POINT;
875
876	code_point \|= (uint16_t) (low_surrogate - LIT_UTF16_LOW_SURROGATE_MIN);
877	return code_point;
878	} / lit_convert_surrogate_pair_to_code_point /
879
880	/**
881	* Relational compare of cesu-8 strings
882	*
883	* First string is less than second string if:
884	* - strings are not equal;
885	* - first string is prefix of second or is lexicographically less than second.
886	*
887	* @return true - if first string is less than second string,
888	* false - otherwise
889	*/
890	bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t string1_p, /*< utf-8 string /*
891	lit_utf8_size_t string1_size, /< string size /*
892	const lit_utf8_byte_t string2_p, /*< utf-8 string /*
893	lit_utf8_size_t string2_size) /< string size /*
894	{
895	lit_utf8_byte_t string1_pos = (lit_utf8_byte_t ) string1_p;
896	lit_utf8_byte_t string2_pos = (lit_utf8_byte_t ) string2_p;
897	const lit_utf8_byte_t *string1_end_p = string1_p + string1_size;
898	const lit_utf8_byte_t *string2_end_p = string2_p + string2_size;
899
900	while (string1_pos < string1_end_p && string2_pos < string2_end_p)
901	{
902	ecma_char_t ch1, ch2;
903	string1_pos += lit_read_code_unit_from_utf8 (string1_pos, &ch1);
904	string2_pos += lit_read_code_unit_from_utf8 (string2_pos, &ch2);
905
906	if (ch1 < ch2)
907	{
908	return true;
909	}
910	else if (ch1 > ch2)
911	{
912	return false;
913	}
914	}
915
916	return (string1_pos >= string1_end_p && string2_pos < string2_end_p);
917	} / lit_compare_utf8_strings_relational /
918

Browse the source code of jerryscript/jerry-core/lit/lit-strings.c