lit-globals.h source code [jerryscript/jerry-core/lit/lit-globals.h]

1	/* Copyright JS Foundation and other contributors, http://js.foundation
2	*
3	* Licensed under the Apache License, Version 2.0 (the "License");
4	* you may not use this file except in compliance with the License.
5	* You may obtain a copy of the License at
6	*
7	* http://www.apache.org/licenses/LICENSE-2.0
8	*
9	* Unless required by applicable law or agreed to in writing, software
10	* distributed under the License is distributed on an "AS IS" BASIS
11	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12	* See the License for the specific language governing permissions and
13	* limitations under the License.
14	*/
15
16	#ifndef LIT_GLOBALS_H
17	#define LIT_GLOBALS_H
18
19	#include "jrt.h"
20
21	/**
22	* ECMAScript standard defines terms "code unit" and "character" as 16-bit unsigned value
23	* used to represent 16-bit unit of text, this is the same as code unit in UTF-16 (See ECMA-262 5.1 Chapter 6).
24	*
25	* The term "code point" or "Unicode character" is used to refer to a single Unicode scalar value (may be longer
26	* than 16 bits: 0x0 - 0x10FFFF). One code point could be represented with one or two 16-bit code units.
27	*
28	* According to the standard, all strings and source text are assumed to be a sequence of code units.
29	* The length of a string equals the number of code units in the string, which is not the same as the number
30	* of Unicode characters in the string.
31	*
32	* Internally JerryScript engine uses UTF-8 representation of strings to reduce memory overhead. Unicode character
33	* occupies from one to four bytes in UTF-8 representation.
34	*
35	* Unicode scalar value | Bytes in UTF-8            | Bytes in UTF-16
36	*                      | (internal representation) |
37	* ----------------------------------------------------------------------
38	* 0x0     - 0x7F       | 1 byte                    | 2 bytes
39	* 0x80    - 0x7FF      | 2 bytes                   | 2 bytes
40	* 0x800   - 0xFFFF     | 3 bytes                   | 2 bytes
41	* 0x10000 - 0x10FFFF   | 4 bytes                   | 4 bytes
42	*
43	* Scalar values from 0xD800 to 0xDFFF are permanently reserved by Unicode standard to encode high and low
44	* surrogates in UTF-16 (Code points 0x10000 - 0x10FFFF are encoded via pair of surrogates in UTF-16).
45	* Although the official Unicode standard says that no UTF form can encode these code points, we allow
46	* them to be encoded inside strings. The reason for that is compatibility with the ECMA standard.
47	*
48	* For example, assume a string which consists of one Unicode character: 0x1D700 (Mathematical Italic Small Epsilon).
49	* It has the following representation in UTF-16: 0xD835 0xDF00.
50	*
51	* ECMA standard allows extracting a substring from this string:
52	* > var str = String.fromCharCode (0xD835, 0xDF00); // Create a string containing one character: 0x1D700
53	* > str.length; // 2
54	* > var str1 = str.substring (0, 1);
55	* > str1.length; // 1
56	* > str1.charCodeAt (0); // 55349 (this equals to 0xD835)
57	*
58	* Internally original string would be represented in UTF-8 as the following byte sequence: 0xF0 0x9D 0x9C 0x80.
59	* After substring extraction high surrogate 0xD835 should be encoded via UTF-8: 0xED 0xA0 0xB5.
60	*
61	* A pair of low and high surrogates encoded separately should never occur in the internal string representation;
62	* it should be encoded like any other code point and occupy 4 bytes. So, when constructing a string from two surrogates,
63	* it should be processed gracefully:
64	* > var str1 = String.fromCharCode (0xD835); // 0xED 0xA0 0xB5 - internal representation
65	* > var str2 = String.fromCharCode (0xDF00); // 0xED 0xBC 0x80 - internal representation
66	* > var str = str1 + str2; // 0xF0 0x9D 0x9C 0x80 - internal representation,
67	* // !!! not 0xED 0xA0 0xB5 0xED 0xBC 0x80
68	*/
69
70	/**
71	* Description of an ecma-character, which represents an unsigned 16-bit code unit,
72	* which is equal to a UTF-16 code unit (see Chapter 6 from ECMA-262 5.1)
73	*/
74	typedef uint16_t ecma_char_t;
75
76	/**
77	* Max bytes needed to represent a code unit (utf-16 char, at most 0xFFFF) via utf-8 encoding
78	*/
79	#define LIT_UTF8_MAX_BYTES_IN_CODE_UNIT (3)
80
81	/**
82	* Max bytes needed to represent a code point (Unicode character, at most 0x10FFFF) via utf-8 encoding
83	*/
84	#define LIT_UTF8_MAX_BYTES_IN_CODE_POINT (4)
85
86	/**
87	* Max bytes needed to represent a code unit (utf-16 char) via cesu-8 encoding (same limit as for utf-8)
88	*/
89	#define LIT_CESU8_MAX_BYTES_IN_CODE_UNIT (3)
90
91	/**
92	* Max bytes needed to represent a code point via cesu-8 encoding (surrogate pair: two 3-byte code units)
93	*/
94	#define LIT_CESU8_MAX_BYTES_IN_CODE_POINT (6)
95
96	/**
97	* A single byte (unsigned 8-bit unit) of a utf-8 encoded string
98	*/
99	typedef uint8_t lit_utf8_byte_t;
100
101	/**
102	* Size of a utf-8 string in bytes (unsigned 32-bit value)
103	*/
104	typedef uint32_t lit_utf8_size_t;
105
106	/**
107	* Size of a magic string in bytes (unsigned 8-bit value, so at most 255)
108	*/
109	typedef uint8_t lit_magic_size_t;
110
111	/**
112	* Unicode code point (32 bits wide, since a code point may not fit into a 16-bit code unit)
113	*/
114	typedef uint32_t lit_code_point_t;
115
116	/**
117	* Hash value of an ECMA string (unsigned 32-bit value)
118	*/
119	typedef uint32_t lit_string_hash_t;
120
121	#endif /* !LIT_GLOBALS_H */
122

Browse the source code of jerryscript/jerry-core/lit/lit-globals.h