1 | /* Copyright JS Foundation and other contributors, http://js.foundation |
2 | * |
3 | * Licensed under the Apache License, Version 2.0 (the "License"); |
4 | * you may not use this file except in compliance with the License. |
5 | * You may obtain a copy of the License at |
6 | * |
7 | * http://www.apache.org/licenses/LICENSE-2.0 |
8 | * |
9 | * Unless required by applicable law or agreed to in writing, software |
10 | * distributed under the License is distributed on an "AS IS" BASIS |
11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | * See the License for the specific language governing permissions and |
13 | * limitations under the License. |
14 | */ |
15 | |
16 | #ifndef LIT_GLOBALS_H |
17 | #define LIT_GLOBALS_H |
18 | |
19 | #include "jrt.h" |
20 | |
21 | /** |
 * The ECMAScript standard defines the terms "code unit" and "character" as a 16-bit unsigned value
 * used to represent a 16-bit unit of text; this is the same as a code unit in UTF-16 (see ECMA-262 5.1 Chapter 6).
24 | * |
 * The term "code point" or "Unicode character" is used to refer to a single Unicode scalar value (may be longer
 * than 16 bits: 0x0 - 0x10FFFF). One code point could be represented with one or two 16-bit code units.
27 | * |
28 | * According to the standard all strings and source text are assumed to be a sequence of code units. |
29 | * Length of a string equals to number of code units in the string, which is not the same as number of Unicode |
30 | * characters in a string. |
31 | * |
32 | * Internally JerryScript engine uses UTF-8 representation of strings to reduce memory overhead. Unicode character |
33 | * occupies from one to four bytes in UTF-8 representation. |
34 | * |
35 | * Unicode scalar value | Bytes in UTF-8 | Bytes in UTF-16 |
36 | * | (internal representation) | |
37 | * ---------------------------------------------------------------------- |
38 | * 0x0 - 0x7F | 1 byte | 2 bytes |
39 | * 0x80 - 0x7FF | 2 bytes | 2 bytes |
40 | * 0x800 - 0xFFFF | 3 bytes | 2 bytes |
41 | * 0x10000 - 0x10FFFF | 4 bytes | 4 bytes |
42 | * |
43 | * Scalar values from 0xD800 to 0xDFFF are permanently reserved by Unicode standard to encode high and low |
44 | * surrogates in UTF-16 (Code points 0x10000 - 0x10FFFF are encoded via pair of surrogates in UTF-16). |
45 | * Despite that the official Unicode standard says that no UTF forms can encode these code points, we allow |
46 | * them to be encoded inside strings. The reason for that is compatibility with ECMA standard. |
47 | * |
 * For example, assume a string which consists of one Unicode character: 0x1D700 (Mathematical Italic Small Epsilon).
49 | * It has the following representation in UTF-16: 0xD835 0xDF00. |
50 | * |
51 | * ECMA standard allows extracting a substring from this string: |
52 | * > var str = String.fromCharCode (0xD835, 0xDF00); // Create a string containing one character: 0x1D700 |
53 | * > str.length; // 2 |
54 | * > var str1 = str.substring (0, 1); |
55 | * > str1.length; // 1 |
56 | * > str1.charCodeAt (0); // 55349 (this equals to 0xD835) |
57 | * |
58 | * Internally original string would be represented in UTF-8 as the following byte sequence: 0xF0 0x9D 0x9C 0x80. |
59 | * After substring extraction high surrogate 0xD835 should be encoded via UTF-8: 0xED 0xA0 0xB5. |
60 | * |
 * A pair of high and low surrogates encoded separately should never occur in the internal string representation;
 * such a pair should be encoded as a single code point and occupy 4 bytes. So, when a string is constructed from
 * two surrogates, the pair should be processed gracefully:
64 | * > var str1 = String.fromCharCode (0xD835); // 0xED 0xA0 0xB5 - internal representation |
65 | * > var str2 = String.fromCharCode (0xDF00); // 0xED 0xBC 0x80 - internal representation |
66 | * > var str = str1 + str2; // 0xF0 0x9D 0x9C 0x80 - internal representation, |
67 | * // !!! not 0xED 0xA0 0xB5 0xED 0xBC 0x80 |
68 | */ |
69 | |
/**
 * ECMAScript character type: a single 16-bit code unit, i.e. one UTF-16
 * character as described in ECMA-262 5.1, Chapter 6.
 *
 * Note: a value of this type is a code unit, not necessarily a whole
 *       Unicode code point.
 */
typedef uint16_t ecma_char_t;
75 | |
/**
 * Upper bound on the number of utf-8 bytes produced by one code unit
 * (a 16-bit utf-16 character): values up to 0xFFFF need at most 3 bytes.
 */
#define LIT_UTF8_MAX_BYTES_IN_CODE_UNIT (3)
80 | |
/**
 * Upper bound on the number of utf-8 bytes produced by one code point
 * (Unicode character): values up to 0x10FFFF need at most 4 bytes.
 */
#define LIT_UTF8_MAX_BYTES_IN_CODE_POINT (4)
85 | |
/**
 * Upper bound on the number of cesu-8 bytes produced by one code unit
 * (a 16-bit utf-16 character).
 */
#define LIT_CESU8_MAX_BYTES_IN_CODE_UNIT (3)
90 | |
/**
 * Upper bound on the number of cesu-8 bytes produced by one code point
 * (Unicode character): a code point above 0xFFFF is stored as two
 * surrogate code units of up to 3 bytes each.
 */
#define LIT_CESU8_MAX_BYTES_IN_CODE_POINT (6)
95 | |
/**
 * One byte (8-bit unsigned storage unit) of a utf-8 encoded string
 */
typedef uint8_t lit_utf8_byte_t;
100 | |
/**
 * Byte count of a utf-8 encoded string (32-bit unsigned)
 */
typedef uint32_t lit_utf8_size_t;
105 | |
/**
 * Byte count of a magic string (8-bit unsigned, so at most 255)
 */
typedef uint8_t lit_magic_size_t;
110 | |
/**
 * A Unicode code point (32-bit unsigned, wide enough for values
 * above the 16-bit code unit range)
 */
typedef uint32_t lit_code_point_t;
115 | |
/**
 * Hash value of an ECMA string (32-bit unsigned)
 */
typedef uint32_t lit_string_hash_t;
120 | |
121 | #endif /* !LIT_GLOBALS_H */ |
122 | |