1 | /* Copyright JS Foundation and other contributors, http://js.foundation |
2 | * |
3 | * Licensed under the Apache License, Version 2.0 (the "License"); |
4 | * you may not use this file except in compliance with the License. |
5 | * You may obtain a copy of the License at |
6 | * |
7 | * http://www.apache.org/licenses/LICENSE-2.0 |
8 | * |
9 | * Unless required by applicable law or agreed to in writing, software |
10 | * distributed under the License is distributed on an "AS IS" BASIS |
11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | * See the License for the specific language governing permissions and |
13 | * limitations under the License. |
14 | */ |
15 | |
16 | #ifndef LIT_STRINGS_H |
17 | #define LIT_STRINGS_H |
18 | |
19 | #include "jrt.h" |
20 | #include "lit-globals.h" |
21 | |
22 | /** |
23 | * Null byte, value 0 (used in a few cases as a UTF-8 string end marker) |
24 | */ |
25 | #define LIT_BYTE_NULL (0) |
26 | |
27 | /** |
28 | * For the formal definition of Unicode transformation formats (UTF) see Section 3.9, Unicode Encoding Forms in The |
29 | * Unicode Standard (http://www.unicode.org/versions/Unicode3.0.0/ch03.pdf#G7404). |
30 | */ |
31 | #define LIT_UNICODE_CODE_POINT_NULL (0x0) /* code point U+0000 */ |
32 | #define LIT_UNICODE_CODE_POINT_MAX (0x10FFFF) /* highest Unicode code point, U+10FFFF */ |
33 | |
34 | #define LIT_UTF16_CODE_UNIT_MAX (0xFFFF) /* all 16 bits of a code unit set */ |
35 | #define LIT_UTF16_FIRST_SURROGATE_CODE_POINT (0x10000) /* LIT_UTF16_CODE_UNIT_MAX + 1: first code point that needs a surrogate pair in UTF-16 */ |
36 | #define LIT_UTF16_LOW_SURROGATE_MARKER (0xDC00) /* same value as LIT_UTF16_LOW_SURROGATE_MIN */ |
37 | #define LIT_UTF16_HIGH_SURROGATE_MARKER (0xD800) /* same value as LIT_UTF16_HIGH_SURROGATE_MIN */ |
38 | #define LIT_UTF16_HIGH_SURROGATE_MIN (0xD800) /* high surrogates: 0xD800..0xDBFF (0x400 values) */ |
39 | #define LIT_UTF16_HIGH_SURROGATE_MAX (0xDBFF) |
40 | #define LIT_UTF16_LOW_SURROGATE_MIN (0xDC00) /* low surrogates: 0xDC00..0xDFFF (0x400 values) */ |
41 | #define LIT_UTF16_LOW_SURROGATE_MAX (0xDFFF) |
42 | #define LIT_UTF16_BITS_IN_SURROGATE (10) /* each surrogate range spans 0x400 = 2^10 values */ |
43 | #define LIT_UTF16_LAST_10_BITS_MASK (0x3FF) /* (1 << 10) - 1 */ |
44 | |
45 | #define LIT_UTF8_1_BYTE_MARKER (0x00) /* fixed high bits of a UTF-8 byte, by role; lead byte of a 1-byte sequence: 0xxxxxxx */ |
46 | #define LIT_UTF8_2_BYTE_MARKER (0xC0) /* lead byte of a 2-byte sequence: 110xxxxx */ |
47 | #define LIT_UTF8_3_BYTE_MARKER (0xE0) /* lead byte of a 3-byte sequence: 1110xxxx */ |
48 | #define LIT_UTF8_4_BYTE_MARKER (0xF0) /* lead byte of a 4-byte sequence: 11110xxx */ |
49 | #define LIT_UTF8_EXTRA_BYTE_MARKER (0x80) /* continuation (non-lead) byte: 10xxxxxx */ |
50 | |
51 | #define LIT_UTF8_1_BYTE_MASK (0x80) /* 1000 0000; per the UTF-8 layout, (byte & LIT_UTF8_N_BYTE_MASK) == LIT_UTF8_N_BYTE_MARKER identifies the lead byte of an N-byte sequence */ |
52 | #define LIT_UTF8_2_BYTE_MASK (0xE0) /* 1110 0000 */ |
53 | #define LIT_UTF8_3_BYTE_MASK (0xF0) /* 1111 0000 */ |
54 | #define LIT_UTF8_4_BYTE_MASK (0xF8) /* 1111 1000 */ |
55 | #define LIT_UTF8_EXTRA_BYTE_MASK (0xC0) /* 1100 0000: selects the two high bits that tag a continuation byte (10xxxxxx, i.e. masked value 0x80) */ |
56 | |
57 | #define LIT_UTF8_LAST_7_BITS_MASK (0x7F) /* 0111 1111; each mask in this group keeps only the N lowest bits of a byte */ |
58 | #define LIT_UTF8_LAST_6_BITS_MASK (0x3F) /* 0011 1111 */ |
59 | #define LIT_UTF8_LAST_5_BITS_MASK (0x1F) /* 0001 1111 */ |
60 | #define LIT_UTF8_LAST_4_BITS_MASK (0x0F) /* 0000 1111 */ |
61 | #define LIT_UTF8_LAST_3_BITS_MASK (0x07) /* 0000 0111 */ |
62 | #define LIT_UTF8_LAST_2_BITS_MASK (0x03) /* 0000 0011 */ |
63 | #define LIT_UTF8_LAST_1_BIT_MASK (0x01) /* 0000 0001 */ |
64 | |
65 | #define LIT_UTF8_BITS_IN_EXTRA_BYTES (6) /* payload bits carried by each continuation byte (10xxxxxx); matches the width of LIT_UTF8_LAST_6_BITS_MASK */ |
66 | |
67 | #define LIT_UTF8_1_BYTE_CODE_POINT_MAX (0x7F) /* code point range covered by each UTF-8 sequence length; 1 byte: up to 0x7F */ |
68 | #define LIT_UTF8_2_BYTE_CODE_POINT_MIN (0x80) /* 2 bytes: 0x80..0x7FF */ |
69 | #define LIT_UTF8_2_BYTE_CODE_POINT_MAX (0x7FF) |
70 | #define LIT_UTF8_3_BYTE_CODE_POINT_MIN (0x800) /* 3 bytes: 0x800..0xFFFF */ |
71 | #define LIT_UTF8_3_BYTE_CODE_POINT_MAX (LIT_UTF16_CODE_UNIT_MAX) /* expands to 0xFFFF */ |
72 | #define LIT_UTF8_4_BYTE_CODE_POINT_MIN (0x10000) /* 4 bytes: 0x10000..0x10FFFF */ |
73 | #define LIT_UTF8_4_BYTE_CODE_POINT_MAX (LIT_UNICODE_CODE_POINT_MAX) /* expands to 0x10FFFF */ |
74 | |
75 | /** |
76 | * Difference between the byte count needed to represent a code point greater than 0xFFFF |
77 | * in common UTF-8 (4 bytes required) and CESU-8 (6 bytes required). NOTE(review): both operand macros are defined outside this header (presumably lit-globals.h, with values 3 and 4) - confirm. |
78 | */ |
79 | #define LIT_UTF8_CESU8_SURROGATE_SIZE_DIF (2 * LIT_UTF8_MAX_BYTES_IN_CODE_UNIT - LIT_UTF8_MAX_BYTES_IN_CODE_POINT) |
80 | |
81 | /** |
82 | * Byte values >= LIT_UTF8_FIRST_BYTE_MAX are not allowed in internal strings |
83 | */ |
84 | #define LIT_UTF8_FIRST_BYTE_MAX (0xF8) /* 1111 1000; same value as LIT_UTF8_4_BYTE_MASK */ |
85 | |
86 | /* validation: each predicate takes a byte buffer and its size (presumably in bytes; true presumably means well-formed - confirm in the implementation) */ |
87 | bool lit_is_valid_utf8_string (const lit_utf8_byte_t *utf8_buf_p, lit_utf8_size_t buf_size); |
88 | bool lit_is_valid_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, lit_utf8_size_t buf_size); |
89 | |
90 | /* checks: surrogate classification of a single code point (presumably against the LIT_UTF16_*_SURROGATE_MIN/MAX ranges - confirm in the implementation) */ |
91 | bool lit_is_code_point_utf16_low_surrogate (lit_code_point_t code_point); |
92 | bool lit_is_code_point_utf16_high_surrogate (lit_code_point_t code_point); |
93 | |
94 | /* size: size queries returning lit_utf8_size_t; lit_zt_utf8_string_size takes no size argument, so its input is presumably zero-terminated ("zt") - TODO confirm */ |
95 | lit_utf8_size_t lit_zt_utf8_string_size (const lit_utf8_byte_t *utf8_str_p); |
96 | lit_utf8_size_t lit_get_utf8_size_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, lit_utf8_size_t cesu8_buf_size); |
97 | |
98 | /* length: counts computed over a byte buffer of the given size (presumably characters rather than bytes - TODO confirm the unit) */ |
99 | lit_utf8_size_t lit_utf8_string_length (const lit_utf8_byte_t *utf8_buf_p, lit_utf8_size_t utf8_buf_size); |
100 | lit_utf8_size_t lit_get_utf8_length_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, |
101 | lit_utf8_size_t cesu8_buf_size); |
102 | |
103 | /* hash: hash of a byte buffer of the given size; the _combine variant additionally takes hash_basis (presumably a previous hash to continue from - TODO confirm) */ |
104 | lit_string_hash_t lit_utf8_string_calc_hash (const lit_utf8_byte_t *utf8_buf_p, lit_utf8_size_t utf8_buf_size); |
105 | lit_string_hash_t lit_utf8_string_hash_combine (lit_string_hash_t hash_basis, const lit_utf8_byte_t *utf8_buf_p, |
106 | lit_utf8_size_t utf8_buf_size); |
107 | |
108 | /* code unit access: code_unit_offset presumably indexes code units, not bytes (TODO confirm); the second helper receives only the first byte of a sequence, by value */ |
109 | ecma_char_t lit_utf8_string_code_unit_at (const lit_utf8_byte_t *utf8_buf_p, lit_utf8_size_t utf8_buf_size, |
110 | lit_utf8_size_t code_unit_offset); |
111 | lit_utf8_size_t lit_get_unicode_char_size_by_utf8_first_byte (const lit_utf8_byte_t first_byte); |
112 | |
113 | /* conversion: the encoders take a non-const output buffer and return lit_utf8_size_t (presumably the number of bytes written; required buffer capacity is not visible here - confirm) */ |
114 | lit_utf8_size_t lit_code_unit_to_utf8 (ecma_char_t code_unit, lit_utf8_byte_t *buf_p); |
115 | lit_utf8_size_t lit_code_point_to_utf8 (lit_code_point_t code_point, lit_utf8_byte_t *buf); |
116 | lit_utf8_size_t lit_code_point_to_cesu8 (lit_code_point_t code_point, lit_utf8_byte_t *buf); |
117 | lit_utf8_size_t lit_convert_cesu8_string_to_utf8_string (const lit_utf8_byte_t *cesu8_string, |
118 | lit_utf8_size_t cesu8_size, |
119 | lit_utf8_byte_t *utf8_string, |
120 | lit_utf8_size_t utf8_size); |
121 | lit_code_point_t lit_convert_surrogate_pair_to_code_point (ecma_char_t high_surrogate, ecma_char_t low_surrogate); /* takes the pair as two separate code units, high first */ |
122 | |
123 | bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, lit_utf8_size_t string1_size, |
124 | const lit_utf8_byte_t *string2_p, lit_utf8_size_t string2_size); /* NOTE(review): which ordering of string1/string2 yields true is not visible here - confirm */ |
125 | |
126 | uint8_t lit_utf16_encode_code_point (lit_code_point_t cp, ecma_char_t *cu_p); /* presumably returns the number of code units stored at cu_p - TODO confirm */ |
127 | |
128 | /* read code point from buffer; NOTE(review): the two *_code_unit_* readers name their output "code_point" although its type is ecma_char_t, the code unit type used elsewhere in this header */ |
129 | lit_utf8_size_t lit_read_code_point_from_utf8 (const lit_utf8_byte_t *buf_p, lit_utf8_size_t buf_size, |
130 | lit_code_point_t *code_point); |
131 | |
132 | lit_utf8_size_t lit_read_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, |
133 | ecma_char_t *code_point); /* no buffer size parameter: the caller presumably guarantees a complete sequence - confirm */ |
134 | |
135 | lit_utf8_size_t lit_read_prev_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, |
136 | ecma_char_t *code_point); /* no buffer size parameter here either */ |
137 | |
138 | ecma_char_t lit_cesu8_read_next (const lit_utf8_byte_t **buf_p); /* pointer-to-pointer argument: presumably advances *buf_p past the code unit it returns - confirm */ |
139 | ecma_char_t lit_cesu8_read_prev (const lit_utf8_byte_t **buf_p); /* pointer-to-pointer argument: presumably moves *buf_p backwards - confirm */ |
140 | ecma_char_t lit_cesu8_peek_next (const lit_utf8_byte_t *buf_p); /* plain pointer passed by value: cannot move the caller's cursor */ |
141 | ecma_char_t lit_cesu8_peek_prev (const lit_utf8_byte_t *buf_p); /* plain pointer passed by value: cannot move the caller's cursor */ |
142 | void lit_utf8_incr (const lit_utf8_byte_t **buf_p); /* returns nothing; presumably steps *buf_p forward by one character - confirm */ |
143 | void lit_utf8_decr (const lit_utf8_byte_t **buf_p); /* returns nothing; presumably steps *buf_p back by one character - confirm */ |
144 | |
145 | #endif /* !LIT_STRINGS_H */ |
146 | |