1 | // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
2 | // for details. All rights reserved. Use of this source code is governed by a |
3 | // BSD-style license that can be found in the LICENSE file. |
4 | |
5 | #ifndef RUNTIME_PLATFORM_UNICODE_H_ |
6 | #define RUNTIME_PLATFORM_UNICODE_H_ |
7 | |
8 | #include "platform/allocation.h" |
9 | #include "platform/globals.h" |
10 | #include "platform/unaligned.h" |
11 | |
12 | namespace dart { |
13 | |
14 | class String; |
15 | |
16 | class Utf : AllStatic { |
17 | public: |
18 | static const int32_t kMaxCodePoint = 0x10FFFF; |
19 | static const int32_t kInvalidChar = 0xFFFFFFFF; |
20 | |
21 | static const int32_t kReplacementChar = 0xFFFD; |
22 | |
23 | static bool IsLatin1(int32_t code_point) { |
24 | return (code_point >= 0) && (code_point <= 0xFF); |
25 | } |
26 | |
27 | static bool IsBmp(int32_t code_point) { |
28 | return (code_point >= 0) && (code_point <= 0xFFFF); |
29 | } |
30 | |
31 | static bool IsSupplementary(int32_t code_point) { |
32 | return (code_point > 0xFFFF) && (code_point <= kMaxCodePoint); |
33 | } |
34 | |
35 | // Returns true if the code point value is above Plane 17. |
36 | static bool IsOutOfRange(int32_t code_point) { |
37 | return (code_point < 0) || (code_point > kMaxCodePoint); |
38 | } |
39 | }; |
40 | |
41 | class Utf8 : AllStatic { |
42 | public: |
43 | enum Type { |
44 | kLatin1 = 0, // Latin-1 code point [U+0000, U+00FF]. |
45 | kBMP, // Basic Multilingual Plane code point [U+0000, U+FFFF]. |
46 | kSupplementary, // Supplementary code point [U+010000, U+10FFFF]. |
47 | }; |
48 | |
49 | // Returns the most restricted coding form in which the sequence of utf8 |
50 | // characters in 'utf8_array' can be represented in, and the number of |
51 | // code units needed in that form. |
52 | static intptr_t CodeUnitCount(const uint8_t* utf8_array, |
53 | intptr_t array_len, |
54 | Type* type); |
55 | |
56 | // Returns true if 'utf8_array' is a valid UTF-8 string. |
57 | static bool IsValid(const uint8_t* utf8_array, intptr_t array_len); |
58 | |
59 | static intptr_t Length(int32_t ch); |
60 | static intptr_t Length(const String& str); |
61 | |
62 | static intptr_t Encode(int32_t ch, char* dst); |
63 | static intptr_t Encode(const String& src, char* dst, intptr_t len); |
64 | |
65 | static intptr_t Decode(const uint8_t* utf8_array, |
66 | intptr_t array_len, |
67 | int32_t* ch); |
68 | |
69 | static bool DecodeToLatin1(const uint8_t* utf8_array, |
70 | intptr_t array_len, |
71 | uint8_t* dst, |
72 | intptr_t len); |
73 | static bool DecodeToUTF16(const uint8_t* utf8_array, |
74 | intptr_t array_len, |
75 | uint16_t* dst, |
76 | intptr_t len); |
77 | static bool DecodeToUTF32(const uint8_t* utf8_array, |
78 | intptr_t array_len, |
79 | int32_t* dst, |
80 | intptr_t len); |
81 | static intptr_t ReportInvalidByte(const uint8_t* utf8_array, |
82 | intptr_t array_len, |
83 | intptr_t len); |
84 | static bool DecodeCStringToUTF32(const char* str, int32_t* dst, intptr_t len); |
85 | |
86 | static const int32_t kMaxOneByteChar = 0x7F; |
87 | static const int32_t kMaxTwoByteChar = 0x7FF; |
88 | static const int32_t kMaxThreeByteChar = 0xFFFF; |
89 | static const int32_t kMaxFourByteChar = Utf::kMaxCodePoint; |
90 | |
91 | private: |
92 | static bool IsTrailByte(uint8_t code_unit) { |
93 | return (code_unit & 0xC0) == 0x80; |
94 | } |
95 | |
96 | static bool IsNonShortestForm(uint32_t code_point, size_t num_code_units) { |
97 | return code_point < kOverlongMinimum[num_code_units]; |
98 | } |
99 | |
100 | static bool IsLatin1SequenceStart(uint8_t code_unit) { |
101 | // Check if utf8 sequence is the start of a codepoint <= U+00FF |
102 | return (code_unit <= 0xC3); |
103 | } |
104 | |
105 | static bool IsSupplementarySequenceStart(uint8_t code_unit) { |
106 | // Check if utf8 sequence is the start of a codepoint >= U+10000. |
107 | return (code_unit >= 0xF0); |
108 | } |
109 | |
110 | static const int8_t kTrailBytes[]; |
111 | static const uint32_t kMagicBits[]; |
112 | static const uint32_t kOverlongMinimum[]; |
113 | }; |
114 | |
115 | class Utf16 : AllStatic { |
116 | public: |
117 | // Returns the length of the code point in UTF-16 code units. |
118 | static intptr_t Length(int32_t ch) { |
119 | return (ch <= Utf16::kMaxCodeUnit) ? 1 : 2; |
120 | } |
121 | |
122 | // Returns true if ch is a lead or trail surrogate. |
123 | static bool IsSurrogate(uint32_t ch) { return (ch & 0xFFFFF800) == 0xD800; } |
124 | |
125 | // Returns true if ch is a lead surrogate. |
126 | static bool IsLeadSurrogate(uint32_t ch) { |
127 | return (ch & 0xFFFFFC00) == 0xD800; |
128 | } |
129 | |
130 | // Returns true if ch is a low surrogate. |
131 | static bool IsTrailSurrogate(uint32_t ch) { |
132 | return (ch & 0xFFFFFC00) == 0xDC00; |
133 | } |
134 | |
135 | // Returns the character at i and advances i to the next character |
136 | // boundary. |
137 | static int32_t Next(const uint16_t* characters, intptr_t* i, intptr_t len) { |
138 | int32_t ch = LoadUnaligned(&characters[*i]); |
139 | if (Utf16::IsLeadSurrogate(ch) && (*i < (len - 1))) { |
140 | int32_t ch2 = LoadUnaligned(&characters[*i + 1]); |
141 | if (Utf16::IsTrailSurrogate(ch2)) { |
142 | ch = Utf16::Decode(ch, ch2); |
143 | *i += 1; |
144 | } |
145 | } |
146 | *i += 1; |
147 | return ch; |
148 | } |
149 | |
150 | // Decodes a surrogate pair into a supplementary code point. |
151 | static int32_t Decode(uint16_t lead, uint16_t trail) { |
152 | return 0x10000 + ((lead & 0x000003FF) << 10) + (trail & 0x3FF); |
153 | } |
154 | |
155 | // Encodes a single code point. |
156 | static void Encode(int32_t codepoint, uint16_t* dst); |
157 | |
158 | static const int32_t kMaxCodeUnit = 0xFFFF; |
159 | static const int32_t kLeadSurrogateStart = 0xD800; |
160 | static const int32_t kLeadSurrogateEnd = 0xDBFF; |
161 | static const int32_t kTrailSurrogateStart = 0xDC00; |
162 | static const int32_t kTrailSurrogateEnd = 0xDFFF; |
163 | |
164 | private: |
165 | static const int32_t kLeadSurrogateOffset = (0xD800 - (0x10000 >> 10)); |
166 | |
167 | static const int32_t kSurrogateOffset = (0x10000 - (0xD800 << 10) - 0xDC00); |
168 | }; |
169 | |
170 | class CaseMapping : AllStatic { |
171 | public: |
172 | // Maps a code point to uppercase. |
173 | static int32_t ToUpper(int32_t code_point) { |
174 | return Convert(code_point, kUppercase); |
175 | } |
176 | |
177 | // Maps a code point to lowercase. |
178 | static int32_t ToLower(int32_t code_point) { |
179 | return Convert(code_point, kLowercase); |
180 | } |
181 | |
182 | private: |
183 | // Property is a delta to the uppercase mapping. |
184 | static const int32_t kUppercase = 1; |
185 | |
186 | // Property is a delta to the uppercase mapping. |
187 | static const int32_t kLowercase = 2; |
188 | |
189 | // Property is an index into the exception table. |
190 | static const int32_t kException = 3; |
191 | |
192 | // Type bit-field parameters |
193 | static const int32_t kTypeShift = 2; |
194 | static const int32_t kTypeMask = 3; |
195 | |
196 | // The size of the stage 1 index. |
197 | // TODO(cshapiro): improve indexing so this value is unnecessary. |
198 | static const intptr_t kStage1Size = 261; |
199 | |
200 | // The size of a stage 2 block in bytes. |
201 | static const intptr_t kBlockSizeLog2 = 8; |
202 | static const intptr_t kBlockSize = 1 << kBlockSizeLog2; |
203 | |
204 | static int32_t Convert(int32_t ch, int32_t mapping) { |
205 | if (Utf::IsLatin1(ch)) { |
206 | int32_t info = stage2_[ch]; |
207 | if ((info & kTypeMask) == mapping) { |
208 | ch += info >> kTypeShift; |
209 | } |
210 | } else if (ch <= (kStage1Size << kBlockSizeLog2)) { |
211 | int16_t offset = stage1_[ch >> kBlockSizeLog2] << kBlockSizeLog2; |
212 | int32_t info = stage2_[offset + (ch & (kBlockSize - 1))]; |
213 | int32_t type = info & kTypeMask; |
214 | if (type == mapping) { |
215 | ch += (info >> kTypeShift); |
216 | } else if (type == kException) { |
217 | ch += stage2_exception_[info >> kTypeShift][mapping - 1]; |
218 | } |
219 | } |
220 | return ch; |
221 | } |
222 | |
223 | // Index into the data array. |
224 | static const uint8_t stage1_[]; |
225 | |
226 | // Data for small code points with one mapping |
227 | static const int16_t stage2_[]; |
228 | |
229 | // Data for large code points or code points with both mappings. |
230 | static const int32_t stage2_exception_[][2]; |
231 | }; |
232 | |
// Helpers for working with characters in the Latin-1 range.
class Latin1 {
 public:
  // Largest Latin-1 code point.
  static const int32_t kMaxChar = 0xff;

  // Convert the character to its Latin-1 case equivalent if possible;
  // characters without a special mapping are returned unchanged.
  static inline uint16_t TryConvertToLatin1(uint16_t c) {
    // U+039C and U+03BC (Greek capital/small mu) are equivalent to the
    // Latin-1 micro sign U+00B5.
    if ((c == 0x39c) || (c == 0x3bc)) {
      return 0xb5;
    }
    // U+0178 is the uppercase form of the Latin-1 character U+00FF, but lies
    // outside of Latin-1 itself.
    if (c == 0x178) {
      return 0xff;
    }
    return c;
  }
};
251 | |
252 | } // namespace dart |
253 | |
254 | #endif // RUNTIME_PLATFORM_UNICODE_H_ |
255 | |