1 | // Tencent is pleased to support the open source community by making RapidJSON available. |
2 | // |
3 | // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. |
4 | // |
5 | // Licensed under the MIT License (the "License"); you may not use this file except |
6 | // in compliance with the License. You may obtain a copy of the License at |
7 | // |
8 | // http://opensource.org/licenses/MIT |
9 | // |
10 | // Unless required by applicable law or agreed to in writing, software distributed |
11 | // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR |
12 | // CONDITIONS OF ANY KIND, either express or implied. See the License for the |
13 | // specific language governing permissions and limitations under the License. |
14 | |
15 | #ifndef RAPIDJSON_ENCODEDSTREAM_H_ |
16 | #define RAPIDJSON_ENCODEDSTREAM_H_ |
17 | |
18 | #include "stream.h" |
19 | #include "memorystream.h" |
20 | |
21 | #ifdef __GNUC__ |
22 | RAPIDJSON_DIAG_PUSH |
23 | RAPIDJSON_DIAG_OFF(effc++) |
24 | #endif |
25 | |
26 | #ifdef __clang__ |
27 | RAPIDJSON_DIAG_PUSH |
28 | RAPIDJSON_DIAG_OFF(padded) |
29 | #endif |
30 | |
31 | RAPIDJSON_NAMESPACE_BEGIN |
32 | |
33 | //! Input byte stream wrapper with a statically bound encoding. |
34 | /*! |
35 | \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE. |
36 | \tparam InputByteStream Type of input byte stream. For example, FileReadStream. |
37 | */ |
38 | template <typename Encoding, typename InputByteStream> |
39 | class EncodedInputStream { |
40 | RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); |
41 | public: |
42 | typedef typename Encoding::Ch Ch; |
43 | |
44 | EncodedInputStream(InputByteStream& is) : is_(is) { |
45 | current_ = Encoding::TakeBOM(is_); |
46 | } |
47 | |
48 | Ch Peek() const { return current_; } |
49 | Ch Take() { Ch c = current_; current_ = Encoding::Take(is_); return c; } |
50 | size_t Tell() const { return is_.Tell(); } |
51 | |
52 | // Not implemented |
53 | void Put(Ch) { RAPIDJSON_ASSERT(false); } |
54 | void Flush() { RAPIDJSON_ASSERT(false); } |
55 | Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } |
56 | size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } |
57 | |
58 | private: |
59 | EncodedInputStream(const EncodedInputStream&); |
60 | EncodedInputStream& operator=(const EncodedInputStream&); |
61 | |
62 | InputByteStream& is_; |
63 | Ch current_; |
64 | }; |
65 | |
66 | //! Specialized for UTF8 MemoryStream. |
67 | template <> |
68 | class EncodedInputStream<UTF8<>, MemoryStream> { |
69 | public: |
70 | typedef UTF8<>::Ch Ch; |
71 | |
72 | EncodedInputStream(MemoryStream& is) : is_(is) { |
73 | if (static_cast<unsigned char>(is_.Peek()) == 0xEFu) is_.Take(); |
74 | if (static_cast<unsigned char>(is_.Peek()) == 0xBBu) is_.Take(); |
75 | if (static_cast<unsigned char>(is_.Peek()) == 0xBFu) is_.Take(); |
76 | } |
77 | Ch Peek() const { return is_.Peek(); } |
78 | Ch Take() { return is_.Take(); } |
79 | size_t Tell() const { return is_.Tell(); } |
80 | |
81 | // Not implemented |
82 | void Put(Ch) {} |
83 | void Flush() {} |
84 | Ch* PutBegin() { return 0; } |
85 | size_t PutEnd(Ch*) { return 0; } |
86 | |
87 | MemoryStream& is_; |
88 | |
89 | private: |
90 | EncodedInputStream(const EncodedInputStream&); |
91 | EncodedInputStream& operator=(const EncodedInputStream&); |
92 | }; |
93 | |
94 | //! Output byte stream wrapper with statically bound encoding. |
95 | /*! |
96 | \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE. |
97 | \tparam OutputByteStream Type of input byte stream. For example, FileWriteStream. |
98 | */ |
99 | template <typename Encoding, typename OutputByteStream> |
100 | class EncodedOutputStream { |
101 | RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); |
102 | public: |
103 | typedef typename Encoding::Ch Ch; |
104 | |
105 | EncodedOutputStream(OutputByteStream& os, bool putBOM = true) : os_(os) { |
106 | if (putBOM) |
107 | Encoding::PutBOM(os_); |
108 | } |
109 | |
110 | void Put(Ch c) { Encoding::Put(os_, c); } |
111 | void Flush() { os_.Flush(); } |
112 | |
113 | // Not implemented |
114 | Ch Peek() const { RAPIDJSON_ASSERT(false); return 0;} |
115 | Ch Take() { RAPIDJSON_ASSERT(false); return 0;} |
116 | size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; } |
117 | Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } |
118 | size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } |
119 | |
120 | private: |
121 | EncodedOutputStream(const EncodedOutputStream&); |
122 | EncodedOutputStream& operator=(const EncodedOutputStream&); |
123 | |
124 | OutputByteStream& os_; |
125 | }; |
126 | |
// Expands to a comma-separated list naming member x of each supported encoding,
// in the order UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE. It is used below as the
// initializer of function tables that are indexed by a UTFType value. A type
// named Ch must be in scope at the point of expansion.
#define RAPIDJSON_ENCODINGS_FUNC(x) UTF8<Ch>::x, UTF16LE<Ch>::x, UTF16BE<Ch>::x, UTF32LE<Ch>::x, UTF32BE<Ch>::x
128 | |
129 | //! Input stream wrapper with dynamically bound encoding and automatic encoding detection. |
130 | /*! |
131 | \tparam CharType Type of character for reading. |
132 | \tparam InputByteStream type of input byte stream to be wrapped. |
133 | */ |
134 | template <typename CharType, typename InputByteStream> |
135 | class AutoUTFInputStream { |
136 | RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1); |
137 | public: |
138 | typedef CharType Ch; |
139 | |
140 | //! Constructor. |
141 | /*! |
142 | \param is input stream to be wrapped. |
143 | \param type UTF encoding type if it is not detected from the stream. |
144 | */ |
145 | AutoUTFInputStream(InputByteStream& is, UTFType type = kUTF8) : is_(&is), type_(type), hasBOM_(false) { |
146 | RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE); |
147 | DetectType(); |
148 | static const TakeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Take) }; |
149 | takeFunc_ = f[type_]; |
150 | current_ = takeFunc_(*is_); |
151 | } |
152 | |
153 | UTFType GetType() const { return type_; } |
154 | bool HasBOM() const { return hasBOM_; } |
155 | |
156 | Ch Peek() const { return current_; } |
157 | Ch Take() { Ch c = current_; current_ = takeFunc_(*is_); return c; } |
158 | size_t Tell() const { return is_->Tell(); } |
159 | |
160 | // Not implemented |
161 | void Put(Ch) { RAPIDJSON_ASSERT(false); } |
162 | void Flush() { RAPIDJSON_ASSERT(false); } |
163 | Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } |
164 | size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } |
165 | |
166 | private: |
167 | AutoUTFInputStream(const AutoUTFInputStream&); |
168 | AutoUTFInputStream& operator=(const AutoUTFInputStream&); |
169 | |
170 | // Detect encoding type with BOM or RFC 4627 |
171 | void DetectType() { |
172 | // BOM (Byte Order Mark): |
173 | // 00 00 FE FF UTF-32BE |
174 | // FF FE 00 00 UTF-32LE |
175 | // FE FF UTF-16BE |
176 | // FF FE UTF-16LE |
177 | // EF BB BF UTF-8 |
178 | |
179 | const unsigned char* c = reinterpret_cast<const unsigned char *>(is_->Peek4()); |
180 | if (!c) |
181 | return; |
182 | |
183 | unsigned bom = static_cast<unsigned>(c[0] | (c[1] << 8) | (c[2] << 16) | (c[3] << 24)); |
184 | hasBOM_ = false; |
185 | if (bom == 0xFFFE0000) { type_ = kUTF32BE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); } |
186 | else if (bom == 0x0000FEFF) { type_ = kUTF32LE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); } |
187 | else if ((bom & 0xFFFF) == 0xFFFE) { type_ = kUTF16BE; hasBOM_ = true; is_->Take(); is_->Take(); } |
188 | else if ((bom & 0xFFFF) == 0xFEFF) { type_ = kUTF16LE; hasBOM_ = true; is_->Take(); is_->Take(); } |
189 | else if ((bom & 0xFFFFFF) == 0xBFBBEF) { type_ = kUTF8; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); } |
190 | |
191 | // RFC 4627: Section 3 |
192 | // "Since the first two characters of a JSON text will always be ASCII |
193 | // characters [RFC0020], it is possible to determine whether an octet |
194 | // stream is UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE) by looking |
195 | // at the pattern of nulls in the first four octets." |
196 | // 00 00 00 xx UTF-32BE |
197 | // 00 xx 00 xx UTF-16BE |
198 | // xx 00 00 00 UTF-32LE |
199 | // xx 00 xx 00 UTF-16LE |
200 | // xx xx xx xx UTF-8 |
201 | |
202 | if (!hasBOM_) { |
203 | unsigned pattern = (c[0] ? 1 : 0) | (c[1] ? 2 : 0) | (c[2] ? 4 : 0) | (c[3] ? 8 : 0); |
204 | switch (pattern) { |
205 | case 0x08: type_ = kUTF32BE; break; |
206 | case 0x0A: type_ = kUTF16BE; break; |
207 | case 0x01: type_ = kUTF32LE; break; |
208 | case 0x05: type_ = kUTF16LE; break; |
209 | case 0x0F: type_ = kUTF8; break; |
210 | default: break; // Use type defined by user. |
211 | } |
212 | } |
213 | |
214 | // Runtime check whether the size of character type is sufficient. It only perform checks with assertion. |
215 | if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2); |
216 | if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4); |
217 | } |
218 | |
219 | typedef Ch (*TakeFunc)(InputByteStream& is); |
220 | InputByteStream* is_; |
221 | UTFType type_; |
222 | Ch current_; |
223 | TakeFunc takeFunc_; |
224 | bool hasBOM_; |
225 | }; |
226 | |
227 | //! Output stream wrapper with dynamically bound encoding and automatic encoding detection. |
228 | /*! |
229 | \tparam CharType Type of character for writing. |
230 | \tparam OutputByteStream type of output byte stream to be wrapped. |
231 | */ |
232 | template <typename CharType, typename OutputByteStream> |
233 | class AutoUTFOutputStream { |
234 | RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1); |
235 | public: |
236 | typedef CharType Ch; |
237 | |
238 | //! Constructor. |
239 | /*! |
240 | \param os output stream to be wrapped. |
241 | \param type UTF encoding type. |
242 | \param putBOM Whether to write BOM at the beginning of the stream. |
243 | */ |
244 | AutoUTFOutputStream(OutputByteStream& os, UTFType type, bool putBOM) : os_(&os), type_(type) { |
245 | RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE); |
246 | |
247 | // Runtime check whether the size of character type is sufficient. It only perform checks with assertion. |
248 | if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2); |
249 | if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4); |
250 | |
251 | static const PutFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Put) }; |
252 | putFunc_ = f[type_]; |
253 | |
254 | if (putBOM) |
255 | PutBOM(); |
256 | } |
257 | |
258 | UTFType GetType() const { return type_; } |
259 | |
260 | void Put(Ch c) { putFunc_(*os_, c); } |
261 | void Flush() { os_->Flush(); } |
262 | |
263 | // Not implemented |
264 | Ch Peek() const { RAPIDJSON_ASSERT(false); return 0;} |
265 | Ch Take() { RAPIDJSON_ASSERT(false); return 0;} |
266 | size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; } |
267 | Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } |
268 | size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; } |
269 | |
270 | private: |
271 | AutoUTFOutputStream(const AutoUTFOutputStream&); |
272 | AutoUTFOutputStream& operator=(const AutoUTFOutputStream&); |
273 | |
274 | void PutBOM() { |
275 | typedef void (*PutBOMFunc)(OutputByteStream&); |
276 | static const PutBOMFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(PutBOM) }; |
277 | f[type_](*os_); |
278 | } |
279 | |
280 | typedef void (*PutFunc)(OutputByteStream&, Ch); |
281 | |
282 | OutputByteStream* os_; |
283 | UTFType type_; |
284 | PutFunc putFunc_; |
285 | }; |
286 | |
287 | #undef RAPIDJSON_ENCODINGS_FUNC |
288 | |
289 | RAPIDJSON_NAMESPACE_END |
290 | |
291 | #ifdef __clang__ |
292 | RAPIDJSON_DIAG_POP |
293 | #endif |
294 | |
295 | #ifdef __GNUC__ |
296 | RAPIDJSON_DIAG_POP |
297 | #endif |
298 | |
#endif // RAPIDJSON_ENCODEDSTREAM_H_
300 | |