1 | #include <Core/Defines.h> |
2 | |
3 | #include <Columns/ColumnString.h> |
4 | #include <Columns/ColumnsNumber.h> |
5 | #include <Columns/ColumnConst.h> |
6 | |
7 | #include <Common/typeid_cast.h> |
8 | #include <Common/assert_cast.h> |
9 | |
10 | #include <Core/Field.h> |
11 | |
12 | #include <Formats/FormatSettings.h> |
13 | #include <Formats/ProtobufReader.h> |
14 | #include <Formats/ProtobufWriter.h> |
15 | #include <DataTypes/DataTypeString.h> |
16 | #include <DataTypes/DataTypeFactory.h> |
17 | |
18 | #include <IO/ReadHelpers.h> |
19 | #include <IO/WriteHelpers.h> |
20 | #include <IO/VarInt.h> |
21 | |
22 | #ifdef __SSE2__ |
23 | #include <emmintrin.h> |
24 | #endif |
25 | |
26 | |
27 | namespace DB |
28 | { |
29 | |
30 | void DataTypeString::serializeBinary(const Field & field, WriteBuffer & ostr) const |
31 | { |
32 | const String & s = get<const String &>(field); |
33 | writeVarUInt(s.size(), ostr); |
34 | writeString(s, ostr); |
35 | } |
36 | |
37 | |
38 | void DataTypeString::deserializeBinary(Field & field, ReadBuffer & istr) const |
39 | { |
40 | UInt64 size; |
41 | readVarUInt(size, istr); |
42 | field = String(); |
43 | String & s = get<String &>(field); |
44 | s.resize(size); |
45 | istr.readStrict(s.data(), size); |
46 | } |
47 | |
48 | |
49 | void DataTypeString::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const |
50 | { |
51 | const StringRef & s = assert_cast<const ColumnString &>(column).getDataAt(row_num); |
52 | writeVarUInt(s.size, ostr); |
53 | writeString(s, ostr); |
54 | } |
55 | |
56 | |
57 | void DataTypeString::deserializeBinary(IColumn & column, ReadBuffer & istr) const |
58 | { |
59 | ColumnString & column_string = assert_cast<ColumnString &>(column); |
60 | ColumnString::Chars & data = column_string.getChars(); |
61 | ColumnString::Offsets & offsets = column_string.getOffsets(); |
62 | |
63 | UInt64 size; |
64 | readVarUInt(size, istr); |
65 | |
66 | size_t old_chars_size = data.size(); |
67 | size_t offset = old_chars_size + size + 1; |
68 | offsets.push_back(offset); |
69 | |
70 | try |
71 | { |
72 | data.resize(offset); |
73 | istr.readStrict(reinterpret_cast<char*>(&data[offset - size - 1]), size); |
74 | data.back() = 0; |
75 | } |
76 | catch (...) |
77 | { |
78 | offsets.pop_back(); |
79 | data.resize_assume_reserved(old_chars_size); |
80 | throw; |
81 | } |
82 | } |
83 | |
84 | |
85 | void DataTypeString::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const |
86 | { |
87 | const ColumnString & column_string = typeid_cast<const ColumnString &>(column); |
88 | const ColumnString::Chars & data = column_string.getChars(); |
89 | const ColumnString::Offsets & offsets = column_string.getOffsets(); |
90 | |
91 | size_t size = column.size(); |
92 | if (!size) |
93 | return; |
94 | |
95 | size_t end = limit && offset + limit < size |
96 | ? offset + limit |
97 | : size; |
98 | |
99 | if (offset == 0) |
100 | { |
101 | UInt64 str_size = offsets[0] - 1; |
102 | writeVarUInt(str_size, ostr); |
103 | ostr.write(reinterpret_cast<const char *>(data.data()), str_size); |
104 | |
105 | ++offset; |
106 | } |
107 | |
108 | for (size_t i = offset; i < end; ++i) |
109 | { |
110 | UInt64 str_size = offsets[i] - offsets[i - 1] - 1; |
111 | writeVarUInt(str_size, ostr); |
112 | ostr.write(reinterpret_cast<const char *>(&data[offsets[i - 1]]), str_size); |
113 | } |
114 | } |
115 | |
116 | |
117 | template <int UNROLL_TIMES> |
118 | static NO_INLINE void deserializeBinarySSE2(ColumnString::Chars & data, ColumnString::Offsets & offsets, ReadBuffer & istr, size_t limit) |
119 | { |
120 | size_t offset = data.size(); |
121 | for (size_t i = 0; i < limit; ++i) |
122 | { |
123 | if (istr.eof()) |
124 | break; |
125 | |
126 | UInt64 size; |
127 | readVarUInt(size, istr); |
128 | |
129 | offset += size + 1; |
130 | offsets.push_back(offset); |
131 | |
132 | data.resize(offset); |
133 | |
134 | if (size) |
135 | { |
136 | #ifdef __SSE2__ |
137 | /// An optimistic branch in which more efficient copying is possible. |
138 | if (offset + 16 * UNROLL_TIMES <= data.capacity() && istr.position() + size + 16 * UNROLL_TIMES <= istr.buffer().end()) |
139 | { |
140 | const __m128i * sse_src_pos = reinterpret_cast<const __m128i *>(istr.position()); |
141 | const __m128i * sse_src_end = sse_src_pos + (size + (16 * UNROLL_TIMES - 1)) / 16 / UNROLL_TIMES * UNROLL_TIMES; |
142 | __m128i * sse_dst_pos = reinterpret_cast<__m128i *>(&data[offset - size - 1]); |
143 | |
144 | while (sse_src_pos < sse_src_end) |
145 | { |
146 | for (size_t j = 0; j < UNROLL_TIMES; ++j) |
147 | _mm_storeu_si128(sse_dst_pos + j, _mm_loadu_si128(sse_src_pos + j)); |
148 | |
149 | sse_src_pos += UNROLL_TIMES; |
150 | sse_dst_pos += UNROLL_TIMES; |
151 | } |
152 | |
153 | istr.position() += size; |
154 | } |
155 | else |
156 | #endif |
157 | { |
158 | istr.readStrict(reinterpret_cast<char*>(&data[offset - size - 1]), size); |
159 | } |
160 | } |
161 | |
162 | data[offset - 1] = 0; |
163 | } |
164 | } |
165 | |
166 | |
167 | void DataTypeString::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const |
168 | { |
169 | ColumnString & column_string = typeid_cast<ColumnString &>(column); |
170 | ColumnString::Chars & data = column_string.getChars(); |
171 | ColumnString::Offsets & offsets = column_string.getOffsets(); |
172 | |
173 | double avg_chars_size = 1; /// By default reserve only for empty strings. |
174 | |
175 | if (avg_value_size_hint && avg_value_size_hint > sizeof(offsets[0])) |
176 | { |
177 | /// Randomly selected. |
178 | constexpr auto avg_value_size_hint_reserve_multiplier = 1.2; |
179 | |
180 | avg_chars_size = (avg_value_size_hint - sizeof(offsets[0])) * avg_value_size_hint_reserve_multiplier; |
181 | } |
182 | |
183 | size_t size_to_reserve = data.size() + std::ceil(limit * avg_chars_size); |
184 | |
185 | /// Never reserve for too big size. |
186 | if (size_to_reserve < 256 * 1024 * 1024) |
187 | { |
188 | try |
189 | { |
190 | data.reserve(size_to_reserve); |
191 | } |
192 | catch (Exception & e) |
193 | { |
194 | e.addMessage( |
195 | "(avg_value_size_hint = " + toString(avg_value_size_hint) |
196 | + ", avg_chars_size = " + toString(avg_chars_size) |
197 | + ", limit = " + toString(limit) + ")" ); |
198 | throw; |
199 | } |
200 | } |
201 | |
202 | offsets.reserve(offsets.size() + limit); |
203 | |
204 | if (avg_chars_size >= 64) |
205 | deserializeBinarySSE2<4>(data, offsets, istr, limit); |
206 | else if (avg_chars_size >= 48) |
207 | deserializeBinarySSE2<3>(data, offsets, istr, limit); |
208 | else if (avg_chars_size >= 32) |
209 | deserializeBinarySSE2<2>(data, offsets, istr, limit); |
210 | else |
211 | deserializeBinarySSE2<1>(data, offsets, istr, limit); |
212 | } |
213 | |
214 | |
215 | void DataTypeString::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const |
216 | { |
217 | writeString(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr); |
218 | } |
219 | |
220 | |
221 | void DataTypeString::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const |
222 | { |
223 | writeEscapedString(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr); |
224 | } |
225 | |
226 | |
227 | template <typename Reader> |
228 | static inline void read(IColumn & column, Reader && reader) |
229 | { |
230 | ColumnString & column_string = assert_cast<ColumnString &>(column); |
231 | ColumnString::Chars & data = column_string.getChars(); |
232 | ColumnString::Offsets & offsets = column_string.getOffsets(); |
233 | size_t old_chars_size = data.size(); |
234 | size_t old_offsets_size = offsets.size(); |
235 | try |
236 | { |
237 | reader(data); |
238 | data.push_back(0); |
239 | offsets.push_back(data.size()); |
240 | } |
241 | catch (...) |
242 | { |
243 | offsets.resize_assume_reserved(old_offsets_size); |
244 | data.resize_assume_reserved(old_chars_size); |
245 | throw; |
246 | } |
247 | } |
248 | |
249 | |
250 | void DataTypeString::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const |
251 | { |
252 | read(column, [&](ColumnString::Chars & data) { readStringInto(data, istr); }); |
253 | } |
254 | |
255 | |
256 | void DataTypeString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const |
257 | { |
258 | read(column, [&](ColumnString::Chars & data) { readEscapedStringInto(data, istr); }); |
259 | } |
260 | |
261 | |
262 | void DataTypeString::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const |
263 | { |
264 | writeQuotedString(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr); |
265 | } |
266 | |
267 | |
268 | void DataTypeString::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const |
269 | { |
270 | read(column, [&](ColumnString::Chars & data) { readQuotedStringInto<true>(data, istr); }); |
271 | } |
272 | |
273 | |
274 | void DataTypeString::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const |
275 | { |
276 | writeJSONString(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr, settings); |
277 | } |
278 | |
279 | |
280 | void DataTypeString::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const |
281 | { |
282 | read(column, [&](ColumnString::Chars & data) { readJSONStringInto(data, istr); }); |
283 | } |
284 | |
285 | |
286 | void DataTypeString::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const |
287 | { |
288 | writeXMLString(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr); |
289 | } |
290 | |
291 | |
292 | void DataTypeString::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const |
293 | { |
294 | writeCSVString<>(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr); |
295 | } |
296 | |
297 | |
298 | void DataTypeString::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const |
299 | { |
300 | read(column, [&](ColumnString::Chars & data) { readCSVStringInto(data, istr, settings.csv); }); |
301 | } |
302 | |
303 | |
304 | void DataTypeString::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const |
305 | { |
306 | if (value_index) |
307 | return; |
308 | value_index = static_cast<bool>(protobuf.writeString(assert_cast<const ColumnString &>(column).getDataAt(row_num))); |
309 | } |
310 | |
311 | |
312 | void DataTypeString::deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const |
313 | { |
314 | row_added = false; |
315 | auto & column_string = assert_cast<ColumnString &>(column); |
316 | ColumnString::Chars & data = column_string.getChars(); |
317 | ColumnString::Offsets & offsets = column_string.getOffsets(); |
318 | size_t old_size = offsets.size(); |
319 | try |
320 | { |
321 | if (allow_add_row) |
322 | { |
323 | if (protobuf.readStringInto(data)) |
324 | { |
325 | data.emplace_back(0); |
326 | offsets.emplace_back(data.size()); |
327 | row_added = true; |
328 | } |
329 | else |
330 | data.resize_assume_reserved(offsets.back()); |
331 | } |
332 | else |
333 | { |
334 | ColumnString::Chars temp_data; |
335 | if (protobuf.readStringInto(temp_data)) |
336 | { |
337 | temp_data.emplace_back(0); |
338 | column_string.popBack(1); |
339 | old_size = offsets.size(); |
340 | data.insertSmallAllowReadWriteOverflow15(temp_data.begin(), temp_data.end()); |
341 | offsets.emplace_back(data.size()); |
342 | } |
343 | } |
344 | } |
345 | catch (...) |
346 | { |
347 | offsets.resize_assume_reserved(old_size); |
348 | data.resize_assume_reserved(offsets.back()); |
349 | throw; |
350 | } |
351 | } |
352 | |
353 | Field DataTypeString::getDefault() const |
354 | { |
355 | return String(); |
356 | } |
357 | |
358 | MutableColumnPtr DataTypeString::createColumn() const |
359 | { |
360 | return ColumnString::create(); |
361 | } |
362 | |
363 | |
364 | bool DataTypeString::equals(const IDataType & rhs) const |
365 | { |
366 | return typeid(rhs) == typeid(*this); |
367 | } |
368 | |
369 | |
370 | void registerDataTypeString(DataTypeFactory & factory) |
371 | { |
372 | const auto & creator = [&] (const String & type_name) { return std::make_shared<DataTypeString>(type_name); }; |
373 | |
374 | factory.registerSimpleDataType("String" , creator); |
375 | |
376 | /// These synonyms are added for compatibility. |
377 | |
378 | factory.registerAlias("CHAR" , "String" , DataTypeFactory::CaseInsensitive); |
379 | factory.registerAlias("VARCHAR" , "String" , DataTypeFactory::CaseInsensitive); |
380 | factory.registerAlias("TEXT" , "String" , DataTypeFactory::CaseInsensitive); |
381 | factory.registerAlias("TINYTEXT" , "String" , DataTypeFactory::CaseInsensitive); |
382 | factory.registerAlias("MEDIUMTEXT" , "String" , DataTypeFactory::CaseInsensitive); |
383 | factory.registerAlias("LONGTEXT" , "String" , DataTypeFactory::CaseInsensitive); |
384 | factory.registerAlias("BLOB" , "String" , DataTypeFactory::CaseInsensitive); |
385 | factory.registerAlias("TINYBLOB" , "String" , DataTypeFactory::CaseInsensitive); |
386 | factory.registerAlias("MEDIUMBLOB" , "String" , DataTypeFactory::CaseInsensitive); |
387 | factory.registerAlias("LONGBLOB" , "String" , DataTypeFactory::CaseInsensitive); |
388 | } |
389 | |
390 | } |
391 | |