1#include <Core/Defines.h>
2
3#include <Columns/ColumnString.h>
4#include <Columns/ColumnsNumber.h>
5#include <Columns/ColumnConst.h>
6
7#include <Common/typeid_cast.h>
8#include <Common/assert_cast.h>
9
10#include <Core/Field.h>
11
12#include <Formats/FormatSettings.h>
13#include <Formats/ProtobufReader.h>
14#include <Formats/ProtobufWriter.h>
15#include <DataTypes/DataTypeString.h>
16#include <DataTypes/DataTypeFactory.h>
17
18#include <IO/ReadHelpers.h>
19#include <IO/WriteHelpers.h>
20#include <IO/VarInt.h>
21
22#ifdef __SSE2__
23 #include <emmintrin.h>
24#endif
25
26
27namespace DB
28{
29
30void DataTypeString::serializeBinary(const Field & field, WriteBuffer & ostr) const
31{
32 const String & s = get<const String &>(field);
33 writeVarUInt(s.size(), ostr);
34 writeString(s, ostr);
35}
36
37
38void DataTypeString::deserializeBinary(Field & field, ReadBuffer & istr) const
39{
40 UInt64 size;
41 readVarUInt(size, istr);
42 field = String();
43 String & s = get<String &>(field);
44 s.resize(size);
45 istr.readStrict(s.data(), size);
46}
47
48
49void DataTypeString::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const
50{
51 const StringRef & s = assert_cast<const ColumnString &>(column).getDataAt(row_num);
52 writeVarUInt(s.size, ostr);
53 writeString(s, ostr);
54}
55
56
57void DataTypeString::deserializeBinary(IColumn & column, ReadBuffer & istr) const
58{
59 ColumnString & column_string = assert_cast<ColumnString &>(column);
60 ColumnString::Chars & data = column_string.getChars();
61 ColumnString::Offsets & offsets = column_string.getOffsets();
62
63 UInt64 size;
64 readVarUInt(size, istr);
65
66 size_t old_chars_size = data.size();
67 size_t offset = old_chars_size + size + 1;
68 offsets.push_back(offset);
69
70 try
71 {
72 data.resize(offset);
73 istr.readStrict(reinterpret_cast<char*>(&data[offset - size - 1]), size);
74 data.back() = 0;
75 }
76 catch (...)
77 {
78 offsets.pop_back();
79 data.resize_assume_reserved(old_chars_size);
80 throw;
81 }
82}
83
84
85void DataTypeString::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const
86{
87 const ColumnString & column_string = typeid_cast<const ColumnString &>(column);
88 const ColumnString::Chars & data = column_string.getChars();
89 const ColumnString::Offsets & offsets = column_string.getOffsets();
90
91 size_t size = column.size();
92 if (!size)
93 return;
94
95 size_t end = limit && offset + limit < size
96 ? offset + limit
97 : size;
98
99 if (offset == 0)
100 {
101 UInt64 str_size = offsets[0] - 1;
102 writeVarUInt(str_size, ostr);
103 ostr.write(reinterpret_cast<const char *>(data.data()), str_size);
104
105 ++offset;
106 }
107
108 for (size_t i = offset; i < end; ++i)
109 {
110 UInt64 str_size = offsets[i] - offsets[i - 1] - 1;
111 writeVarUInt(str_size, ostr);
112 ostr.write(reinterpret_cast<const char *>(&data[offsets[i - 1]]), str_size);
113 }
114}
115
116
117template <int UNROLL_TIMES>
118static NO_INLINE void deserializeBinarySSE2(ColumnString::Chars & data, ColumnString::Offsets & offsets, ReadBuffer & istr, size_t limit)
119{
120 size_t offset = data.size();
121 for (size_t i = 0; i < limit; ++i)
122 {
123 if (istr.eof())
124 break;
125
126 UInt64 size;
127 readVarUInt(size, istr);
128
129 offset += size + 1;
130 offsets.push_back(offset);
131
132 data.resize(offset);
133
134 if (size)
135 {
136#ifdef __SSE2__
137 /// An optimistic branch in which more efficient copying is possible.
138 if (offset + 16 * UNROLL_TIMES <= data.capacity() && istr.position() + size + 16 * UNROLL_TIMES <= istr.buffer().end())
139 {
140 const __m128i * sse_src_pos = reinterpret_cast<const __m128i *>(istr.position());
141 const __m128i * sse_src_end = sse_src_pos + (size + (16 * UNROLL_TIMES - 1)) / 16 / UNROLL_TIMES * UNROLL_TIMES;
142 __m128i * sse_dst_pos = reinterpret_cast<__m128i *>(&data[offset - size - 1]);
143
144 while (sse_src_pos < sse_src_end)
145 {
146 for (size_t j = 0; j < UNROLL_TIMES; ++j)
147 _mm_storeu_si128(sse_dst_pos + j, _mm_loadu_si128(sse_src_pos + j));
148
149 sse_src_pos += UNROLL_TIMES;
150 sse_dst_pos += UNROLL_TIMES;
151 }
152
153 istr.position() += size;
154 }
155 else
156#endif
157 {
158 istr.readStrict(reinterpret_cast<char*>(&data[offset - size - 1]), size);
159 }
160 }
161
162 data[offset - 1] = 0;
163 }
164}
165
166
167void DataTypeString::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const
168{
169 ColumnString & column_string = typeid_cast<ColumnString &>(column);
170 ColumnString::Chars & data = column_string.getChars();
171 ColumnString::Offsets & offsets = column_string.getOffsets();
172
173 double avg_chars_size = 1; /// By default reserve only for empty strings.
174
175 if (avg_value_size_hint && avg_value_size_hint > sizeof(offsets[0]))
176 {
177 /// Randomly selected.
178 constexpr auto avg_value_size_hint_reserve_multiplier = 1.2;
179
180 avg_chars_size = (avg_value_size_hint - sizeof(offsets[0])) * avg_value_size_hint_reserve_multiplier;
181 }
182
183 size_t size_to_reserve = data.size() + std::ceil(limit * avg_chars_size);
184
185 /// Never reserve for too big size.
186 if (size_to_reserve < 256 * 1024 * 1024)
187 {
188 try
189 {
190 data.reserve(size_to_reserve);
191 }
192 catch (Exception & e)
193 {
194 e.addMessage(
195 "(avg_value_size_hint = " + toString(avg_value_size_hint)
196 + ", avg_chars_size = " + toString(avg_chars_size)
197 + ", limit = " + toString(limit) + ")");
198 throw;
199 }
200 }
201
202 offsets.reserve(offsets.size() + limit);
203
204 if (avg_chars_size >= 64)
205 deserializeBinarySSE2<4>(data, offsets, istr, limit);
206 else if (avg_chars_size >= 48)
207 deserializeBinarySSE2<3>(data, offsets, istr, limit);
208 else if (avg_chars_size >= 32)
209 deserializeBinarySSE2<2>(data, offsets, istr, limit);
210 else
211 deserializeBinarySSE2<1>(data, offsets, istr, limit);
212}
213
214
215void DataTypeString::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
216{
217 writeString(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr);
218}
219
220
221void DataTypeString::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
222{
223 writeEscapedString(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr);
224}
225
226
227template <typename Reader>
228static inline void read(IColumn & column, Reader && reader)
229{
230 ColumnString & column_string = assert_cast<ColumnString &>(column);
231 ColumnString::Chars & data = column_string.getChars();
232 ColumnString::Offsets & offsets = column_string.getOffsets();
233 size_t old_chars_size = data.size();
234 size_t old_offsets_size = offsets.size();
235 try
236 {
237 reader(data);
238 data.push_back(0);
239 offsets.push_back(data.size());
240 }
241 catch (...)
242 {
243 offsets.resize_assume_reserved(old_offsets_size);
244 data.resize_assume_reserved(old_chars_size);
245 throw;
246 }
247}
248
249
250void DataTypeString::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
251{
252 read(column, [&](ColumnString::Chars & data) { readStringInto(data, istr); });
253}
254
255
256void DataTypeString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
257{
258 read(column, [&](ColumnString::Chars & data) { readEscapedStringInto(data, istr); });
259}
260
261
262void DataTypeString::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
263{
264 writeQuotedString(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr);
265}
266
267
268void DataTypeString::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
269{
270 read(column, [&](ColumnString::Chars & data) { readQuotedStringInto<true>(data, istr); });
271}
272
273
274void DataTypeString::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
275{
276 writeJSONString(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr, settings);
277}
278
279
280void DataTypeString::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
281{
282 read(column, [&](ColumnString::Chars & data) { readJSONStringInto(data, istr); });
283}
284
285
286void DataTypeString::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
287{
288 writeXMLString(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr);
289}
290
291
292void DataTypeString::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
293{
294 writeCSVString<>(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr);
295}
296
297
298void DataTypeString::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
299{
300 read(column, [&](ColumnString::Chars & data) { readCSVStringInto(data, istr, settings.csv); });
301}
302
303
304void DataTypeString::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const
305{
306 if (value_index)
307 return;
308 value_index = static_cast<bool>(protobuf.writeString(assert_cast<const ColumnString &>(column).getDataAt(row_num)));
309}
310
311
312void DataTypeString::deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const
313{
314 row_added = false;
315 auto & column_string = assert_cast<ColumnString &>(column);
316 ColumnString::Chars & data = column_string.getChars();
317 ColumnString::Offsets & offsets = column_string.getOffsets();
318 size_t old_size = offsets.size();
319 try
320 {
321 if (allow_add_row)
322 {
323 if (protobuf.readStringInto(data))
324 {
325 data.emplace_back(0);
326 offsets.emplace_back(data.size());
327 row_added = true;
328 }
329 else
330 data.resize_assume_reserved(offsets.back());
331 }
332 else
333 {
334 ColumnString::Chars temp_data;
335 if (protobuf.readStringInto(temp_data))
336 {
337 temp_data.emplace_back(0);
338 column_string.popBack(1);
339 old_size = offsets.size();
340 data.insertSmallAllowReadWriteOverflow15(temp_data.begin(), temp_data.end());
341 offsets.emplace_back(data.size());
342 }
343 }
344 }
345 catch (...)
346 {
347 offsets.resize_assume_reserved(old_size);
348 data.resize_assume_reserved(offsets.back());
349 throw;
350 }
351}
352
353Field DataTypeString::getDefault() const
354{
355 return String();
356}
357
358MutableColumnPtr DataTypeString::createColumn() const
359{
360 return ColumnString::create();
361}
362
363
364bool DataTypeString::equals(const IDataType & rhs) const
365{
366 return typeid(rhs) == typeid(*this);
367}
368
369
370void registerDataTypeString(DataTypeFactory & factory)
371{
372 const auto & creator = [&] (const String & type_name) { return std::make_shared<DataTypeString>(type_name); };
373
374 factory.registerSimpleDataType("String", creator);
375
376 /// These synonyms are added for compatibility.
377
378 factory.registerAlias("CHAR", "String", DataTypeFactory::CaseInsensitive);
379 factory.registerAlias("VARCHAR", "String", DataTypeFactory::CaseInsensitive);
380 factory.registerAlias("TEXT", "String", DataTypeFactory::CaseInsensitive);
381 factory.registerAlias("TINYTEXT", "String", DataTypeFactory::CaseInsensitive);
382 factory.registerAlias("MEDIUMTEXT", "String", DataTypeFactory::CaseInsensitive);
383 factory.registerAlias("LONGTEXT", "String", DataTypeFactory::CaseInsensitive);
384 factory.registerAlias("BLOB", "String", DataTypeFactory::CaseInsensitive);
385 factory.registerAlias("TINYBLOB", "String", DataTypeFactory::CaseInsensitive);
386 factory.registerAlias("MEDIUMBLOB", "String", DataTypeFactory::CaseInsensitive);
387 factory.registerAlias("LONGBLOB", "String", DataTypeFactory::CaseInsensitive);
388}
389
390}
391