1#include <Core/Defines.h>
2#include <Common/Arena.h>
3#include <Common/memcmpSmall.h>
4#include <Common/assert_cast.h>
5#include <Columns/Collator.h>
6#include <Columns/ColumnString.h>
7#include <Columns/ColumnsCommon.h>
8#include <DataStreams/ColumnGathererStream.h>
9
10#include <common/unaligned.h>
11
12
13namespace DB
14{
15
16namespace ErrorCodes
17{
18 extern const int PARAMETER_OUT_OF_BOUND;
19 extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
20}
21
22
23MutableColumnPtr ColumnString::cloneResized(size_t to_size) const
24{
25 auto res = ColumnString::create();
26
27 if (to_size == 0)
28 return res;
29
30 size_t from_size = size();
31
32 if (to_size <= from_size)
33 {
34 /// Just cut column.
35
36 res->offsets.assign(offsets.begin(), offsets.begin() + to_size);
37 res->chars.assign(chars.begin(), chars.begin() + offsets[to_size - 1]);
38 }
39 else
40 {
41 /// Copy column and append empty strings for extra elements.
42
43 Offset offset = 0;
44 if (from_size > 0)
45 {
46 res->offsets.assign(offsets.begin(), offsets.end());
47 res->chars.assign(chars.begin(), chars.end());
48 offset = offsets.back();
49 }
50
51 /// Empty strings are just zero terminating bytes.
52
53 res->chars.resize_fill(res->chars.size() + to_size - from_size);
54
55 res->offsets.resize(to_size);
56 for (size_t i = from_size; i < to_size; ++i)
57 {
58 ++offset;
59 res->offsets[i] = offset;
60 }
61 }
62
63 return res;
64}
65
66
67void ColumnString::insertRangeFrom(const IColumn & src, size_t start, size_t length)
68{
69 if (length == 0)
70 return;
71
72 const ColumnString & src_concrete = assert_cast<const ColumnString &>(src);
73
74 if (start + length > src_concrete.offsets.size())
75 throw Exception("Parameter out of bound in IColumnString::insertRangeFrom method.",
76 ErrorCodes::PARAMETER_OUT_OF_BOUND);
77
78 size_t nested_offset = src_concrete.offsetAt(start);
79 size_t nested_length = src_concrete.offsets[start + length - 1] - nested_offset;
80
81 size_t old_chars_size = chars.size();
82 chars.resize(old_chars_size + nested_length);
83 memcpy(&chars[old_chars_size], &src_concrete.chars[nested_offset], nested_length);
84
85 if (start == 0 && offsets.empty())
86 {
87 offsets.assign(src_concrete.offsets.begin(), src_concrete.offsets.begin() + length);
88 }
89 else
90 {
91 size_t old_size = offsets.size();
92 size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see PaddedPODArray
93 offsets.resize(old_size + length);
94
95 for (size_t i = 0; i < length; ++i)
96 offsets[old_size + i] = src_concrete.offsets[start + i] - nested_offset + prev_max_offset;
97 }
98}
99
100
101ColumnPtr ColumnString::filter(const Filter & filt, ssize_t result_size_hint) const
102{
103 if (offsets.size() == 0)
104 return ColumnString::create();
105
106 auto res = ColumnString::create();
107
108 Chars & res_chars = res->chars;
109 Offsets & res_offsets = res->offsets;
110
111 filterArraysImpl<UInt8>(chars, offsets, res_chars, res_offsets, filt, result_size_hint);
112 return res;
113}
114
115
116ColumnPtr ColumnString::permute(const Permutation & perm, size_t limit) const
117{
118 size_t size = offsets.size();
119
120 if (limit == 0)
121 limit = size;
122 else
123 limit = std::min(size, limit);
124
125 if (perm.size() < limit)
126 throw Exception("Size of permutation is less than required.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
127
128 if (limit == 0)
129 return ColumnString::create();
130
131 auto res = ColumnString::create();
132
133 Chars & res_chars = res->chars;
134 Offsets & res_offsets = res->offsets;
135
136 if (limit == size)
137 res_chars.resize(chars.size());
138 else
139 {
140 size_t new_chars_size = 0;
141 for (size_t i = 0; i < limit; ++i)
142 new_chars_size += sizeAt(perm[i]);
143 res_chars.resize(new_chars_size);
144 }
145
146 res_offsets.resize(limit);
147
148 Offset current_new_offset = 0;
149
150 for (size_t i = 0; i < limit; ++i)
151 {
152 size_t j = perm[i];
153 size_t string_offset = offsets[j - 1];
154 size_t string_size = offsets[j] - string_offset;
155
156 memcpySmallAllowReadWriteOverflow15(&res_chars[current_new_offset], &chars[string_offset], string_size);
157
158 current_new_offset += string_size;
159 res_offsets[i] = current_new_offset;
160 }
161
162 return res;
163}
164
165
166StringRef ColumnString::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const
167{
168 size_t string_size = sizeAt(n);
169 size_t offset = offsetAt(n);
170
171 StringRef res;
172 res.size = sizeof(string_size) + string_size;
173 char * pos = arena.allocContinue(res.size, begin);
174 memcpy(pos, &string_size, sizeof(string_size));
175 memcpy(pos + sizeof(string_size), &chars[offset], string_size);
176 res.data = pos;
177
178 return res;
179}
180
181const char * ColumnString::deserializeAndInsertFromArena(const char * pos)
182{
183 const size_t string_size = unalignedLoad<size_t>(pos);
184 pos += sizeof(string_size);
185
186 const size_t old_size = chars.size();
187 const size_t new_size = old_size + string_size;
188 chars.resize(new_size);
189 memcpy(chars.data() + old_size, pos, string_size);
190
191 offsets.push_back(new_size);
192 return pos + string_size;
193}
194
195
196ColumnPtr ColumnString::index(const IColumn & indexes, size_t limit) const
197{
198 return selectIndexImpl(*this, indexes, limit);
199}
200
201template <typename Type>
202ColumnPtr ColumnString::indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const
203{
204 if (limit == 0)
205 return ColumnString::create();
206
207 auto res = ColumnString::create();
208
209 Chars & res_chars = res->chars;
210 Offsets & res_offsets = res->offsets;
211
212 size_t new_chars_size = 0;
213 for (size_t i = 0; i < limit; ++i)
214 new_chars_size += sizeAt(indexes[i]);
215 res_chars.resize(new_chars_size);
216
217 res_offsets.resize(limit);
218
219 Offset current_new_offset = 0;
220
221 for (size_t i = 0; i < limit; ++i)
222 {
223 size_t j = indexes[i];
224 size_t string_offset = offsets[j - 1];
225 size_t string_size = offsets[j] - string_offset;
226
227 memcpySmallAllowReadWriteOverflow15(&res_chars[current_new_offset], &chars[string_offset], string_size);
228
229 current_new_offset += string_size;
230 res_offsets[i] = current_new_offset;
231 }
232
233 return res;
234}
235
236
237template <bool positive>
238struct ColumnString::less
239{
240 const ColumnString & parent;
241 explicit less(const ColumnString & parent_) : parent(parent_) {}
242 bool operator()(size_t lhs, size_t rhs) const
243 {
244 int res = memcmpSmallAllowOverflow15(
245 parent.chars.data() + parent.offsetAt(lhs), parent.sizeAt(lhs) - 1,
246 parent.chars.data() + parent.offsetAt(rhs), parent.sizeAt(rhs) - 1);
247
248 return positive ? (res < 0) : (res > 0);
249 }
250};
251
252void ColumnString::getPermutation(bool reverse, size_t limit, int /*nan_direction_hint*/, Permutation & res) const
253{
254 size_t s = offsets.size();
255 res.resize(s);
256 for (size_t i = 0; i < s; ++i)
257 res[i] = i;
258
259 if (limit >= s)
260 limit = 0;
261
262 if (limit)
263 {
264 if (reverse)
265 std::partial_sort(res.begin(), res.begin() + limit, res.end(), less<false>(*this));
266 else
267 std::partial_sort(res.begin(), res.begin() + limit, res.end(), less<true>(*this));
268 }
269 else
270 {
271 if (reverse)
272 std::sort(res.begin(), res.end(), less<false>(*this));
273 else
274 std::sort(res.begin(), res.end(), less<true>(*this));
275 }
276}
277
278
279ColumnPtr ColumnString::replicate(const Offsets & replicate_offsets) const
280{
281 size_t col_size = size();
282 if (col_size != replicate_offsets.size())
283 throw Exception("Size of offsets doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
284
285 auto res = ColumnString::create();
286
287 if (0 == col_size)
288 return res;
289
290 Chars & res_chars = res->chars;
291 Offsets & res_offsets = res->offsets;
292 res_chars.reserve(chars.size() / col_size * replicate_offsets.back());
293 res_offsets.reserve(replicate_offsets.back());
294
295 Offset prev_replicate_offset = 0;
296 Offset prev_string_offset = 0;
297 Offset current_new_offset = 0;
298
299 for (size_t i = 0; i < col_size; ++i)
300 {
301 size_t size_to_replicate = replicate_offsets[i] - prev_replicate_offset;
302 size_t string_size = offsets[i] - prev_string_offset;
303
304 for (size_t j = 0; j < size_to_replicate; ++j)
305 {
306 current_new_offset += string_size;
307 res_offsets.push_back(current_new_offset);
308
309 res_chars.resize(res_chars.size() + string_size);
310 memcpySmallAllowReadWriteOverflow15(
311 &res_chars[res_chars.size() - string_size], &chars[prev_string_offset], string_size);
312 }
313
314 prev_replicate_offset = replicate_offsets[i];
315 prev_string_offset = offsets[i];
316 }
317
318 return res;
319}
320
321
322void ColumnString::gather(ColumnGathererStream & gatherer)
323{
324 gatherer.gather(*this);
325}
326
327
328void ColumnString::reserve(size_t n)
329{
330 offsets.reserve(n);
331}
332
333
334void ColumnString::getExtremes(Field & min, Field & max) const
335{
336 min = String();
337 max = String();
338
339 size_t col_size = size();
340
341 if (col_size == 0)
342 return;
343
344 size_t min_idx = 0;
345 size_t max_idx = 0;
346
347 less<true> less_op(*this);
348
349 for (size_t i = 1; i < col_size; ++i)
350 {
351 if (less_op(i, min_idx))
352 min_idx = i;
353 else if (less_op(max_idx, i))
354 max_idx = i;
355 }
356
357 get(min_idx, min);
358 get(max_idx, max);
359}
360
361
362int ColumnString::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs_, const Collator & collator) const
363{
364 const ColumnString & rhs = assert_cast<const ColumnString &>(rhs_);
365
366 return collator.compare(
367 reinterpret_cast<const char *>(&chars[offsetAt(n)]), sizeAt(n),
368 reinterpret_cast<const char *>(&rhs.chars[rhs.offsetAt(m)]), rhs.sizeAt(m));
369}
370
371
372template <bool positive>
373struct ColumnString::lessWithCollation
374{
375 const ColumnString & parent;
376 const Collator & collator;
377
378 lessWithCollation(const ColumnString & parent_, const Collator & collator_) : parent(parent_), collator(collator_) {}
379
380 bool operator()(size_t lhs, size_t rhs) const
381 {
382 int res = collator.compare(
383 reinterpret_cast<const char *>(&parent.chars[parent.offsetAt(lhs)]), parent.sizeAt(lhs),
384 reinterpret_cast<const char *>(&parent.chars[parent.offsetAt(rhs)]), parent.sizeAt(rhs));
385
386 return positive ? (res < 0) : (res > 0);
387 }
388};
389
390void ColumnString::getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, Permutation & res) const
391{
392 size_t s = offsets.size();
393 res.resize(s);
394 for (size_t i = 0; i < s; ++i)
395 res[i] = i;
396
397 if (limit >= s)
398 limit = 0;
399
400 if (limit)
401 {
402 if (reverse)
403 std::partial_sort(res.begin(), res.begin() + limit, res.end(), lessWithCollation<false>(*this, collator));
404 else
405 std::partial_sort(res.begin(), res.begin() + limit, res.end(), lessWithCollation<true>(*this, collator));
406 }
407 else
408 {
409 if (reverse)
410 std::sort(res.begin(), res.end(), lessWithCollation<false>(*this, collator));
411 else
412 std::sort(res.begin(), res.end(), lessWithCollation<true>(*this, collator));
413 }
414}
415
416
417void ColumnString::protect()
418{
419 getChars().protect();
420 getOffsets().protect();
421}
422
423}
424