1 | #include <Core/Defines.h> |
2 | #include <Common/Arena.h> |
3 | #include <Common/memcmpSmall.h> |
4 | #include <Common/assert_cast.h> |
5 | #include <Columns/Collator.h> |
6 | #include <Columns/ColumnString.h> |
7 | #include <Columns/ColumnsCommon.h> |
8 | #include <DataStreams/ColumnGathererStream.h> |
9 | |
10 | #include <common/unaligned.h> |
11 | |
12 | |
13 | namespace DB |
14 | { |
15 | |
16 | namespace ErrorCodes |
17 | { |
18 | extern const int PARAMETER_OUT_OF_BOUND; |
19 | extern const int SIZES_OF_COLUMNS_DOESNT_MATCH; |
20 | } |
21 | |
22 | |
23 | MutableColumnPtr ColumnString::cloneResized(size_t to_size) const |
24 | { |
25 | auto res = ColumnString::create(); |
26 | |
27 | if (to_size == 0) |
28 | return res; |
29 | |
30 | size_t from_size = size(); |
31 | |
32 | if (to_size <= from_size) |
33 | { |
34 | /// Just cut column. |
35 | |
36 | res->offsets.assign(offsets.begin(), offsets.begin() + to_size); |
37 | res->chars.assign(chars.begin(), chars.begin() + offsets[to_size - 1]); |
38 | } |
39 | else |
40 | { |
41 | /// Copy column and append empty strings for extra elements. |
42 | |
43 | Offset offset = 0; |
44 | if (from_size > 0) |
45 | { |
46 | res->offsets.assign(offsets.begin(), offsets.end()); |
47 | res->chars.assign(chars.begin(), chars.end()); |
48 | offset = offsets.back(); |
49 | } |
50 | |
51 | /// Empty strings are just zero terminating bytes. |
52 | |
53 | res->chars.resize_fill(res->chars.size() + to_size - from_size); |
54 | |
55 | res->offsets.resize(to_size); |
56 | for (size_t i = from_size; i < to_size; ++i) |
57 | { |
58 | ++offset; |
59 | res->offsets[i] = offset; |
60 | } |
61 | } |
62 | |
63 | return res; |
64 | } |
65 | |
66 | |
67 | void ColumnString::insertRangeFrom(const IColumn & src, size_t start, size_t length) |
68 | { |
69 | if (length == 0) |
70 | return; |
71 | |
72 | const ColumnString & src_concrete = assert_cast<const ColumnString &>(src); |
73 | |
74 | if (start + length > src_concrete.offsets.size()) |
75 | throw Exception("Parameter out of bound in IColumnString::insertRangeFrom method." , |
76 | ErrorCodes::PARAMETER_OUT_OF_BOUND); |
77 | |
78 | size_t nested_offset = src_concrete.offsetAt(start); |
79 | size_t nested_length = src_concrete.offsets[start + length - 1] - nested_offset; |
80 | |
81 | size_t old_chars_size = chars.size(); |
82 | chars.resize(old_chars_size + nested_length); |
83 | memcpy(&chars[old_chars_size], &src_concrete.chars[nested_offset], nested_length); |
84 | |
85 | if (start == 0 && offsets.empty()) |
86 | { |
87 | offsets.assign(src_concrete.offsets.begin(), src_concrete.offsets.begin() + length); |
88 | } |
89 | else |
90 | { |
91 | size_t old_size = offsets.size(); |
92 | size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see PaddedPODArray |
93 | offsets.resize(old_size + length); |
94 | |
95 | for (size_t i = 0; i < length; ++i) |
96 | offsets[old_size + i] = src_concrete.offsets[start + i] - nested_offset + prev_max_offset; |
97 | } |
98 | } |
99 | |
100 | |
101 | ColumnPtr ColumnString::filter(const Filter & filt, ssize_t result_size_hint) const |
102 | { |
103 | if (offsets.size() == 0) |
104 | return ColumnString::create(); |
105 | |
106 | auto res = ColumnString::create(); |
107 | |
108 | Chars & res_chars = res->chars; |
109 | Offsets & res_offsets = res->offsets; |
110 | |
111 | filterArraysImpl<UInt8>(chars, offsets, res_chars, res_offsets, filt, result_size_hint); |
112 | return res; |
113 | } |
114 | |
115 | |
116 | ColumnPtr ColumnString::permute(const Permutation & perm, size_t limit) const |
117 | { |
118 | size_t size = offsets.size(); |
119 | |
120 | if (limit == 0) |
121 | limit = size; |
122 | else |
123 | limit = std::min(size, limit); |
124 | |
125 | if (perm.size() < limit) |
126 | throw Exception("Size of permutation is less than required." , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); |
127 | |
128 | if (limit == 0) |
129 | return ColumnString::create(); |
130 | |
131 | auto res = ColumnString::create(); |
132 | |
133 | Chars & res_chars = res->chars; |
134 | Offsets & res_offsets = res->offsets; |
135 | |
136 | if (limit == size) |
137 | res_chars.resize(chars.size()); |
138 | else |
139 | { |
140 | size_t new_chars_size = 0; |
141 | for (size_t i = 0; i < limit; ++i) |
142 | new_chars_size += sizeAt(perm[i]); |
143 | res_chars.resize(new_chars_size); |
144 | } |
145 | |
146 | res_offsets.resize(limit); |
147 | |
148 | Offset current_new_offset = 0; |
149 | |
150 | for (size_t i = 0; i < limit; ++i) |
151 | { |
152 | size_t j = perm[i]; |
153 | size_t string_offset = offsets[j - 1]; |
154 | size_t string_size = offsets[j] - string_offset; |
155 | |
156 | memcpySmallAllowReadWriteOverflow15(&res_chars[current_new_offset], &chars[string_offset], string_size); |
157 | |
158 | current_new_offset += string_size; |
159 | res_offsets[i] = current_new_offset; |
160 | } |
161 | |
162 | return res; |
163 | } |
164 | |
165 | |
166 | StringRef ColumnString::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const |
167 | { |
168 | size_t string_size = sizeAt(n); |
169 | size_t offset = offsetAt(n); |
170 | |
171 | StringRef res; |
172 | res.size = sizeof(string_size) + string_size; |
173 | char * pos = arena.allocContinue(res.size, begin); |
174 | memcpy(pos, &string_size, sizeof(string_size)); |
175 | memcpy(pos + sizeof(string_size), &chars[offset], string_size); |
176 | res.data = pos; |
177 | |
178 | return res; |
179 | } |
180 | |
181 | const char * ColumnString::deserializeAndInsertFromArena(const char * pos) |
182 | { |
183 | const size_t string_size = unalignedLoad<size_t>(pos); |
184 | pos += sizeof(string_size); |
185 | |
186 | const size_t old_size = chars.size(); |
187 | const size_t new_size = old_size + string_size; |
188 | chars.resize(new_size); |
189 | memcpy(chars.data() + old_size, pos, string_size); |
190 | |
191 | offsets.push_back(new_size); |
192 | return pos + string_size; |
193 | } |
194 | |
195 | |
196 | ColumnPtr ColumnString::index(const IColumn & indexes, size_t limit) const |
197 | { |
198 | return selectIndexImpl(*this, indexes, limit); |
199 | } |
200 | |
201 | template <typename Type> |
202 | ColumnPtr ColumnString::indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const |
203 | { |
204 | if (limit == 0) |
205 | return ColumnString::create(); |
206 | |
207 | auto res = ColumnString::create(); |
208 | |
209 | Chars & res_chars = res->chars; |
210 | Offsets & res_offsets = res->offsets; |
211 | |
212 | size_t new_chars_size = 0; |
213 | for (size_t i = 0; i < limit; ++i) |
214 | new_chars_size += sizeAt(indexes[i]); |
215 | res_chars.resize(new_chars_size); |
216 | |
217 | res_offsets.resize(limit); |
218 | |
219 | Offset current_new_offset = 0; |
220 | |
221 | for (size_t i = 0; i < limit; ++i) |
222 | { |
223 | size_t j = indexes[i]; |
224 | size_t string_offset = offsets[j - 1]; |
225 | size_t string_size = offsets[j] - string_offset; |
226 | |
227 | memcpySmallAllowReadWriteOverflow15(&res_chars[current_new_offset], &chars[string_offset], string_size); |
228 | |
229 | current_new_offset += string_size; |
230 | res_offsets[i] = current_new_offset; |
231 | } |
232 | |
233 | return res; |
234 | } |
235 | |
236 | |
237 | template <bool positive> |
238 | struct ColumnString::less |
239 | { |
240 | const ColumnString & parent; |
241 | explicit less(const ColumnString & parent_) : parent(parent_) {} |
242 | bool operator()(size_t lhs, size_t rhs) const |
243 | { |
244 | int res = memcmpSmallAllowOverflow15( |
245 | parent.chars.data() + parent.offsetAt(lhs), parent.sizeAt(lhs) - 1, |
246 | parent.chars.data() + parent.offsetAt(rhs), parent.sizeAt(rhs) - 1); |
247 | |
248 | return positive ? (res < 0) : (res > 0); |
249 | } |
250 | }; |
251 | |
252 | void ColumnString::getPermutation(bool reverse, size_t limit, int /*nan_direction_hint*/, Permutation & res) const |
253 | { |
254 | size_t s = offsets.size(); |
255 | res.resize(s); |
256 | for (size_t i = 0; i < s; ++i) |
257 | res[i] = i; |
258 | |
259 | if (limit >= s) |
260 | limit = 0; |
261 | |
262 | if (limit) |
263 | { |
264 | if (reverse) |
265 | std::partial_sort(res.begin(), res.begin() + limit, res.end(), less<false>(*this)); |
266 | else |
267 | std::partial_sort(res.begin(), res.begin() + limit, res.end(), less<true>(*this)); |
268 | } |
269 | else |
270 | { |
271 | if (reverse) |
272 | std::sort(res.begin(), res.end(), less<false>(*this)); |
273 | else |
274 | std::sort(res.begin(), res.end(), less<true>(*this)); |
275 | } |
276 | } |
277 | |
278 | |
279 | ColumnPtr ColumnString::replicate(const Offsets & replicate_offsets) const |
280 | { |
281 | size_t col_size = size(); |
282 | if (col_size != replicate_offsets.size()) |
283 | throw Exception("Size of offsets doesn't match size of column." , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); |
284 | |
285 | auto res = ColumnString::create(); |
286 | |
287 | if (0 == col_size) |
288 | return res; |
289 | |
290 | Chars & res_chars = res->chars; |
291 | Offsets & res_offsets = res->offsets; |
292 | res_chars.reserve(chars.size() / col_size * replicate_offsets.back()); |
293 | res_offsets.reserve(replicate_offsets.back()); |
294 | |
295 | Offset prev_replicate_offset = 0; |
296 | Offset prev_string_offset = 0; |
297 | Offset current_new_offset = 0; |
298 | |
299 | for (size_t i = 0; i < col_size; ++i) |
300 | { |
301 | size_t size_to_replicate = replicate_offsets[i] - prev_replicate_offset; |
302 | size_t string_size = offsets[i] - prev_string_offset; |
303 | |
304 | for (size_t j = 0; j < size_to_replicate; ++j) |
305 | { |
306 | current_new_offset += string_size; |
307 | res_offsets.push_back(current_new_offset); |
308 | |
309 | res_chars.resize(res_chars.size() + string_size); |
310 | memcpySmallAllowReadWriteOverflow15( |
311 | &res_chars[res_chars.size() - string_size], &chars[prev_string_offset], string_size); |
312 | } |
313 | |
314 | prev_replicate_offset = replicate_offsets[i]; |
315 | prev_string_offset = offsets[i]; |
316 | } |
317 | |
318 | return res; |
319 | } |
320 | |
321 | |
322 | void ColumnString::gather(ColumnGathererStream & gatherer) |
323 | { |
324 | gatherer.gather(*this); |
325 | } |
326 | |
327 | |
328 | void ColumnString::reserve(size_t n) |
329 | { |
330 | offsets.reserve(n); |
331 | } |
332 | |
333 | |
334 | void ColumnString::getExtremes(Field & min, Field & max) const |
335 | { |
336 | min = String(); |
337 | max = String(); |
338 | |
339 | size_t col_size = size(); |
340 | |
341 | if (col_size == 0) |
342 | return; |
343 | |
344 | size_t min_idx = 0; |
345 | size_t max_idx = 0; |
346 | |
347 | less<true> less_op(*this); |
348 | |
349 | for (size_t i = 1; i < col_size; ++i) |
350 | { |
351 | if (less_op(i, min_idx)) |
352 | min_idx = i; |
353 | else if (less_op(max_idx, i)) |
354 | max_idx = i; |
355 | } |
356 | |
357 | get(min_idx, min); |
358 | get(max_idx, max); |
359 | } |
360 | |
361 | |
362 | int ColumnString::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs_, const Collator & collator) const |
363 | { |
364 | const ColumnString & rhs = assert_cast<const ColumnString &>(rhs_); |
365 | |
366 | return collator.compare( |
367 | reinterpret_cast<const char *>(&chars[offsetAt(n)]), sizeAt(n), |
368 | reinterpret_cast<const char *>(&rhs.chars[rhs.offsetAt(m)]), rhs.sizeAt(m)); |
369 | } |
370 | |
371 | |
372 | template <bool positive> |
373 | struct ColumnString::lessWithCollation |
374 | { |
375 | const ColumnString & parent; |
376 | const Collator & collator; |
377 | |
378 | lessWithCollation(const ColumnString & parent_, const Collator & collator_) : parent(parent_), collator(collator_) {} |
379 | |
380 | bool operator()(size_t lhs, size_t rhs) const |
381 | { |
382 | int res = collator.compare( |
383 | reinterpret_cast<const char *>(&parent.chars[parent.offsetAt(lhs)]), parent.sizeAt(lhs), |
384 | reinterpret_cast<const char *>(&parent.chars[parent.offsetAt(rhs)]), parent.sizeAt(rhs)); |
385 | |
386 | return positive ? (res < 0) : (res > 0); |
387 | } |
388 | }; |
389 | |
390 | void ColumnString::getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, Permutation & res) const |
391 | { |
392 | size_t s = offsets.size(); |
393 | res.resize(s); |
394 | for (size_t i = 0; i < s; ++i) |
395 | res[i] = i; |
396 | |
397 | if (limit >= s) |
398 | limit = 0; |
399 | |
400 | if (limit) |
401 | { |
402 | if (reverse) |
403 | std::partial_sort(res.begin(), res.begin() + limit, res.end(), lessWithCollation<false>(*this, collator)); |
404 | else |
405 | std::partial_sort(res.begin(), res.begin() + limit, res.end(), lessWithCollation<true>(*this, collator)); |
406 | } |
407 | else |
408 | { |
409 | if (reverse) |
410 | std::sort(res.begin(), res.end(), lessWithCollation<false>(*this, collator)); |
411 | else |
412 | std::sort(res.begin(), res.end(), lessWithCollation<true>(*this, collator)); |
413 | } |
414 | } |
415 | |
416 | |
417 | void ColumnString::protect() |
418 | { |
419 | getChars().protect(); |
420 | getOffsets().protect(); |
421 | } |
422 | |
423 | } |
424 | |