1 | #include <Columns/ColumnFixedString.h> |
2 | #include <Columns/ColumnsCommon.h> |
3 | |
4 | #include <Common/Arena.h> |
5 | #include <Common/SipHash.h> |
6 | #include <Common/memcpySmall.h> |
7 | #include <Common/memcmpSmall.h> |
8 | #include <Common/assert_cast.h> |
9 | |
10 | #include <DataStreams/ColumnGathererStream.h> |
11 | |
12 | #include <IO/WriteHelpers.h> |
13 | |
14 | #ifdef __SSE2__ |
15 | #include <emmintrin.h> |
16 | #endif |
17 | |
18 | |
19 | namespace DB |
20 | { |
21 | |
22 | namespace ErrorCodes |
23 | { |
24 | extern const int TOO_LARGE_STRING_SIZE; |
25 | extern const int SIZE_OF_FIXED_STRING_DOESNT_MATCH; |
26 | extern const int SIZES_OF_COLUMNS_DOESNT_MATCH; |
27 | extern const int PARAMETER_OUT_OF_BOUND; |
28 | } |
29 | |
30 | |
31 | MutableColumnPtr ColumnFixedString::cloneResized(size_t size) const |
32 | { |
33 | MutableColumnPtr new_col_holder = ColumnFixedString::create(n); |
34 | |
35 | if (size > 0) |
36 | { |
37 | auto & new_col = assert_cast<ColumnFixedString &>(*new_col_holder); |
38 | new_col.chars.resize(size * n); |
39 | |
40 | size_t count = std::min(this->size(), size); |
41 | memcpy(new_col.chars.data(), chars.data(), count * n * sizeof(chars[0])); |
42 | |
43 | if (size > count) |
44 | memset(&(new_col.chars[count * n]), '\0', (size - count) * n); |
45 | } |
46 | |
47 | return new_col_holder; |
48 | } |
49 | |
50 | void ColumnFixedString::insert(const Field & x) |
51 | { |
52 | const String & s = DB::get<const String &>(x); |
53 | |
54 | if (s.size() > n) |
55 | throw Exception("Too large string '" + s + "' for FixedString column" , ErrorCodes::TOO_LARGE_STRING_SIZE); |
56 | |
57 | size_t old_size = chars.size(); |
58 | chars.resize_fill(old_size + n); |
59 | memcpy(chars.data() + old_size, s.data(), s.size()); |
60 | } |
61 | |
62 | void ColumnFixedString::insertFrom(const IColumn & src_, size_t index) |
63 | { |
64 | const ColumnFixedString & src = assert_cast<const ColumnFixedString &>(src_); |
65 | |
66 | if (n != src.getN()) |
67 | throw Exception("Size of FixedString doesn't match" , ErrorCodes::SIZE_OF_FIXED_STRING_DOESNT_MATCH); |
68 | |
69 | size_t old_size = chars.size(); |
70 | chars.resize(old_size + n); |
71 | memcpySmallAllowReadWriteOverflow15(chars.data() + old_size, &src.chars[n * index], n); |
72 | } |
73 | |
74 | void ColumnFixedString::insertData(const char * pos, size_t length) |
75 | { |
76 | if (length > n) |
77 | throw Exception("Too large string for FixedString column" , ErrorCodes::TOO_LARGE_STRING_SIZE); |
78 | |
79 | size_t old_size = chars.size(); |
80 | chars.resize_fill(old_size + n); |
81 | memcpy(chars.data() + old_size, pos, length); |
82 | } |
83 | |
84 | StringRef ColumnFixedString::serializeValueIntoArena(size_t index, Arena & arena, char const *& begin) const |
85 | { |
86 | auto pos = arena.allocContinue(n, begin); |
87 | memcpy(pos, &chars[n * index], n); |
88 | return StringRef(pos, n); |
89 | } |
90 | |
91 | const char * ColumnFixedString::deserializeAndInsertFromArena(const char * pos) |
92 | { |
93 | size_t old_size = chars.size(); |
94 | chars.resize(old_size + n); |
95 | memcpy(chars.data() + old_size, pos, n); |
96 | return pos + n; |
97 | } |
98 | |
99 | void ColumnFixedString::updateHashWithValue(size_t index, SipHash & hash) const |
100 | { |
101 | hash.update(reinterpret_cast<const char *>(&chars[n * index]), n); |
102 | } |
103 | |
104 | template <bool positive> |
105 | struct ColumnFixedString::less |
106 | { |
107 | const ColumnFixedString & parent; |
108 | explicit less(const ColumnFixedString & parent_) : parent(parent_) {} |
109 | bool operator()(size_t lhs, size_t rhs) const |
110 | { |
111 | int res = memcmpSmallAllowOverflow15(parent.chars.data() + lhs * parent.n, parent.chars.data() + rhs * parent.n, parent.n); |
112 | return positive ? (res < 0) : (res > 0); |
113 | } |
114 | }; |
115 | |
116 | void ColumnFixedString::getPermutation(bool reverse, size_t limit, int /*nan_direction_hint*/, Permutation & res) const |
117 | { |
118 | size_t s = size(); |
119 | res.resize(s); |
120 | for (size_t i = 0; i < s; ++i) |
121 | res[i] = i; |
122 | |
123 | if (limit >= s) |
124 | limit = 0; |
125 | |
126 | if (limit) |
127 | { |
128 | if (reverse) |
129 | std::partial_sort(res.begin(), res.begin() + limit, res.end(), less<false>(*this)); |
130 | else |
131 | std::partial_sort(res.begin(), res.begin() + limit, res.end(), less<true>(*this)); |
132 | } |
133 | else |
134 | { |
135 | if (reverse) |
136 | std::sort(res.begin(), res.end(), less<false>(*this)); |
137 | else |
138 | std::sort(res.begin(), res.end(), less<true>(*this)); |
139 | } |
140 | } |
141 | |
142 | void ColumnFixedString::insertRangeFrom(const IColumn & src, size_t start, size_t length) |
143 | { |
144 | const ColumnFixedString & src_concrete = assert_cast<const ColumnFixedString &>(src); |
145 | |
146 | if (start + length > src_concrete.size()) |
147 | throw Exception("Parameters start = " |
148 | + toString(start) + ", length = " |
149 | + toString(length) + " are out of bound in ColumnFixedString::insertRangeFrom method" |
150 | " (size() = " + toString(src_concrete.size()) + ")." , |
151 | ErrorCodes::PARAMETER_OUT_OF_BOUND); |
152 | |
153 | size_t old_size = chars.size(); |
154 | chars.resize(old_size + length * n); |
155 | memcpy(chars.data() + old_size, &src_concrete.chars[start * n], length * n); |
156 | } |
157 | |
158 | ColumnPtr ColumnFixedString::filter(const IColumn::Filter & filt, ssize_t result_size_hint) const |
159 | { |
160 | size_t col_size = size(); |
161 | if (col_size != filt.size()) |
162 | throw Exception("Size of filter doesn't match size of column." , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); |
163 | |
164 | auto res = ColumnFixedString::create(n); |
165 | |
166 | if (result_size_hint) |
167 | res->chars.reserve(result_size_hint > 0 ? result_size_hint * n : chars.size()); |
168 | |
169 | const UInt8 * filt_pos = filt.data(); |
170 | const UInt8 * filt_end = filt_pos + col_size; |
171 | const UInt8 * data_pos = chars.data(); |
172 | |
173 | #ifdef __SSE2__ |
174 | /** A slightly more optimized version. |
175 | * Based on the assumption that often pieces of consecutive values |
176 | * completely pass or do not pass the filter. |
177 | * Therefore, we will optimistically check the parts of `SIMD_BYTES` values. |
178 | */ |
179 | |
180 | static constexpr size_t SIMD_BYTES = 16; |
181 | const __m128i zero16 = _mm_setzero_si128(); |
182 | const UInt8 * filt_end_sse = filt_pos + col_size / SIMD_BYTES * SIMD_BYTES; |
183 | const size_t chars_per_simd_elements = SIMD_BYTES * n; |
184 | |
185 | while (filt_pos < filt_end_sse) |
186 | { |
187 | int mask = _mm_movemask_epi8(_mm_cmpgt_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i *>(filt_pos)), zero16)); |
188 | |
189 | if (0 == mask) |
190 | { |
191 | /// Nothing is inserted. |
192 | data_pos += chars_per_simd_elements; |
193 | } |
194 | else if (0xFFFF == mask) |
195 | { |
196 | res->chars.insert(data_pos, data_pos + chars_per_simd_elements); |
197 | data_pos += chars_per_simd_elements; |
198 | } |
199 | else |
200 | { |
201 | size_t res_chars_size = res->chars.size(); |
202 | for (size_t i = 0; i < SIMD_BYTES; ++i) |
203 | { |
204 | if (filt_pos[i]) |
205 | { |
206 | res->chars.resize(res_chars_size + n); |
207 | memcpySmallAllowReadWriteOverflow15(&res->chars[res_chars_size], data_pos, n); |
208 | res_chars_size += n; |
209 | } |
210 | data_pos += n; |
211 | } |
212 | } |
213 | |
214 | filt_pos += SIMD_BYTES; |
215 | } |
216 | #endif |
217 | |
218 | size_t res_chars_size = res->chars.size(); |
219 | while (filt_pos < filt_end) |
220 | { |
221 | if (*filt_pos) |
222 | { |
223 | res->chars.resize(res_chars_size + n); |
224 | memcpySmallAllowReadWriteOverflow15(&res->chars[res_chars_size], data_pos, n); |
225 | res_chars_size += n; |
226 | } |
227 | |
228 | ++filt_pos; |
229 | data_pos += n; |
230 | } |
231 | |
232 | return res; |
233 | } |
234 | |
235 | ColumnPtr ColumnFixedString::permute(const Permutation & perm, size_t limit) const |
236 | { |
237 | size_t col_size = size(); |
238 | |
239 | if (limit == 0) |
240 | limit = col_size; |
241 | else |
242 | limit = std::min(col_size, limit); |
243 | |
244 | if (perm.size() < limit) |
245 | throw Exception("Size of permutation is less than required." , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); |
246 | |
247 | if (limit == 0) |
248 | return ColumnFixedString::create(n); |
249 | |
250 | auto res = ColumnFixedString::create(n); |
251 | |
252 | Chars & res_chars = res->chars; |
253 | |
254 | res_chars.resize(n * limit); |
255 | |
256 | size_t offset = 0; |
257 | for (size_t i = 0; i < limit; ++i, offset += n) |
258 | memcpySmallAllowReadWriteOverflow15(&res_chars[offset], &chars[perm[i] * n], n); |
259 | |
260 | return res; |
261 | } |
262 | |
263 | |
264 | ColumnPtr ColumnFixedString::index(const IColumn & indexes, size_t limit) const |
265 | { |
266 | return selectIndexImpl(*this, indexes, limit); |
267 | } |
268 | |
269 | |
270 | template <typename Type> |
271 | ColumnPtr ColumnFixedString::indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const |
272 | { |
273 | if (limit == 0) |
274 | return ColumnFixedString::create(n); |
275 | |
276 | auto res = ColumnFixedString::create(n); |
277 | |
278 | Chars & res_chars = res->chars; |
279 | |
280 | res_chars.resize(n * limit); |
281 | |
282 | size_t offset = 0; |
283 | for (size_t i = 0; i < limit; ++i, offset += n) |
284 | memcpySmallAllowReadWriteOverflow15(&res_chars[offset], &chars[indexes[i] * n], n); |
285 | |
286 | return res; |
287 | } |
288 | |
289 | ColumnPtr ColumnFixedString::replicate(const Offsets & offsets) const |
290 | { |
291 | size_t col_size = size(); |
292 | if (col_size != offsets.size()) |
293 | throw Exception("Size of offsets doesn't match size of column." , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); |
294 | |
295 | auto res = ColumnFixedString::create(n); |
296 | |
297 | if (0 == col_size) |
298 | return res; |
299 | |
300 | Chars & res_chars = res->chars; |
301 | res_chars.resize(n * offsets.back()); |
302 | |
303 | Offset curr_offset = 0; |
304 | for (size_t i = 0; i < col_size; ++i) |
305 | for (size_t next_offset = offsets[i]; curr_offset < next_offset; ++curr_offset) |
306 | memcpySmallAllowReadWriteOverflow15(&res->chars[curr_offset * n], &chars[i * n], n); |
307 | |
308 | return res; |
309 | } |
310 | |
311 | void ColumnFixedString::gather(ColumnGathererStream & gatherer) |
312 | { |
313 | gatherer.gather(*this); |
314 | } |
315 | |
316 | void ColumnFixedString::getExtremes(Field & min, Field & max) const |
317 | { |
318 | min = String(); |
319 | max = String(); |
320 | |
321 | size_t col_size = size(); |
322 | |
323 | if (col_size == 0) |
324 | return; |
325 | |
326 | size_t min_idx = 0; |
327 | size_t max_idx = 0; |
328 | |
329 | less<true> less_op(*this); |
330 | |
331 | for (size_t i = 1; i < col_size; ++i) |
332 | { |
333 | if (less_op(i, min_idx)) |
334 | min_idx = i; |
335 | else if (less_op(max_idx, i)) |
336 | max_idx = i; |
337 | } |
338 | |
339 | get(min_idx, min); |
340 | get(max_idx, max); |
341 | } |
342 | |
343 | } |
344 | |