| 1 | #include <Core/Defines.h> |
| 2 | #include <Common/Arena.h> |
| 3 | #include <Common/memcmpSmall.h> |
| 4 | #include <Common/assert_cast.h> |
| 5 | #include <Columns/Collator.h> |
| 6 | #include <Columns/ColumnString.h> |
| 7 | #include <Columns/ColumnsCommon.h> |
| 8 | #include <DataStreams/ColumnGathererStream.h> |
| 9 | |
| 10 | #include <common/unaligned.h> |
| 11 | |
| 12 | |
| 13 | namespace DB |
| 14 | { |
| 15 | |
| 16 | namespace ErrorCodes |
| 17 | { |
| 18 | extern const int PARAMETER_OUT_OF_BOUND; |
| 19 | extern const int SIZES_OF_COLUMNS_DOESNT_MATCH; |
| 20 | } |
| 21 | |
| 22 | |
| 23 | MutableColumnPtr ColumnString::cloneResized(size_t to_size) const |
| 24 | { |
| 25 | auto res = ColumnString::create(); |
| 26 | |
| 27 | if (to_size == 0) |
| 28 | return res; |
| 29 | |
| 30 | size_t from_size = size(); |
| 31 | |
| 32 | if (to_size <= from_size) |
| 33 | { |
| 34 | /// Just cut column. |
| 35 | |
| 36 | res->offsets.assign(offsets.begin(), offsets.begin() + to_size); |
| 37 | res->chars.assign(chars.begin(), chars.begin() + offsets[to_size - 1]); |
| 38 | } |
| 39 | else |
| 40 | { |
| 41 | /// Copy column and append empty strings for extra elements. |
| 42 | |
| 43 | Offset offset = 0; |
| 44 | if (from_size > 0) |
| 45 | { |
| 46 | res->offsets.assign(offsets.begin(), offsets.end()); |
| 47 | res->chars.assign(chars.begin(), chars.end()); |
| 48 | offset = offsets.back(); |
| 49 | } |
| 50 | |
| 51 | /// Empty strings are just zero terminating bytes. |
| 52 | |
| 53 | res->chars.resize_fill(res->chars.size() + to_size - from_size); |
| 54 | |
| 55 | res->offsets.resize(to_size); |
| 56 | for (size_t i = from_size; i < to_size; ++i) |
| 57 | { |
| 58 | ++offset; |
| 59 | res->offsets[i] = offset; |
| 60 | } |
| 61 | } |
| 62 | |
| 63 | return res; |
| 64 | } |
| 65 | |
| 66 | |
| 67 | void ColumnString::insertRangeFrom(const IColumn & src, size_t start, size_t length) |
| 68 | { |
| 69 | if (length == 0) |
| 70 | return; |
| 71 | |
| 72 | const ColumnString & src_concrete = assert_cast<const ColumnString &>(src); |
| 73 | |
| 74 | if (start + length > src_concrete.offsets.size()) |
| 75 | throw Exception("Parameter out of bound in IColumnString::insertRangeFrom method." , |
| 76 | ErrorCodes::PARAMETER_OUT_OF_BOUND); |
| 77 | |
| 78 | size_t nested_offset = src_concrete.offsetAt(start); |
| 79 | size_t nested_length = src_concrete.offsets[start + length - 1] - nested_offset; |
| 80 | |
| 81 | size_t old_chars_size = chars.size(); |
| 82 | chars.resize(old_chars_size + nested_length); |
| 83 | memcpy(&chars[old_chars_size], &src_concrete.chars[nested_offset], nested_length); |
| 84 | |
| 85 | if (start == 0 && offsets.empty()) |
| 86 | { |
| 87 | offsets.assign(src_concrete.offsets.begin(), src_concrete.offsets.begin() + length); |
| 88 | } |
| 89 | else |
| 90 | { |
| 91 | size_t old_size = offsets.size(); |
| 92 | size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see PaddedPODArray |
| 93 | offsets.resize(old_size + length); |
| 94 | |
| 95 | for (size_t i = 0; i < length; ++i) |
| 96 | offsets[old_size + i] = src_concrete.offsets[start + i] - nested_offset + prev_max_offset; |
| 97 | } |
| 98 | } |
| 99 | |
| 100 | |
| 101 | ColumnPtr ColumnString::filter(const Filter & filt, ssize_t result_size_hint) const |
| 102 | { |
| 103 | if (offsets.size() == 0) |
| 104 | return ColumnString::create(); |
| 105 | |
| 106 | auto res = ColumnString::create(); |
| 107 | |
| 108 | Chars & res_chars = res->chars; |
| 109 | Offsets & res_offsets = res->offsets; |
| 110 | |
| 111 | filterArraysImpl<UInt8>(chars, offsets, res_chars, res_offsets, filt, result_size_hint); |
| 112 | return res; |
| 113 | } |
| 114 | |
| 115 | |
| 116 | ColumnPtr ColumnString::permute(const Permutation & perm, size_t limit) const |
| 117 | { |
| 118 | size_t size = offsets.size(); |
| 119 | |
| 120 | if (limit == 0) |
| 121 | limit = size; |
| 122 | else |
| 123 | limit = std::min(size, limit); |
| 124 | |
| 125 | if (perm.size() < limit) |
| 126 | throw Exception("Size of permutation is less than required." , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); |
| 127 | |
| 128 | if (limit == 0) |
| 129 | return ColumnString::create(); |
| 130 | |
| 131 | auto res = ColumnString::create(); |
| 132 | |
| 133 | Chars & res_chars = res->chars; |
| 134 | Offsets & res_offsets = res->offsets; |
| 135 | |
| 136 | if (limit == size) |
| 137 | res_chars.resize(chars.size()); |
| 138 | else |
| 139 | { |
| 140 | size_t new_chars_size = 0; |
| 141 | for (size_t i = 0; i < limit; ++i) |
| 142 | new_chars_size += sizeAt(perm[i]); |
| 143 | res_chars.resize(new_chars_size); |
| 144 | } |
| 145 | |
| 146 | res_offsets.resize(limit); |
| 147 | |
| 148 | Offset current_new_offset = 0; |
| 149 | |
| 150 | for (size_t i = 0; i < limit; ++i) |
| 151 | { |
| 152 | size_t j = perm[i]; |
| 153 | size_t string_offset = offsets[j - 1]; |
| 154 | size_t string_size = offsets[j] - string_offset; |
| 155 | |
| 156 | memcpySmallAllowReadWriteOverflow15(&res_chars[current_new_offset], &chars[string_offset], string_size); |
| 157 | |
| 158 | current_new_offset += string_size; |
| 159 | res_offsets[i] = current_new_offset; |
| 160 | } |
| 161 | |
| 162 | return res; |
| 163 | } |
| 164 | |
| 165 | |
| 166 | StringRef ColumnString::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const |
| 167 | { |
| 168 | size_t string_size = sizeAt(n); |
| 169 | size_t offset = offsetAt(n); |
| 170 | |
| 171 | StringRef res; |
| 172 | res.size = sizeof(string_size) + string_size; |
| 173 | char * pos = arena.allocContinue(res.size, begin); |
| 174 | memcpy(pos, &string_size, sizeof(string_size)); |
| 175 | memcpy(pos + sizeof(string_size), &chars[offset], string_size); |
| 176 | res.data = pos; |
| 177 | |
| 178 | return res; |
| 179 | } |
| 180 | |
| 181 | const char * ColumnString::deserializeAndInsertFromArena(const char * pos) |
| 182 | { |
| 183 | const size_t string_size = unalignedLoad<size_t>(pos); |
| 184 | pos += sizeof(string_size); |
| 185 | |
| 186 | const size_t old_size = chars.size(); |
| 187 | const size_t new_size = old_size + string_size; |
| 188 | chars.resize(new_size); |
| 189 | memcpy(chars.data() + old_size, pos, string_size); |
| 190 | |
| 191 | offsets.push_back(new_size); |
| 192 | return pos + string_size; |
| 193 | } |
| 194 | |
| 195 | |
| 196 | ColumnPtr ColumnString::index(const IColumn & indexes, size_t limit) const |
| 197 | { |
| 198 | return selectIndexImpl(*this, indexes, limit); |
| 199 | } |
| 200 | |
| 201 | template <typename Type> |
| 202 | ColumnPtr ColumnString::indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const |
| 203 | { |
| 204 | if (limit == 0) |
| 205 | return ColumnString::create(); |
| 206 | |
| 207 | auto res = ColumnString::create(); |
| 208 | |
| 209 | Chars & res_chars = res->chars; |
| 210 | Offsets & res_offsets = res->offsets; |
| 211 | |
| 212 | size_t new_chars_size = 0; |
| 213 | for (size_t i = 0; i < limit; ++i) |
| 214 | new_chars_size += sizeAt(indexes[i]); |
| 215 | res_chars.resize(new_chars_size); |
| 216 | |
| 217 | res_offsets.resize(limit); |
| 218 | |
| 219 | Offset current_new_offset = 0; |
| 220 | |
| 221 | for (size_t i = 0; i < limit; ++i) |
| 222 | { |
| 223 | size_t j = indexes[i]; |
| 224 | size_t string_offset = offsets[j - 1]; |
| 225 | size_t string_size = offsets[j] - string_offset; |
| 226 | |
| 227 | memcpySmallAllowReadWriteOverflow15(&res_chars[current_new_offset], &chars[string_offset], string_size); |
| 228 | |
| 229 | current_new_offset += string_size; |
| 230 | res_offsets[i] = current_new_offset; |
| 231 | } |
| 232 | |
| 233 | return res; |
| 234 | } |
| 235 | |
| 236 | |
| 237 | template <bool positive> |
| 238 | struct ColumnString::less |
| 239 | { |
| 240 | const ColumnString & parent; |
| 241 | explicit less(const ColumnString & parent_) : parent(parent_) {} |
| 242 | bool operator()(size_t lhs, size_t rhs) const |
| 243 | { |
| 244 | int res = memcmpSmallAllowOverflow15( |
| 245 | parent.chars.data() + parent.offsetAt(lhs), parent.sizeAt(lhs) - 1, |
| 246 | parent.chars.data() + parent.offsetAt(rhs), parent.sizeAt(rhs) - 1); |
| 247 | |
| 248 | return positive ? (res < 0) : (res > 0); |
| 249 | } |
| 250 | }; |
| 251 | |
| 252 | void ColumnString::getPermutation(bool reverse, size_t limit, int /*nan_direction_hint*/, Permutation & res) const |
| 253 | { |
| 254 | size_t s = offsets.size(); |
| 255 | res.resize(s); |
| 256 | for (size_t i = 0; i < s; ++i) |
| 257 | res[i] = i; |
| 258 | |
| 259 | if (limit >= s) |
| 260 | limit = 0; |
| 261 | |
| 262 | if (limit) |
| 263 | { |
| 264 | if (reverse) |
| 265 | std::partial_sort(res.begin(), res.begin() + limit, res.end(), less<false>(*this)); |
| 266 | else |
| 267 | std::partial_sort(res.begin(), res.begin() + limit, res.end(), less<true>(*this)); |
| 268 | } |
| 269 | else |
| 270 | { |
| 271 | if (reverse) |
| 272 | std::sort(res.begin(), res.end(), less<false>(*this)); |
| 273 | else |
| 274 | std::sort(res.begin(), res.end(), less<true>(*this)); |
| 275 | } |
| 276 | } |
| 277 | |
| 278 | |
| 279 | ColumnPtr ColumnString::replicate(const Offsets & replicate_offsets) const |
| 280 | { |
| 281 | size_t col_size = size(); |
| 282 | if (col_size != replicate_offsets.size()) |
| 283 | throw Exception("Size of offsets doesn't match size of column." , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); |
| 284 | |
| 285 | auto res = ColumnString::create(); |
| 286 | |
| 287 | if (0 == col_size) |
| 288 | return res; |
| 289 | |
| 290 | Chars & res_chars = res->chars; |
| 291 | Offsets & res_offsets = res->offsets; |
| 292 | res_chars.reserve(chars.size() / col_size * replicate_offsets.back()); |
| 293 | res_offsets.reserve(replicate_offsets.back()); |
| 294 | |
| 295 | Offset prev_replicate_offset = 0; |
| 296 | Offset prev_string_offset = 0; |
| 297 | Offset current_new_offset = 0; |
| 298 | |
| 299 | for (size_t i = 0; i < col_size; ++i) |
| 300 | { |
| 301 | size_t size_to_replicate = replicate_offsets[i] - prev_replicate_offset; |
| 302 | size_t string_size = offsets[i] - prev_string_offset; |
| 303 | |
| 304 | for (size_t j = 0; j < size_to_replicate; ++j) |
| 305 | { |
| 306 | current_new_offset += string_size; |
| 307 | res_offsets.push_back(current_new_offset); |
| 308 | |
| 309 | res_chars.resize(res_chars.size() + string_size); |
| 310 | memcpySmallAllowReadWriteOverflow15( |
| 311 | &res_chars[res_chars.size() - string_size], &chars[prev_string_offset], string_size); |
| 312 | } |
| 313 | |
| 314 | prev_replicate_offset = replicate_offsets[i]; |
| 315 | prev_string_offset = offsets[i]; |
| 316 | } |
| 317 | |
| 318 | return res; |
| 319 | } |
| 320 | |
| 321 | |
| 322 | void ColumnString::gather(ColumnGathererStream & gatherer) |
| 323 | { |
| 324 | gatherer.gather(*this); |
| 325 | } |
| 326 | |
| 327 | |
| 328 | void ColumnString::reserve(size_t n) |
| 329 | { |
| 330 | offsets.reserve(n); |
| 331 | } |
| 332 | |
| 333 | |
| 334 | void ColumnString::getExtremes(Field & min, Field & max) const |
| 335 | { |
| 336 | min = String(); |
| 337 | max = String(); |
| 338 | |
| 339 | size_t col_size = size(); |
| 340 | |
| 341 | if (col_size == 0) |
| 342 | return; |
| 343 | |
| 344 | size_t min_idx = 0; |
| 345 | size_t max_idx = 0; |
| 346 | |
| 347 | less<true> less_op(*this); |
| 348 | |
| 349 | for (size_t i = 1; i < col_size; ++i) |
| 350 | { |
| 351 | if (less_op(i, min_idx)) |
| 352 | min_idx = i; |
| 353 | else if (less_op(max_idx, i)) |
| 354 | max_idx = i; |
| 355 | } |
| 356 | |
| 357 | get(min_idx, min); |
| 358 | get(max_idx, max); |
| 359 | } |
| 360 | |
| 361 | |
| 362 | int ColumnString::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs_, const Collator & collator) const |
| 363 | { |
| 364 | const ColumnString & rhs = assert_cast<const ColumnString &>(rhs_); |
| 365 | |
| 366 | return collator.compare( |
| 367 | reinterpret_cast<const char *>(&chars[offsetAt(n)]), sizeAt(n), |
| 368 | reinterpret_cast<const char *>(&rhs.chars[rhs.offsetAt(m)]), rhs.sizeAt(m)); |
| 369 | } |
| 370 | |
| 371 | |
| 372 | template <bool positive> |
| 373 | struct ColumnString::lessWithCollation |
| 374 | { |
| 375 | const ColumnString & parent; |
| 376 | const Collator & collator; |
| 377 | |
| 378 | lessWithCollation(const ColumnString & parent_, const Collator & collator_) : parent(parent_), collator(collator_) {} |
| 379 | |
| 380 | bool operator()(size_t lhs, size_t rhs) const |
| 381 | { |
| 382 | int res = collator.compare( |
| 383 | reinterpret_cast<const char *>(&parent.chars[parent.offsetAt(lhs)]), parent.sizeAt(lhs), |
| 384 | reinterpret_cast<const char *>(&parent.chars[parent.offsetAt(rhs)]), parent.sizeAt(rhs)); |
| 385 | |
| 386 | return positive ? (res < 0) : (res > 0); |
| 387 | } |
| 388 | }; |
| 389 | |
| 390 | void ColumnString::getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, Permutation & res) const |
| 391 | { |
| 392 | size_t s = offsets.size(); |
| 393 | res.resize(s); |
| 394 | for (size_t i = 0; i < s; ++i) |
| 395 | res[i] = i; |
| 396 | |
| 397 | if (limit >= s) |
| 398 | limit = 0; |
| 399 | |
| 400 | if (limit) |
| 401 | { |
| 402 | if (reverse) |
| 403 | std::partial_sort(res.begin(), res.begin() + limit, res.end(), lessWithCollation<false>(*this, collator)); |
| 404 | else |
| 405 | std::partial_sort(res.begin(), res.begin() + limit, res.end(), lessWithCollation<true>(*this, collator)); |
| 406 | } |
| 407 | else |
| 408 | { |
| 409 | if (reverse) |
| 410 | std::sort(res.begin(), res.end(), lessWithCollation<false>(*this, collator)); |
| 411 | else |
| 412 | std::sort(res.begin(), res.end(), lessWithCollation<true>(*this, collator)); |
| 413 | } |
| 414 | } |
| 415 | |
| 416 | |
| 417 | void ColumnString::protect() |
| 418 | { |
| 419 | getChars().protect(); |
| 420 | getOffsets().protect(); |
| 421 | } |
| 422 | |
| 423 | } |
| 424 | |