| 1 | #include <Columns/ColumnFixedString.h> |
| 2 | #include <Columns/ColumnsCommon.h> |
| 3 | |
| 4 | #include <Common/Arena.h> |
| 5 | #include <Common/SipHash.h> |
| 6 | #include <Common/memcpySmall.h> |
| 7 | #include <Common/memcmpSmall.h> |
| 8 | #include <Common/assert_cast.h> |
| 9 | |
| 10 | #include <DataStreams/ColumnGathererStream.h> |
| 11 | |
| 12 | #include <IO/WriteHelpers.h> |
| 13 | |
| 14 | #ifdef __SSE2__ |
| 15 | #include <emmintrin.h> |
| 16 | #endif |
| 17 | |
| 18 | |
| 19 | namespace DB |
| 20 | { |
| 21 | |
| 22 | namespace ErrorCodes |
| 23 | { |
| 24 | extern const int TOO_LARGE_STRING_SIZE; |
| 25 | extern const int SIZE_OF_FIXED_STRING_DOESNT_MATCH; |
| 26 | extern const int SIZES_OF_COLUMNS_DOESNT_MATCH; |
| 27 | extern const int PARAMETER_OUT_OF_BOUND; |
| 28 | } |
| 29 | |
| 30 | |
| 31 | MutableColumnPtr ColumnFixedString::cloneResized(size_t size) const |
| 32 | { |
| 33 | MutableColumnPtr new_col_holder = ColumnFixedString::create(n); |
| 34 | |
| 35 | if (size > 0) |
| 36 | { |
| 37 | auto & new_col = assert_cast<ColumnFixedString &>(*new_col_holder); |
| 38 | new_col.chars.resize(size * n); |
| 39 | |
| 40 | size_t count = std::min(this->size(), size); |
| 41 | memcpy(new_col.chars.data(), chars.data(), count * n * sizeof(chars[0])); |
| 42 | |
| 43 | if (size > count) |
| 44 | memset(&(new_col.chars[count * n]), '\0', (size - count) * n); |
| 45 | } |
| 46 | |
| 47 | return new_col_holder; |
| 48 | } |
| 49 | |
| 50 | void ColumnFixedString::insert(const Field & x) |
| 51 | { |
| 52 | const String & s = DB::get<const String &>(x); |
| 53 | |
| 54 | if (s.size() > n) |
| 55 | throw Exception("Too large string '" + s + "' for FixedString column" , ErrorCodes::TOO_LARGE_STRING_SIZE); |
| 56 | |
| 57 | size_t old_size = chars.size(); |
| 58 | chars.resize_fill(old_size + n); |
| 59 | memcpy(chars.data() + old_size, s.data(), s.size()); |
| 60 | } |
| 61 | |
| 62 | void ColumnFixedString::insertFrom(const IColumn & src_, size_t index) |
| 63 | { |
| 64 | const ColumnFixedString & src = assert_cast<const ColumnFixedString &>(src_); |
| 65 | |
| 66 | if (n != src.getN()) |
| 67 | throw Exception("Size of FixedString doesn't match" , ErrorCodes::SIZE_OF_FIXED_STRING_DOESNT_MATCH); |
| 68 | |
| 69 | size_t old_size = chars.size(); |
| 70 | chars.resize(old_size + n); |
| 71 | memcpySmallAllowReadWriteOverflow15(chars.data() + old_size, &src.chars[n * index], n); |
| 72 | } |
| 73 | |
| 74 | void ColumnFixedString::insertData(const char * pos, size_t length) |
| 75 | { |
| 76 | if (length > n) |
| 77 | throw Exception("Too large string for FixedString column" , ErrorCodes::TOO_LARGE_STRING_SIZE); |
| 78 | |
| 79 | size_t old_size = chars.size(); |
| 80 | chars.resize_fill(old_size + n); |
| 81 | memcpy(chars.data() + old_size, pos, length); |
| 82 | } |
| 83 | |
| 84 | StringRef ColumnFixedString::serializeValueIntoArena(size_t index, Arena & arena, char const *& begin) const |
| 85 | { |
| 86 | auto pos = arena.allocContinue(n, begin); |
| 87 | memcpy(pos, &chars[n * index], n); |
| 88 | return StringRef(pos, n); |
| 89 | } |
| 90 | |
| 91 | const char * ColumnFixedString::deserializeAndInsertFromArena(const char * pos) |
| 92 | { |
| 93 | size_t old_size = chars.size(); |
| 94 | chars.resize(old_size + n); |
| 95 | memcpy(chars.data() + old_size, pos, n); |
| 96 | return pos + n; |
| 97 | } |
| 98 | |
| 99 | void ColumnFixedString::updateHashWithValue(size_t index, SipHash & hash) const |
| 100 | { |
| 101 | hash.update(reinterpret_cast<const char *>(&chars[n * index]), n); |
| 102 | } |
| 103 | |
| 104 | template <bool positive> |
| 105 | struct ColumnFixedString::less |
| 106 | { |
| 107 | const ColumnFixedString & parent; |
| 108 | explicit less(const ColumnFixedString & parent_) : parent(parent_) {} |
| 109 | bool operator()(size_t lhs, size_t rhs) const |
| 110 | { |
| 111 | int res = memcmpSmallAllowOverflow15(parent.chars.data() + lhs * parent.n, parent.chars.data() + rhs * parent.n, parent.n); |
| 112 | return positive ? (res < 0) : (res > 0); |
| 113 | } |
| 114 | }; |
| 115 | |
| 116 | void ColumnFixedString::getPermutation(bool reverse, size_t limit, int /*nan_direction_hint*/, Permutation & res) const |
| 117 | { |
| 118 | size_t s = size(); |
| 119 | res.resize(s); |
| 120 | for (size_t i = 0; i < s; ++i) |
| 121 | res[i] = i; |
| 122 | |
| 123 | if (limit >= s) |
| 124 | limit = 0; |
| 125 | |
| 126 | if (limit) |
| 127 | { |
| 128 | if (reverse) |
| 129 | std::partial_sort(res.begin(), res.begin() + limit, res.end(), less<false>(*this)); |
| 130 | else |
| 131 | std::partial_sort(res.begin(), res.begin() + limit, res.end(), less<true>(*this)); |
| 132 | } |
| 133 | else |
| 134 | { |
| 135 | if (reverse) |
| 136 | std::sort(res.begin(), res.end(), less<false>(*this)); |
| 137 | else |
| 138 | std::sort(res.begin(), res.end(), less<true>(*this)); |
| 139 | } |
| 140 | } |
| 141 | |
| 142 | void ColumnFixedString::insertRangeFrom(const IColumn & src, size_t start, size_t length) |
| 143 | { |
| 144 | const ColumnFixedString & src_concrete = assert_cast<const ColumnFixedString &>(src); |
| 145 | |
| 146 | if (start + length > src_concrete.size()) |
| 147 | throw Exception("Parameters start = " |
| 148 | + toString(start) + ", length = " |
| 149 | + toString(length) + " are out of bound in ColumnFixedString::insertRangeFrom method" |
| 150 | " (size() = " + toString(src_concrete.size()) + ")." , |
| 151 | ErrorCodes::PARAMETER_OUT_OF_BOUND); |
| 152 | |
| 153 | size_t old_size = chars.size(); |
| 154 | chars.resize(old_size + length * n); |
| 155 | memcpy(chars.data() + old_size, &src_concrete.chars[start * n], length * n); |
| 156 | } |
| 157 | |
| 158 | ColumnPtr ColumnFixedString::filter(const IColumn::Filter & filt, ssize_t result_size_hint) const |
| 159 | { |
| 160 | size_t col_size = size(); |
| 161 | if (col_size != filt.size()) |
| 162 | throw Exception("Size of filter doesn't match size of column." , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); |
| 163 | |
| 164 | auto res = ColumnFixedString::create(n); |
| 165 | |
| 166 | if (result_size_hint) |
| 167 | res->chars.reserve(result_size_hint > 0 ? result_size_hint * n : chars.size()); |
| 168 | |
| 169 | const UInt8 * filt_pos = filt.data(); |
| 170 | const UInt8 * filt_end = filt_pos + col_size; |
| 171 | const UInt8 * data_pos = chars.data(); |
| 172 | |
| 173 | #ifdef __SSE2__ |
| 174 | /** A slightly more optimized version. |
| 175 | * Based on the assumption that often pieces of consecutive values |
| 176 | * completely pass or do not pass the filter. |
| 177 | * Therefore, we will optimistically check the parts of `SIMD_BYTES` values. |
| 178 | */ |
| 179 | |
| 180 | static constexpr size_t SIMD_BYTES = 16; |
| 181 | const __m128i zero16 = _mm_setzero_si128(); |
| 182 | const UInt8 * filt_end_sse = filt_pos + col_size / SIMD_BYTES * SIMD_BYTES; |
| 183 | const size_t chars_per_simd_elements = SIMD_BYTES * n; |
| 184 | |
| 185 | while (filt_pos < filt_end_sse) |
| 186 | { |
| 187 | int mask = _mm_movemask_epi8(_mm_cmpgt_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i *>(filt_pos)), zero16)); |
| 188 | |
| 189 | if (0 == mask) |
| 190 | { |
| 191 | /// Nothing is inserted. |
| 192 | data_pos += chars_per_simd_elements; |
| 193 | } |
| 194 | else if (0xFFFF == mask) |
| 195 | { |
| 196 | res->chars.insert(data_pos, data_pos + chars_per_simd_elements); |
| 197 | data_pos += chars_per_simd_elements; |
| 198 | } |
| 199 | else |
| 200 | { |
| 201 | size_t res_chars_size = res->chars.size(); |
| 202 | for (size_t i = 0; i < SIMD_BYTES; ++i) |
| 203 | { |
| 204 | if (filt_pos[i]) |
| 205 | { |
| 206 | res->chars.resize(res_chars_size + n); |
| 207 | memcpySmallAllowReadWriteOverflow15(&res->chars[res_chars_size], data_pos, n); |
| 208 | res_chars_size += n; |
| 209 | } |
| 210 | data_pos += n; |
| 211 | } |
| 212 | } |
| 213 | |
| 214 | filt_pos += SIMD_BYTES; |
| 215 | } |
| 216 | #endif |
| 217 | |
| 218 | size_t res_chars_size = res->chars.size(); |
| 219 | while (filt_pos < filt_end) |
| 220 | { |
| 221 | if (*filt_pos) |
| 222 | { |
| 223 | res->chars.resize(res_chars_size + n); |
| 224 | memcpySmallAllowReadWriteOverflow15(&res->chars[res_chars_size], data_pos, n); |
| 225 | res_chars_size += n; |
| 226 | } |
| 227 | |
| 228 | ++filt_pos; |
| 229 | data_pos += n; |
| 230 | } |
| 231 | |
| 232 | return res; |
| 233 | } |
| 234 | |
| 235 | ColumnPtr ColumnFixedString::permute(const Permutation & perm, size_t limit) const |
| 236 | { |
| 237 | size_t col_size = size(); |
| 238 | |
| 239 | if (limit == 0) |
| 240 | limit = col_size; |
| 241 | else |
| 242 | limit = std::min(col_size, limit); |
| 243 | |
| 244 | if (perm.size() < limit) |
| 245 | throw Exception("Size of permutation is less than required." , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); |
| 246 | |
| 247 | if (limit == 0) |
| 248 | return ColumnFixedString::create(n); |
| 249 | |
| 250 | auto res = ColumnFixedString::create(n); |
| 251 | |
| 252 | Chars & res_chars = res->chars; |
| 253 | |
| 254 | res_chars.resize(n * limit); |
| 255 | |
| 256 | size_t offset = 0; |
| 257 | for (size_t i = 0; i < limit; ++i, offset += n) |
| 258 | memcpySmallAllowReadWriteOverflow15(&res_chars[offset], &chars[perm[i] * n], n); |
| 259 | |
| 260 | return res; |
| 261 | } |
| 262 | |
| 263 | |
| 264 | ColumnPtr ColumnFixedString::index(const IColumn & indexes, size_t limit) const |
| 265 | { |
| 266 | return selectIndexImpl(*this, indexes, limit); |
| 267 | } |
| 268 | |
| 269 | |
| 270 | template <typename Type> |
| 271 | ColumnPtr ColumnFixedString::indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const |
| 272 | { |
| 273 | if (limit == 0) |
| 274 | return ColumnFixedString::create(n); |
| 275 | |
| 276 | auto res = ColumnFixedString::create(n); |
| 277 | |
| 278 | Chars & res_chars = res->chars; |
| 279 | |
| 280 | res_chars.resize(n * limit); |
| 281 | |
| 282 | size_t offset = 0; |
| 283 | for (size_t i = 0; i < limit; ++i, offset += n) |
| 284 | memcpySmallAllowReadWriteOverflow15(&res_chars[offset], &chars[indexes[i] * n], n); |
| 285 | |
| 286 | return res; |
| 287 | } |
| 288 | |
| 289 | ColumnPtr ColumnFixedString::replicate(const Offsets & offsets) const |
| 290 | { |
| 291 | size_t col_size = size(); |
| 292 | if (col_size != offsets.size()) |
| 293 | throw Exception("Size of offsets doesn't match size of column." , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); |
| 294 | |
| 295 | auto res = ColumnFixedString::create(n); |
| 296 | |
| 297 | if (0 == col_size) |
| 298 | return res; |
| 299 | |
| 300 | Chars & res_chars = res->chars; |
| 301 | res_chars.resize(n * offsets.back()); |
| 302 | |
| 303 | Offset curr_offset = 0; |
| 304 | for (size_t i = 0; i < col_size; ++i) |
| 305 | for (size_t next_offset = offsets[i]; curr_offset < next_offset; ++curr_offset) |
| 306 | memcpySmallAllowReadWriteOverflow15(&res->chars[curr_offset * n], &chars[i * n], n); |
| 307 | |
| 308 | return res; |
| 309 | } |
| 310 | |
| 311 | void ColumnFixedString::gather(ColumnGathererStream & gatherer) |
| 312 | { |
| 313 | gatherer.gather(*this); |
| 314 | } |
| 315 | |
| 316 | void ColumnFixedString::getExtremes(Field & min, Field & max) const |
| 317 | { |
| 318 | min = String(); |
| 319 | max = String(); |
| 320 | |
| 321 | size_t col_size = size(); |
| 322 | |
| 323 | if (col_size == 0) |
| 324 | return; |
| 325 | |
| 326 | size_t min_idx = 0; |
| 327 | size_t max_idx = 0; |
| 328 | |
| 329 | less<true> less_op(*this); |
| 330 | |
| 331 | for (size_t i = 1; i < col_size; ++i) |
| 332 | { |
| 333 | if (less_op(i, min_idx)) |
| 334 | min_idx = i; |
| 335 | else if (less_op(max_idx, i)) |
| 336 | max_idx = i; |
| 337 | } |
| 338 | |
| 339 | get(min_idx, min); |
| 340 | get(max_idx, max); |
| 341 | } |
| 342 | |
| 343 | } |
| 344 | |