| 1 | #pragma once |
| 2 | |
| 3 | #include <cstring> |
| 4 | #include <cassert> |
| 5 | |
| 6 | #include <Columns/IColumn.h> |
| 7 | #include <Columns/IColumnImpl.h> |
| 8 | #include <Common/PODArray.h> |
| 9 | #include <Common/SipHash.h> |
| 10 | #include <Common/memcpySmall.h> |
| 11 | #include <Common/memcmpSmall.h> |
| 12 | #include <Common/assert_cast.h> |
| 13 | #include <Core/Field.h> |
| 14 | |
| 15 | |
| 16 | class Collator; |
| 17 | |
| 18 | |
| 19 | namespace DB |
| 20 | { |
| 21 | |
| 22 | /** Column for String values. |
| 23 | */ |
| 24 | class ColumnString final : public COWHelper<IColumn, ColumnString> |
| 25 | { |
| 26 | public: |
| 27 | using Char = UInt8; |
| 28 | using Chars = PaddedPODArray<UInt8>; |
| 29 | |
| 30 | private: |
| 31 | friend class COWHelper<IColumn, ColumnString>; |
| 32 | |
| 33 | /// Maps i'th position to offset to i+1'th element. Last offset maps to the end of all chars (is the size of all chars). |
| 34 | Offsets offsets; |
| 35 | |
| 36 | /// Bytes of strings, placed contiguously. |
| 37 | /// For convenience, every string ends with terminating zero byte. Note that strings could contain zero bytes in the middle. |
| 38 | Chars chars; |
| 39 | |
| 40 | size_t ALWAYS_INLINE offsetAt(ssize_t i) const { return offsets[i - 1]; } |
| 41 | |
| 42 | /// Size of i-th element, including terminating zero. |
| 43 | size_t ALWAYS_INLINE sizeAt(ssize_t i) const { return offsets[i] - offsets[i - 1]; } |
| 44 | |
| 45 | template <bool positive> |
| 46 | struct less; |
| 47 | |
| 48 | template <bool positive> |
| 49 | struct lessWithCollation; |
| 50 | |
| 51 | ColumnString() = default; |
| 52 | |
| 53 | ColumnString(const ColumnString & src) |
| 54 | : offsets(src.offsets.begin(), src.offsets.end()), |
| 55 | chars(src.chars.begin(), src.chars.end()) {} |
| 56 | |
| 57 | public: |
| 58 | const char * getFamilyName() const override { return "String" ; } |
| 59 | |
| 60 | size_t size() const override |
| 61 | { |
| 62 | return offsets.size(); |
| 63 | } |
| 64 | |
| 65 | size_t byteSize() const override |
| 66 | { |
| 67 | return chars.size() + offsets.size() * sizeof(offsets[0]); |
| 68 | } |
| 69 | |
| 70 | size_t allocatedBytes() const override |
| 71 | { |
| 72 | return chars.allocated_bytes() + offsets.allocated_bytes(); |
| 73 | } |
| 74 | |
| 75 | void protect() override; |
| 76 | |
| 77 | MutableColumnPtr cloneResized(size_t to_size) const override; |
| 78 | |
| 79 | Field operator[](size_t n) const override |
| 80 | { |
| 81 | assert(n < size()); |
| 82 | return Field(&chars[offsetAt(n)], sizeAt(n) - 1); |
| 83 | } |
| 84 | |
| 85 | void get(size_t n, Field & res) const override |
| 86 | { |
| 87 | assert(n < size()); |
| 88 | res.assignString(&chars[offsetAt(n)], sizeAt(n) - 1); |
| 89 | } |
| 90 | |
| 91 | StringRef getDataAt(size_t n) const override |
| 92 | { |
| 93 | assert(n < size()); |
| 94 | return StringRef(&chars[offsetAt(n)], sizeAt(n) - 1); |
| 95 | } |
| 96 | |
| 97 | StringRef getDataAtWithTerminatingZero(size_t n) const override |
| 98 | { |
| 99 | assert(n < size()); |
| 100 | return StringRef(&chars[offsetAt(n)], sizeAt(n)); |
| 101 | } |
| 102 | |
| 103 | /// Suppress gcc 7.3.1 warning: '*((void*)&<anonymous> +8)' may be used uninitialized in this function |
| 104 | #if !__clang__ |
| 105 | #pragma GCC diagnostic push |
| 106 | #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" |
| 107 | #endif |
| 108 | |
| 109 | void insert(const Field & x) override |
| 110 | { |
| 111 | const String & s = DB::get<const String &>(x); |
| 112 | const size_t old_size = chars.size(); |
| 113 | const size_t size_to_append = s.size() + 1; |
| 114 | const size_t new_size = old_size + size_to_append; |
| 115 | |
| 116 | chars.resize(new_size); |
| 117 | memcpy(chars.data() + old_size, s.c_str(), size_to_append); |
| 118 | offsets.push_back(new_size); |
| 119 | } |
| 120 | |
| 121 | #if !__clang__ |
| 122 | #pragma GCC diagnostic pop |
| 123 | #endif |
| 124 | |
| 125 | void insertFrom(const IColumn & src_, size_t n) override |
| 126 | { |
| 127 | const ColumnString & src = assert_cast<const ColumnString &>(src_); |
| 128 | const size_t size_to_append = src.offsets[n] - src.offsets[n - 1]; /// -1th index is Ok, see PaddedPODArray. |
| 129 | |
| 130 | if (size_to_append == 1) |
| 131 | { |
| 132 | /// shortcut for empty string |
| 133 | chars.push_back(0); |
| 134 | offsets.push_back(chars.size()); |
| 135 | } |
| 136 | else |
| 137 | { |
| 138 | const size_t old_size = chars.size(); |
| 139 | const size_t offset = src.offsets[n - 1]; |
| 140 | const size_t new_size = old_size + size_to_append; |
| 141 | |
| 142 | chars.resize(new_size); |
| 143 | memcpySmallAllowReadWriteOverflow15(chars.data() + old_size, &src.chars[offset], size_to_append); |
| 144 | offsets.push_back(new_size); |
| 145 | } |
| 146 | } |
| 147 | |
| 148 | void insertData(const char * pos, size_t length) override |
| 149 | { |
| 150 | const size_t old_size = chars.size(); |
| 151 | const size_t new_size = old_size + length + 1; |
| 152 | |
| 153 | chars.resize(new_size); |
| 154 | if (length) |
| 155 | memcpy(chars.data() + old_size, pos, length); |
| 156 | chars[old_size + length] = 0; |
| 157 | offsets.push_back(new_size); |
| 158 | } |
| 159 | |
| 160 | /// Like getData, but inserting data should be zero-ending (i.e. length is 1 byte greater than real string size). |
| 161 | void insertDataWithTerminatingZero(const char * pos, size_t length) |
| 162 | { |
| 163 | const size_t old_size = chars.size(); |
| 164 | const size_t new_size = old_size + length; |
| 165 | |
| 166 | chars.resize(new_size); |
| 167 | memcpy(chars.data() + old_size, pos, length); |
| 168 | offsets.push_back(new_size); |
| 169 | } |
| 170 | |
| 171 | void popBack(size_t n) override |
| 172 | { |
| 173 | size_t nested_n = offsets.back() - offsetAt(offsets.size() - n); |
| 174 | chars.resize(chars.size() - nested_n); |
| 175 | offsets.resize_assume_reserved(offsets.size() - n); |
| 176 | } |
| 177 | |
| 178 | StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override; |
| 179 | |
| 180 | const char * deserializeAndInsertFromArena(const char * pos) override; |
| 181 | |
| 182 | void updateHashWithValue(size_t n, SipHash & hash) const override |
| 183 | { |
| 184 | size_t string_size = sizeAt(n); |
| 185 | size_t offset = offsetAt(n); |
| 186 | |
| 187 | hash.update(reinterpret_cast<const char *>(&string_size), sizeof(string_size)); |
| 188 | hash.update(reinterpret_cast<const char *>(&chars[offset]), string_size); |
| 189 | } |
| 190 | |
| 191 | void insertRangeFrom(const IColumn & src, size_t start, size_t length) override; |
| 192 | |
| 193 | ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override; |
| 194 | |
| 195 | ColumnPtr permute(const Permutation & perm, size_t limit) const override; |
| 196 | |
| 197 | ColumnPtr index(const IColumn & indexes, size_t limit) const override; |
| 198 | |
| 199 | template <typename Type> |
| 200 | ColumnPtr indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const; |
| 201 | |
| 202 | void insertDefault() override |
| 203 | { |
| 204 | chars.push_back(0); |
| 205 | offsets.push_back(offsets.back() + 1); |
| 206 | } |
| 207 | |
| 208 | virtual void insertManyDefaults(size_t length) override |
| 209 | { |
| 210 | chars.resize_fill(chars.size() + length); |
| 211 | for (size_t i = 0; i < length; ++i) |
| 212 | offsets.push_back(offsets.back() + 1); |
| 213 | } |
| 214 | |
| 215 | int compareAt(size_t n, size_t m, const IColumn & rhs_, int /*nan_direction_hint*/) const override |
| 216 | { |
| 217 | const ColumnString & rhs = assert_cast<const ColumnString &>(rhs_); |
| 218 | return memcmpSmallAllowOverflow15(chars.data() + offsetAt(n), sizeAt(n) - 1, rhs.chars.data() + rhs.offsetAt(m), rhs.sizeAt(m) - 1); |
| 219 | } |
| 220 | |
| 221 | /// Variant of compareAt for string comparison with respect of collation. |
| 222 | int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs_, const Collator & collator) const; |
| 223 | |
| 224 | void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override; |
| 225 | |
| 226 | /// Sorting with respect of collation. |
| 227 | void getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, Permutation & res) const; |
| 228 | |
| 229 | ColumnPtr replicate(const Offsets & replicate_offsets) const override; |
| 230 | |
| 231 | MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override |
| 232 | { |
| 233 | return scatterImpl<ColumnString>(num_columns, selector); |
| 234 | } |
| 235 | |
| 236 | void gather(ColumnGathererStream & gatherer_stream) override; |
| 237 | |
| 238 | void reserve(size_t n) override; |
| 239 | |
| 240 | void getExtremes(Field & min, Field & max) const override; |
| 241 | |
| 242 | |
| 243 | bool canBeInsideNullable() const override { return true; } |
| 244 | |
| 245 | bool structureEquals(const IColumn & rhs) const override |
| 246 | { |
| 247 | return typeid(rhs) == typeid(ColumnString); |
| 248 | } |
| 249 | |
| 250 | |
| 251 | Chars & getChars() { return chars; } |
| 252 | const Chars & getChars() const { return chars; } |
| 253 | |
| 254 | Offsets & getOffsets() { return offsets; } |
| 255 | const Offsets & getOffsets() const { return offsets; } |
| 256 | }; |
| 257 | |
| 258 | |
| 259 | } |
| 260 | |