1 | #pragma once |
2 | |
3 | #include <cstring> |
4 | #include <cassert> |
5 | |
6 | #include <Columns/IColumn.h> |
7 | #include <Columns/IColumnImpl.h> |
8 | #include <Common/PODArray.h> |
9 | #include <Common/SipHash.h> |
10 | #include <Common/memcpySmall.h> |
11 | #include <Common/memcmpSmall.h> |
12 | #include <Common/assert_cast.h> |
13 | #include <Core/Field.h> |
14 | |
15 | |
16 | class Collator; |
17 | |
18 | |
19 | namespace DB |
20 | { |
21 | |
22 | /** Column for String values. |
23 | */ |
24 | class ColumnString final : public COWHelper<IColumn, ColumnString> |
25 | { |
26 | public: |
27 | using Char = UInt8; |
28 | using Chars = PaddedPODArray<UInt8>; |
29 | |
30 | private: |
31 | friend class COWHelper<IColumn, ColumnString>; |
32 | |
33 | /// Maps i'th position to offset to i+1'th element. Last offset maps to the end of all chars (is the size of all chars). |
34 | Offsets offsets; |
35 | |
36 | /// Bytes of strings, placed contiguously. |
37 | /// For convenience, every string ends with terminating zero byte. Note that strings could contain zero bytes in the middle. |
38 | Chars chars; |
39 | |
40 | size_t ALWAYS_INLINE offsetAt(ssize_t i) const { return offsets[i - 1]; } |
41 | |
42 | /// Size of i-th element, including terminating zero. |
43 | size_t ALWAYS_INLINE sizeAt(ssize_t i) const { return offsets[i] - offsets[i - 1]; } |
44 | |
45 | template <bool positive> |
46 | struct less; |
47 | |
48 | template <bool positive> |
49 | struct lessWithCollation; |
50 | |
51 | ColumnString() = default; |
52 | |
53 | ColumnString(const ColumnString & src) |
54 | : offsets(src.offsets.begin(), src.offsets.end()), |
55 | chars(src.chars.begin(), src.chars.end()) {} |
56 | |
57 | public: |
58 | const char * getFamilyName() const override { return "String" ; } |
59 | |
60 | size_t size() const override |
61 | { |
62 | return offsets.size(); |
63 | } |
64 | |
65 | size_t byteSize() const override |
66 | { |
67 | return chars.size() + offsets.size() * sizeof(offsets[0]); |
68 | } |
69 | |
70 | size_t allocatedBytes() const override |
71 | { |
72 | return chars.allocated_bytes() + offsets.allocated_bytes(); |
73 | } |
74 | |
75 | void protect() override; |
76 | |
77 | MutableColumnPtr cloneResized(size_t to_size) const override; |
78 | |
79 | Field operator[](size_t n) const override |
80 | { |
81 | assert(n < size()); |
82 | return Field(&chars[offsetAt(n)], sizeAt(n) - 1); |
83 | } |
84 | |
85 | void get(size_t n, Field & res) const override |
86 | { |
87 | assert(n < size()); |
88 | res.assignString(&chars[offsetAt(n)], sizeAt(n) - 1); |
89 | } |
90 | |
91 | StringRef getDataAt(size_t n) const override |
92 | { |
93 | assert(n < size()); |
94 | return StringRef(&chars[offsetAt(n)], sizeAt(n) - 1); |
95 | } |
96 | |
97 | StringRef getDataAtWithTerminatingZero(size_t n) const override |
98 | { |
99 | assert(n < size()); |
100 | return StringRef(&chars[offsetAt(n)], sizeAt(n)); |
101 | } |
102 | |
103 | /// Suppress gcc 7.3.1 warning: '*((void*)&<anonymous> +8)' may be used uninitialized in this function |
104 | #if !__clang__ |
105 | #pragma GCC diagnostic push |
106 | #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" |
107 | #endif |
108 | |
109 | void insert(const Field & x) override |
110 | { |
111 | const String & s = DB::get<const String &>(x); |
112 | const size_t old_size = chars.size(); |
113 | const size_t size_to_append = s.size() + 1; |
114 | const size_t new_size = old_size + size_to_append; |
115 | |
116 | chars.resize(new_size); |
117 | memcpy(chars.data() + old_size, s.c_str(), size_to_append); |
118 | offsets.push_back(new_size); |
119 | } |
120 | |
121 | #if !__clang__ |
122 | #pragma GCC diagnostic pop |
123 | #endif |
124 | |
125 | void insertFrom(const IColumn & src_, size_t n) override |
126 | { |
127 | const ColumnString & src = assert_cast<const ColumnString &>(src_); |
128 | const size_t size_to_append = src.offsets[n] - src.offsets[n - 1]; /// -1th index is Ok, see PaddedPODArray. |
129 | |
130 | if (size_to_append == 1) |
131 | { |
132 | /// shortcut for empty string |
133 | chars.push_back(0); |
134 | offsets.push_back(chars.size()); |
135 | } |
136 | else |
137 | { |
138 | const size_t old_size = chars.size(); |
139 | const size_t offset = src.offsets[n - 1]; |
140 | const size_t new_size = old_size + size_to_append; |
141 | |
142 | chars.resize(new_size); |
143 | memcpySmallAllowReadWriteOverflow15(chars.data() + old_size, &src.chars[offset], size_to_append); |
144 | offsets.push_back(new_size); |
145 | } |
146 | } |
147 | |
148 | void insertData(const char * pos, size_t length) override |
149 | { |
150 | const size_t old_size = chars.size(); |
151 | const size_t new_size = old_size + length + 1; |
152 | |
153 | chars.resize(new_size); |
154 | if (length) |
155 | memcpy(chars.data() + old_size, pos, length); |
156 | chars[old_size + length] = 0; |
157 | offsets.push_back(new_size); |
158 | } |
159 | |
160 | /// Like getData, but inserting data should be zero-ending (i.e. length is 1 byte greater than real string size). |
161 | void insertDataWithTerminatingZero(const char * pos, size_t length) |
162 | { |
163 | const size_t old_size = chars.size(); |
164 | const size_t new_size = old_size + length; |
165 | |
166 | chars.resize(new_size); |
167 | memcpy(chars.data() + old_size, pos, length); |
168 | offsets.push_back(new_size); |
169 | } |
170 | |
171 | void popBack(size_t n) override |
172 | { |
173 | size_t nested_n = offsets.back() - offsetAt(offsets.size() - n); |
174 | chars.resize(chars.size() - nested_n); |
175 | offsets.resize_assume_reserved(offsets.size() - n); |
176 | } |
177 | |
178 | StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override; |
179 | |
180 | const char * deserializeAndInsertFromArena(const char * pos) override; |
181 | |
182 | void updateHashWithValue(size_t n, SipHash & hash) const override |
183 | { |
184 | size_t string_size = sizeAt(n); |
185 | size_t offset = offsetAt(n); |
186 | |
187 | hash.update(reinterpret_cast<const char *>(&string_size), sizeof(string_size)); |
188 | hash.update(reinterpret_cast<const char *>(&chars[offset]), string_size); |
189 | } |
190 | |
191 | void insertRangeFrom(const IColumn & src, size_t start, size_t length) override; |
192 | |
193 | ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override; |
194 | |
195 | ColumnPtr permute(const Permutation & perm, size_t limit) const override; |
196 | |
197 | ColumnPtr index(const IColumn & indexes, size_t limit) const override; |
198 | |
199 | template <typename Type> |
200 | ColumnPtr indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const; |
201 | |
202 | void insertDefault() override |
203 | { |
204 | chars.push_back(0); |
205 | offsets.push_back(offsets.back() + 1); |
206 | } |
207 | |
208 | virtual void insertManyDefaults(size_t length) override |
209 | { |
210 | chars.resize_fill(chars.size() + length); |
211 | for (size_t i = 0; i < length; ++i) |
212 | offsets.push_back(offsets.back() + 1); |
213 | } |
214 | |
215 | int compareAt(size_t n, size_t m, const IColumn & rhs_, int /*nan_direction_hint*/) const override |
216 | { |
217 | const ColumnString & rhs = assert_cast<const ColumnString &>(rhs_); |
218 | return memcmpSmallAllowOverflow15(chars.data() + offsetAt(n), sizeAt(n) - 1, rhs.chars.data() + rhs.offsetAt(m), rhs.sizeAt(m) - 1); |
219 | } |
220 | |
221 | /// Variant of compareAt for string comparison with respect of collation. |
222 | int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs_, const Collator & collator) const; |
223 | |
224 | void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override; |
225 | |
226 | /// Sorting with respect of collation. |
227 | void getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, Permutation & res) const; |
228 | |
229 | ColumnPtr replicate(const Offsets & replicate_offsets) const override; |
230 | |
231 | MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override |
232 | { |
233 | return scatterImpl<ColumnString>(num_columns, selector); |
234 | } |
235 | |
236 | void gather(ColumnGathererStream & gatherer_stream) override; |
237 | |
238 | void reserve(size_t n) override; |
239 | |
240 | void getExtremes(Field & min, Field & max) const override; |
241 | |
242 | |
243 | bool canBeInsideNullable() const override { return true; } |
244 | |
245 | bool structureEquals(const IColumn & rhs) const override |
246 | { |
247 | return typeid(rhs) == typeid(ColumnString); |
248 | } |
249 | |
250 | |
251 | Chars & getChars() { return chars; } |
252 | const Chars & getChars() const { return chars; } |
253 | |
254 | Offsets & getOffsets() { return offsets; } |
255 | const Offsets & getOffsets() const { return offsets; } |
256 | }; |
257 | |
258 | |
259 | } |
260 | |