1#pragma once
2
3#include <cstring>
4#include <cassert>
5
6#include <Columns/IColumn.h>
7#include <Columns/IColumnImpl.h>
8#include <Common/PODArray.h>
9#include <Common/SipHash.h>
10#include <Common/memcpySmall.h>
11#include <Common/memcmpSmall.h>
12#include <Common/assert_cast.h>
13#include <Core/Field.h>
14
15
16class Collator;
17
18
19namespace DB
20{
21
22/** Column for String values.
23 */
24class ColumnString final : public COWHelper<IColumn, ColumnString>
25{
26public:
27 using Char = UInt8;
28 using Chars = PaddedPODArray<UInt8>;
29
30private:
31 friend class COWHelper<IColumn, ColumnString>;
32
33 /// Maps i'th position to offset to i+1'th element. Last offset maps to the end of all chars (is the size of all chars).
34 Offsets offsets;
35
36 /// Bytes of strings, placed contiguously.
37 /// For convenience, every string ends with terminating zero byte. Note that strings could contain zero bytes in the middle.
38 Chars chars;
39
40 size_t ALWAYS_INLINE offsetAt(ssize_t i) const { return offsets[i - 1]; }
41
42 /// Size of i-th element, including terminating zero.
43 size_t ALWAYS_INLINE sizeAt(ssize_t i) const { return offsets[i] - offsets[i - 1]; }
44
45 template <bool positive>
46 struct less;
47
48 template <bool positive>
49 struct lessWithCollation;
50
51 ColumnString() = default;
52
53 ColumnString(const ColumnString & src)
54 : offsets(src.offsets.begin(), src.offsets.end()),
55 chars(src.chars.begin(), src.chars.end()) {}
56
57public:
58 const char * getFamilyName() const override { return "String"; }
59
60 size_t size() const override
61 {
62 return offsets.size();
63 }
64
65 size_t byteSize() const override
66 {
67 return chars.size() + offsets.size() * sizeof(offsets[0]);
68 }
69
70 size_t allocatedBytes() const override
71 {
72 return chars.allocated_bytes() + offsets.allocated_bytes();
73 }
74
75 void protect() override;
76
77 MutableColumnPtr cloneResized(size_t to_size) const override;
78
79 Field operator[](size_t n) const override
80 {
81 assert(n < size());
82 return Field(&chars[offsetAt(n)], sizeAt(n) - 1);
83 }
84
85 void get(size_t n, Field & res) const override
86 {
87 assert(n < size());
88 res.assignString(&chars[offsetAt(n)], sizeAt(n) - 1);
89 }
90
91 StringRef getDataAt(size_t n) const override
92 {
93 assert(n < size());
94 return StringRef(&chars[offsetAt(n)], sizeAt(n) - 1);
95 }
96
97 StringRef getDataAtWithTerminatingZero(size_t n) const override
98 {
99 assert(n < size());
100 return StringRef(&chars[offsetAt(n)], sizeAt(n));
101 }
102
103/// Suppress gcc 7.3.1 warning: '*((void*)&<anonymous> +8)' may be used uninitialized in this function
104#if !__clang__
105#pragma GCC diagnostic push
106#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
107#endif
108
109 void insert(const Field & x) override
110 {
111 const String & s = DB::get<const String &>(x);
112 const size_t old_size = chars.size();
113 const size_t size_to_append = s.size() + 1;
114 const size_t new_size = old_size + size_to_append;
115
116 chars.resize(new_size);
117 memcpy(chars.data() + old_size, s.c_str(), size_to_append);
118 offsets.push_back(new_size);
119 }
120
121#if !__clang__
122#pragma GCC diagnostic pop
123#endif
124
125 void insertFrom(const IColumn & src_, size_t n) override
126 {
127 const ColumnString & src = assert_cast<const ColumnString &>(src_);
128 const size_t size_to_append = src.offsets[n] - src.offsets[n - 1]; /// -1th index is Ok, see PaddedPODArray.
129
130 if (size_to_append == 1)
131 {
132 /// shortcut for empty string
133 chars.push_back(0);
134 offsets.push_back(chars.size());
135 }
136 else
137 {
138 const size_t old_size = chars.size();
139 const size_t offset = src.offsets[n - 1];
140 const size_t new_size = old_size + size_to_append;
141
142 chars.resize(new_size);
143 memcpySmallAllowReadWriteOverflow15(chars.data() + old_size, &src.chars[offset], size_to_append);
144 offsets.push_back(new_size);
145 }
146 }
147
148 void insertData(const char * pos, size_t length) override
149 {
150 const size_t old_size = chars.size();
151 const size_t new_size = old_size + length + 1;
152
153 chars.resize(new_size);
154 if (length)
155 memcpy(chars.data() + old_size, pos, length);
156 chars[old_size + length] = 0;
157 offsets.push_back(new_size);
158 }
159
160 /// Like getData, but inserting data should be zero-ending (i.e. length is 1 byte greater than real string size).
161 void insertDataWithTerminatingZero(const char * pos, size_t length)
162 {
163 const size_t old_size = chars.size();
164 const size_t new_size = old_size + length;
165
166 chars.resize(new_size);
167 memcpy(chars.data() + old_size, pos, length);
168 offsets.push_back(new_size);
169 }
170
171 void popBack(size_t n) override
172 {
173 size_t nested_n = offsets.back() - offsetAt(offsets.size() - n);
174 chars.resize(chars.size() - nested_n);
175 offsets.resize_assume_reserved(offsets.size() - n);
176 }
177
178 StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
179
180 const char * deserializeAndInsertFromArena(const char * pos) override;
181
182 void updateHashWithValue(size_t n, SipHash & hash) const override
183 {
184 size_t string_size = sizeAt(n);
185 size_t offset = offsetAt(n);
186
187 hash.update(reinterpret_cast<const char *>(&string_size), sizeof(string_size));
188 hash.update(reinterpret_cast<const char *>(&chars[offset]), string_size);
189 }
190
191 void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
192
193 ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
194
195 ColumnPtr permute(const Permutation & perm, size_t limit) const override;
196
197 ColumnPtr index(const IColumn & indexes, size_t limit) const override;
198
199 template <typename Type>
200 ColumnPtr indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const;
201
202 void insertDefault() override
203 {
204 chars.push_back(0);
205 offsets.push_back(offsets.back() + 1);
206 }
207
208 virtual void insertManyDefaults(size_t length) override
209 {
210 chars.resize_fill(chars.size() + length);
211 for (size_t i = 0; i < length; ++i)
212 offsets.push_back(offsets.back() + 1);
213 }
214
215 int compareAt(size_t n, size_t m, const IColumn & rhs_, int /*nan_direction_hint*/) const override
216 {
217 const ColumnString & rhs = assert_cast<const ColumnString &>(rhs_);
218 return memcmpSmallAllowOverflow15(chars.data() + offsetAt(n), sizeAt(n) - 1, rhs.chars.data() + rhs.offsetAt(m), rhs.sizeAt(m) - 1);
219 }
220
221 /// Variant of compareAt for string comparison with respect of collation.
222 int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs_, const Collator & collator) const;
223
224 void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
225
226 /// Sorting with respect of collation.
227 void getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, Permutation & res) const;
228
229 ColumnPtr replicate(const Offsets & replicate_offsets) const override;
230
231 MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override
232 {
233 return scatterImpl<ColumnString>(num_columns, selector);
234 }
235
236 void gather(ColumnGathererStream & gatherer_stream) override;
237
238 void reserve(size_t n) override;
239
240 void getExtremes(Field & min, Field & max) const override;
241
242
243 bool canBeInsideNullable() const override { return true; }
244
245 bool structureEquals(const IColumn & rhs) const override
246 {
247 return typeid(rhs) == typeid(ColumnString);
248 }
249
250
251 Chars & getChars() { return chars; }
252 const Chars & getChars() const { return chars; }
253
254 Offsets & getOffsets() { return offsets; }
255 const Offsets & getOffsets() const { return offsets; }
256};
257
258
259}
260