1#include <Functions/IFunctionImpl.h>
2#include <Functions/FunctionFactory.h>
3#include <Functions/FunctionHelpers.h>
4#include <DataTypes/DataTypeArray.h>
5#include <DataTypes/DataTypeNullable.h>
6#include <Columns/ColumnArray.h>
7#include <Columns/ColumnNullable.h>
8#include <Columns/ColumnString.h>
9#include <Common/HashTable/ClearableHashSet.h>
10#include <Common/SipHash.h>
11#include <Common/assert_cast.h>
12
13
14namespace DB
15{
16
17namespace ErrorCodes
18{
19 extern const int ILLEGAL_TYPE_OF_ARGUMENT;
20}
21
22
23/// Find different elements in an array.
24class FunctionArrayDistinct : public IFunction
25{
26public:
27 static constexpr auto name = "arrayDistinct";
28
29 static FunctionPtr create(const Context &)
30 {
31 return std::make_shared<FunctionArrayDistinct>();
32 }
33
34 String getName() const override
35 {
36 return name;
37 }
38
39 bool isVariadic() const override { return false; }
40
41 size_t getNumberOfArguments() const override { return 1; }
42
43 bool useDefaultImplementationForConstants() const override { return true; }
44
45 DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
46 {
47 const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(arguments[0].get());
48 if (!array_type)
49 throw Exception("Argument for function " + getName() + " must be array but it "
50 " has type " + arguments[0]->getName() + ".",
51 ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
52
53 auto nested_type = removeNullable(array_type->getNestedType());
54
55 return std::make_shared<DataTypeArray>(nested_type);
56 }
57
58 void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override;
59
60private:
61 /// Initially allocate a piece of memory for 512 elements. NOTE: This is just a guess.
62 static constexpr size_t INITIAL_SIZE_DEGREE = 9;
63
64 template <typename T>
65 bool executeNumber(
66 const IColumn & src_data,
67 const ColumnArray::Offsets & src_offsets,
68 IColumn & res_data_col,
69 ColumnArray::Offsets & res_offsets,
70 const ColumnNullable * nullable_col);
71
72 bool executeString(
73 const IColumn & src_data,
74 const ColumnArray::Offsets & src_offsets,
75 IColumn & res_data_col,
76 ColumnArray::Offsets & res_offsets,
77 const ColumnNullable * nullable_col);
78
79 void executeHashed(
80 const IColumn & src_data,
81 const ColumnArray::Offsets & src_offsets,
82 IColumn & res_data_col,
83 ColumnArray::Offsets & res_offsets,
84 const ColumnNullable * nullable_col);
85};
86
87
88void FunctionArrayDistinct::executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/)
89{
90 ColumnPtr array_ptr = block.getByPosition(arguments[0]).column;
91 const ColumnArray * array = checkAndGetColumn<ColumnArray>(array_ptr.get());
92
93 const auto & return_type = block.getByPosition(result).type;
94
95 auto res_ptr = return_type->createColumn();
96 ColumnArray & res = assert_cast<ColumnArray &>(*res_ptr);
97
98 const IColumn & src_data = array->getData();
99 const ColumnArray::Offsets & offsets = array->getOffsets();
100
101 IColumn & res_data = res.getData();
102 ColumnArray::Offsets & res_offsets = res.getOffsets();
103
104 const ColumnNullable * nullable_col = checkAndGetColumn<ColumnNullable>(src_data);
105
106 const IColumn * inner_col;
107
108 if (nullable_col)
109 {
110 inner_col = &nullable_col->getNestedColumn();
111 }
112 else
113 {
114 inner_col = &src_data;
115 }
116
117 if (!(executeNumber<UInt8>(*inner_col, offsets, res_data, res_offsets, nullable_col)
118 || executeNumber<UInt16>(*inner_col, offsets, res_data, res_offsets, nullable_col)
119 || executeNumber<UInt32>(*inner_col, offsets, res_data, res_offsets, nullable_col)
120 || executeNumber<UInt64>(*inner_col, offsets, res_data, res_offsets, nullable_col)
121 || executeNumber<Int8>(*inner_col, offsets, res_data, res_offsets, nullable_col)
122 || executeNumber<Int16>(*inner_col, offsets, res_data, res_offsets, nullable_col)
123 || executeNumber<Int32>(*inner_col, offsets, res_data, res_offsets, nullable_col)
124 || executeNumber<Int64>(*inner_col, offsets, res_data, res_offsets, nullable_col)
125 || executeNumber<Float32>(*inner_col, offsets, res_data, res_offsets, nullable_col)
126 || executeNumber<Float64>(*inner_col, offsets, res_data, res_offsets, nullable_col)
127 || executeString(*inner_col, offsets, res_data, res_offsets, nullable_col)))
128 executeHashed(*inner_col, offsets, res_data, res_offsets, nullable_col);
129
130 block.getByPosition(result).column = std::move(res_ptr);
131}
132
133template <typename T>
134bool FunctionArrayDistinct::executeNumber(
135 const IColumn & src_data,
136 const ColumnArray::Offsets & src_offsets,
137 IColumn & res_data_col,
138 ColumnArray::Offsets & res_offsets,
139 const ColumnNullable * nullable_col)
140{
141 const ColumnVector<T> * src_data_concrete = checkAndGetColumn<ColumnVector<T>>(&src_data);
142
143 if (!src_data_concrete)
144 {
145 return false;
146 }
147
148 const PaddedPODArray<T> & values = src_data_concrete->getData();
149 PaddedPODArray<T> & res_data = typeid_cast<ColumnVector<T> &>(res_data_col).getData();
150
151 const PaddedPODArray<UInt8> * src_null_map = nullptr;
152
153 if (nullable_col)
154 src_null_map = &nullable_col->getNullMapData();
155
156 using Set = ClearableHashSet<T,
157 DefaultHash<T>,
158 HashTableGrower<INITIAL_SIZE_DEGREE>,
159 HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(T)>>;
160
161 Set set;
162
163 ColumnArray::Offset prev_src_offset = 0;
164 ColumnArray::Offset res_offset = 0;
165
166 for (ColumnArray::Offset i = 0; i < src_offsets.size(); ++i)
167 {
168 set.clear();
169
170 ColumnArray::Offset curr_src_offset = src_offsets[i];
171 for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j)
172 {
173 if (nullable_col && (*src_null_map)[j])
174 continue;
175
176 if (!set.find(values[j]))
177 {
178 res_data.emplace_back(values[j]);
179 set.insert(values[j]);
180 }
181 }
182
183 res_offset += set.size();
184 res_offsets.emplace_back(res_offset);
185
186 prev_src_offset = curr_src_offset;
187 }
188 return true;
189}
190
191bool FunctionArrayDistinct::executeString(
192 const IColumn & src_data,
193 const ColumnArray::Offsets & src_offsets,
194 IColumn & res_data_col,
195 ColumnArray::Offsets & res_offsets,
196 const ColumnNullable * nullable_col)
197{
198 const ColumnString * src_data_concrete = checkAndGetColumn<ColumnString>(&src_data);
199
200 if (!src_data_concrete)
201 return false;
202
203 ColumnString & res_data_column_string = typeid_cast<ColumnString &>(res_data_col);
204
205 using Set = ClearableHashSet<StringRef,
206 StringRefHash,
207 HashTableGrower<INITIAL_SIZE_DEGREE>,
208 HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(StringRef)>>;
209
210 const PaddedPODArray<UInt8> * src_null_map = nullptr;
211
212 if (nullable_col)
213 src_null_map = &nullable_col->getNullMapData();
214
215 Set set;
216
217 ColumnArray::Offset prev_src_offset = 0;
218 ColumnArray::Offset res_offset = 0;
219
220 for (ColumnArray::Offset i = 0; i < src_offsets.size(); ++i)
221 {
222 set.clear();
223
224 ColumnArray::Offset curr_src_offset = src_offsets[i];
225 for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j)
226 {
227 if (nullable_col && (*src_null_map)[j])
228 continue;
229
230 StringRef str_ref = src_data_concrete->getDataAt(j);
231
232 if (!set.find(str_ref))
233 {
234 set.insert(str_ref);
235 res_data_column_string.insertData(str_ref.data, str_ref.size);
236 }
237 }
238
239 res_offset += set.size();
240 res_offsets.emplace_back(res_offset);
241
242 prev_src_offset = curr_src_offset;
243 }
244 return true;
245}
246
247void FunctionArrayDistinct::executeHashed(
248 const IColumn & src_data,
249 const ColumnArray::Offsets & src_offsets,
250 IColumn & res_data_col,
251 ColumnArray::Offsets & res_offsets,
252 const ColumnNullable * nullable_col)
253{
254 using Set = ClearableHashSet<UInt128, UInt128TrivialHash, HashTableGrower<INITIAL_SIZE_DEGREE>,
255 HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(UInt128)>>;
256
257 const PaddedPODArray<UInt8> * src_null_map = nullptr;
258
259 if (nullable_col)
260 src_null_map = &nullable_col->getNullMapData();
261
262 Set set;
263
264 ColumnArray::Offset prev_src_offset = 0;
265 ColumnArray::Offset res_offset = 0;
266
267 for (ColumnArray::Offset i = 0; i < src_offsets.size(); ++i)
268 {
269 set.clear();
270
271 ColumnArray::Offset curr_src_offset = src_offsets[i];
272 for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j)
273 {
274 if (nullable_col && (*src_null_map)[j])
275 continue;
276
277 UInt128 hash;
278 SipHash hash_function;
279 src_data.updateHashWithValue(j, hash_function);
280 hash_function.get128(reinterpret_cast<char *>(&hash));
281
282 if (!set.find(hash))
283 {
284 set.insert(hash);
285 res_data_col.insertFrom(src_data, j);
286 }
287 }
288
289 res_offset += set.size();
290 res_offsets.emplace_back(res_offset);
291
292 prev_src_offset = curr_src_offset;
293 }
294}
295
296
297void registerFunctionArrayDistinct(FunctionFactory & factory)
298{
299 factory.registerFunction<FunctionArrayDistinct>();
300}
301
302}
303