1#include <Functions/IFunctionImpl.h>
2#include <Functions/FunctionFactory.h>
3#include <Functions/FunctionHelpers.h>
4#include <DataTypes/DataTypeArray.h>
5#include <DataTypes/DataTypesNumber.h>
6#include <Columns/ColumnArray.h>
7#include <Columns/ColumnConst.h>
8#include <Columns/ColumnNullable.h>
9#include <Columns/ColumnString.h>
10#include <Common/HashTable/ClearableHashSet.h>
11#include <Common/ColumnsHashing.h>
12#include <Interpreters/AggregationCommon.h>
13#include <IO/WriteHelpers.h>
14
15
16namespace DB
17{
18
19namespace ErrorCodes
20{
21 extern const int SIZES_OF_ARRAYS_DOESNT_MATCH;
22 extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
23 extern const int ILLEGAL_COLUMN;
24 extern const int ILLEGAL_TYPE_OF_ARGUMENT;
25}
26
27/// Counts the number of different elements in the array, or the number of different tuples from the elements at the corresponding positions in several arrays.
28/// NOTE The implementation partially matches arrayEnumerateUniq.
29class FunctionArrayUniq : public IFunction
30{
31public:
32 static constexpr auto name = "arrayUniq";
33
34 static FunctionPtr create(const Context &) { return std::make_shared<FunctionArrayUniq>(); }
35
36 String getName() const override { return name; }
37
38 bool isVariadic() const override { return true; }
39 size_t getNumberOfArguments() const override { return 0; }
40 bool useDefaultImplementationForConstants() const override { return true; }
41
42 DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
43 {
44 if (arguments.size() == 0)
45 throw Exception("Number of arguments for function " + getName() + " doesn't match: passed "
46 + toString(arguments.size()) + ", should be at least 1.",
47 ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
48
49 for (size_t i = 0; i < arguments.size(); ++i)
50 {
51 const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(arguments[i].get());
52 if (!array_type)
53 throw Exception("All arguments for function " + getName() + " must be arrays but argument " +
54 toString(i + 1) + " has type " + arguments[i]->getName() + ".", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
55 }
56
57 return std::make_shared<DataTypeUInt32>();
58 }
59
60 void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override;
61
62private:
63 /// Initially allocate a piece of memory for 512 elements. NOTE: This is just a guess.
64 static constexpr size_t INITIAL_SIZE_DEGREE = 9;
65
66 template <typename T>
67 struct MethodOneNumber
68 {
69 using Set = ClearableHashSet<T, DefaultHash<T>, HashTableGrower<INITIAL_SIZE_DEGREE>,
70 HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(T)>>;
71 using Method = ColumnsHashing::HashMethodOneNumber<typename Set::value_type, void, T, false>;
72 };
73
74 struct MethodString
75 {
76 using Set = ClearableHashSet<StringRef, StringRefHash, HashTableGrower<INITIAL_SIZE_DEGREE>,
77 HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(StringRef)>>;
78 using Method = ColumnsHashing::HashMethodString<typename Set::value_type, void, false, false>;
79 };
80
81 struct MethodFixedString
82 {
83 using Set = ClearableHashSet<StringRef, StringRefHash, HashTableGrower<INITIAL_SIZE_DEGREE>,
84 HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(StringRef)>>;
85 using Method = ColumnsHashing::HashMethodFixedString<typename Set::value_type, void, false, false>;
86 };
87
88 struct MethodFixed
89 {
90 using Set = ClearableHashSet<UInt128, UInt128HashCRC32, HashTableGrower<INITIAL_SIZE_DEGREE>,
91 HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(UInt128)>>;
92 using Method = ColumnsHashing::HashMethodKeysFixed<typename Set::value_type, UInt128, void, false, false, false>;
93 };
94
95 struct MethodHashed
96 {
97 using Set = ClearableHashSet<UInt128, UInt128TrivialHash, HashTableGrower<INITIAL_SIZE_DEGREE>,
98 HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(UInt128)>>;
99 using Method = ColumnsHashing::HashMethodHashed<typename Set::value_type, void, false>;
100 };
101
102 template <typename Method>
103 void executeMethod(const ColumnArray::Offsets & offsets, const ColumnRawPtrs & columns, const Sizes & key_sizes,
104 const NullMap * null_map, ColumnUInt32::Container & res_values);
105
106 template <typename Method, bool has_null_map>
107 void executeMethodImpl(const ColumnArray::Offsets & offsets, const ColumnRawPtrs & columns, const Sizes & key_sizes,
108 const NullMap * null_map, ColumnUInt32::Container & res_values);
109
110 template <typename T>
111 bool executeNumber(const ColumnArray::Offsets & offsets, const IColumn & data, const NullMap * null_map, ColumnUInt32::Container & res_values);
112 bool executeString(const ColumnArray::Offsets & offsets, const IColumn & data, const NullMap * null_map, ColumnUInt32::Container & res_values);
113 bool executeFixedString(const ColumnArray::Offsets & offsets, const IColumn & data, const NullMap * null_map, ColumnUInt32::Container & res_values);
114 bool execute128bit(const ColumnArray::Offsets & offsets, const ColumnRawPtrs & columns, ColumnUInt32::Container & res_values);
115 void executeHashed(const ColumnArray::Offsets & offsets, const ColumnRawPtrs & columns, ColumnUInt32::Container & res_values);
116};
117
118
119void FunctionArrayUniq::executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/)
120{
121 const ColumnArray::Offsets * offsets = nullptr;
122 size_t num_arguments = arguments.size();
123 ColumnRawPtrs data_columns(num_arguments);
124
125 Columns array_holders;
126 for (size_t i = 0; i < num_arguments; ++i)
127 {
128 const ColumnPtr & array_ptr = block.getByPosition(arguments[i]).column;
129 const ColumnArray * array = checkAndGetColumn<ColumnArray>(array_ptr.get());
130 if (!array)
131 {
132 const ColumnConst * const_array = checkAndGetColumnConst<ColumnArray>(
133 block.getByPosition(arguments[i]).column.get());
134 if (!const_array)
135 throw Exception("Illegal column " + block.getByPosition(arguments[i]).column->getName()
136 + " of " + toString(i + 1) + "-th argument of function " + getName(),
137 ErrorCodes::ILLEGAL_COLUMN);
138 array_holders.emplace_back(const_array->convertToFullColumn());
139 array = checkAndGetColumn<ColumnArray>(array_holders.back().get());
140 }
141
142 const ColumnArray::Offsets & offsets_i = array->getOffsets();
143 if (i == 0)
144 offsets = &offsets_i;
145 else if (offsets_i != *offsets)
146 throw Exception("Lengths of all arrays passed to " + getName() + " must be equal.",
147 ErrorCodes::SIZES_OF_ARRAYS_DOESNT_MATCH);
148
149 auto * array_data = &array->getData();
150 data_columns[i] = array_data;
151 }
152
153 const NullMap * null_map = nullptr;
154
155 for (size_t i = 0; i < num_arguments; ++i)
156 {
157 if (auto * nullable_col = checkAndGetColumn<ColumnNullable>(*data_columns[i]))
158 {
159 if (num_arguments == 1)
160 data_columns[i] = &nullable_col->getNestedColumn();
161
162 null_map = &nullable_col->getNullMapData();
163 break;
164 }
165 }
166
167 auto res = ColumnUInt32::create();
168 ColumnUInt32::Container & res_values = res->getData();
169 res_values.resize(offsets->size());
170
171 if (num_arguments == 1)
172 {
173 if (!(executeNumber<UInt8>(*offsets, *data_columns[0], null_map, res_values)
174 || executeNumber<UInt16>(*offsets, *data_columns[0], null_map, res_values)
175 || executeNumber<UInt32>(*offsets, *data_columns[0], null_map, res_values)
176 || executeNumber<UInt64>(*offsets, *data_columns[0], null_map, res_values)
177 || executeNumber<Int8>(*offsets, *data_columns[0], null_map, res_values)
178 || executeNumber<Int16>(*offsets, *data_columns[0], null_map, res_values)
179 || executeNumber<Int32>(*offsets, *data_columns[0], null_map, res_values)
180 || executeNumber<Int64>(*offsets, *data_columns[0], null_map, res_values)
181 || executeNumber<Float32>(*offsets, *data_columns[0], null_map, res_values)
182 || executeNumber<Float64>(*offsets, *data_columns[0], null_map, res_values)
183 || executeFixedString(*offsets, *data_columns[0], null_map, res_values)
184 || executeString(*offsets, *data_columns[0], null_map, res_values)))
185 executeHashed(*offsets, data_columns, res_values);
186 }
187 else
188 {
189 if (!execute128bit(*offsets, data_columns, res_values))
190 executeHashed(*offsets, data_columns, res_values);
191 }
192
193 block.getByPosition(result).column = std::move(res);
194}
195
196template <typename Method, bool has_null_map>
197void FunctionArrayUniq::executeMethodImpl(
198 const ColumnArray::Offsets & offsets,
199 const ColumnRawPtrs & columns,
200 const Sizes & key_sizes,
201 [[maybe_unused]] const NullMap * null_map,
202 ColumnUInt32::Container & res_values)
203{
204 typename Method::Set set;
205 typename Method::Method method(columns, key_sizes, nullptr);
206 Arena pool; /// Won't use it;
207
208 ColumnArray::Offset prev_off = 0;
209 for (size_t i = 0; i < offsets.size(); ++i)
210 {
211 set.clear();
212 bool found_null = false;
213 ColumnArray::Offset off = offsets[i];
214 for (ColumnArray::Offset j = prev_off; j < off; ++j)
215 {
216 if constexpr (has_null_map)
217 {
218 if ((*null_map)[j])
219 {
220 found_null = true;
221 continue;
222 }
223 }
224
225 method.emplaceKey(set, j, pool);
226 }
227
228 res_values[i] = set.size() + found_null;
229 prev_off = off;
230 }
231}
232
233template <typename Method>
234void FunctionArrayUniq::executeMethod(
235 const ColumnArray::Offsets & offsets,
236 const ColumnRawPtrs & columns,
237 const Sizes & key_sizes,
238 const NullMap * null_map,
239 ColumnUInt32::Container & res_values)
240{
241 if (null_map)
242 executeMethodImpl<Method, true>(offsets, columns, key_sizes, null_map, res_values);
243 else
244 executeMethodImpl<Method, false>(offsets, columns, key_sizes, null_map, res_values);
245
246}
247
248template <typename T>
249bool FunctionArrayUniq::executeNumber(const ColumnArray::Offsets & offsets, const IColumn & data, const NullMap * null_map, ColumnUInt32::Container & res_values)
250{
251 const auto * nested = checkAndGetColumn<ColumnVector<T>>(&data);
252 if (!nested)
253 return false;
254
255 executeMethod<MethodOneNumber<T>>(offsets, {nested}, {}, null_map, res_values);
256 return true;
257}
258
259bool FunctionArrayUniq::executeString(const ColumnArray::Offsets & offsets, const IColumn & data, const NullMap * null_map, ColumnUInt32::Container & res_values)
260{
261 const auto * nested = checkAndGetColumn<ColumnString>(&data);
262 if (nested)
263 executeMethod<MethodString>(offsets, {nested}, {}, null_map, res_values);
264
265 return nested;
266}
267
268bool FunctionArrayUniq::executeFixedString(const ColumnArray::Offsets & offsets, const IColumn & data, const NullMap * null_map, ColumnUInt32::Container & res_values)
269{
270 const auto * nested = checkAndGetColumn<ColumnFixedString>(&data);
271 if (nested)
272 executeMethod<MethodFixedString>(offsets, {nested}, {}, null_map, res_values);
273
274 return nested;
275}
276
277bool FunctionArrayUniq::execute128bit(
278 const ColumnArray::Offsets & offsets,
279 const ColumnRawPtrs & columns,
280 ColumnUInt32::Container & res_values)
281{
282 size_t count = columns.size();
283 size_t keys_bytes = 0;
284 Sizes key_sizes(count);
285
286 for (size_t j = 0; j < count; ++j)
287 {
288 if (!columns[j]->isFixedAndContiguous())
289 return false;
290 key_sizes[j] = columns[j]->sizeOfValueIfFixed();
291 keys_bytes += key_sizes[j];
292 }
293
294 if (keys_bytes > 16)
295 return false;
296
297 executeMethod<MethodFixed>(offsets, columns, key_sizes, nullptr, res_values);
298 return true;
299}
300
301void FunctionArrayUniq::executeHashed(
302 const ColumnArray::Offsets & offsets,
303 const ColumnRawPtrs & columns,
304 ColumnUInt32::Container & res_values)
305{
306 executeMethod<MethodHashed>(offsets, columns, {}, nullptr, res_values);
307}
308
309
310void registerFunctionArrayUniq(FunctionFactory & factory)
311{
312 factory.registerFunction<FunctionArrayUniq>();
313}
314
315}
316