1 | #include <Functions/IFunctionImpl.h> |
2 | #include <Functions/FunctionFactory.h> |
3 | #include <Functions/FunctionHelpers.h> |
4 | #include <DataTypes/DataTypeArray.h> |
5 | #include <DataTypes/DataTypeNullable.h> |
6 | #include <Columns/ColumnArray.h> |
7 | #include <Columns/ColumnNullable.h> |
8 | #include <Columns/ColumnString.h> |
9 | #include <Common/HashTable/ClearableHashSet.h> |
10 | #include <Common/SipHash.h> |
11 | #include <Common/assert_cast.h> |
12 | |
13 | |
14 | namespace DB |
15 | { |
16 | |
17 | namespace ErrorCodes |
18 | { |
19 | extern const int ILLEGAL_TYPE_OF_ARGUMENT; |
20 | } |
21 | |
22 | |
23 | /// Find different elements in an array. |
24 | class FunctionArrayDistinct : public IFunction |
25 | { |
26 | public: |
27 | static constexpr auto name = "arrayDistinct" ; |
28 | |
29 | static FunctionPtr create(const Context &) |
30 | { |
31 | return std::make_shared<FunctionArrayDistinct>(); |
32 | } |
33 | |
34 | String getName() const override |
35 | { |
36 | return name; |
37 | } |
38 | |
39 | bool isVariadic() const override { return false; } |
40 | |
41 | size_t getNumberOfArguments() const override { return 1; } |
42 | |
43 | bool useDefaultImplementationForConstants() const override { return true; } |
44 | |
45 | DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override |
46 | { |
47 | const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(arguments[0].get()); |
48 | if (!array_type) |
49 | throw Exception("Argument for function " + getName() + " must be array but it " |
50 | " has type " + arguments[0]->getName() + "." , |
51 | ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
52 | |
53 | auto nested_type = removeNullable(array_type->getNestedType()); |
54 | |
55 | return std::make_shared<DataTypeArray>(nested_type); |
56 | } |
57 | |
58 | void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override; |
59 | |
60 | private: |
61 | /// Initially allocate a piece of memory for 512 elements. NOTE: This is just a guess. |
62 | static constexpr size_t INITIAL_SIZE_DEGREE = 9; |
63 | |
64 | template <typename T> |
65 | bool executeNumber( |
66 | const IColumn & src_data, |
67 | const ColumnArray::Offsets & src_offsets, |
68 | IColumn & res_data_col, |
69 | ColumnArray::Offsets & res_offsets, |
70 | const ColumnNullable * nullable_col); |
71 | |
72 | bool executeString( |
73 | const IColumn & src_data, |
74 | const ColumnArray::Offsets & src_offsets, |
75 | IColumn & res_data_col, |
76 | ColumnArray::Offsets & res_offsets, |
77 | const ColumnNullable * nullable_col); |
78 | |
79 | void executeHashed( |
80 | const IColumn & src_data, |
81 | const ColumnArray::Offsets & src_offsets, |
82 | IColumn & res_data_col, |
83 | ColumnArray::Offsets & res_offsets, |
84 | const ColumnNullable * nullable_col); |
85 | }; |
86 | |
87 | |
88 | void FunctionArrayDistinct::executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) |
89 | { |
90 | ColumnPtr array_ptr = block.getByPosition(arguments[0]).column; |
91 | const ColumnArray * array = checkAndGetColumn<ColumnArray>(array_ptr.get()); |
92 | |
93 | const auto & return_type = block.getByPosition(result).type; |
94 | |
95 | auto res_ptr = return_type->createColumn(); |
96 | ColumnArray & res = assert_cast<ColumnArray &>(*res_ptr); |
97 | |
98 | const IColumn & src_data = array->getData(); |
99 | const ColumnArray::Offsets & offsets = array->getOffsets(); |
100 | |
101 | IColumn & res_data = res.getData(); |
102 | ColumnArray::Offsets & res_offsets = res.getOffsets(); |
103 | |
104 | const ColumnNullable * nullable_col = checkAndGetColumn<ColumnNullable>(src_data); |
105 | |
106 | const IColumn * inner_col; |
107 | |
108 | if (nullable_col) |
109 | { |
110 | inner_col = &nullable_col->getNestedColumn(); |
111 | } |
112 | else |
113 | { |
114 | inner_col = &src_data; |
115 | } |
116 | |
117 | if (!(executeNumber<UInt8>(*inner_col, offsets, res_data, res_offsets, nullable_col) |
118 | || executeNumber<UInt16>(*inner_col, offsets, res_data, res_offsets, nullable_col) |
119 | || executeNumber<UInt32>(*inner_col, offsets, res_data, res_offsets, nullable_col) |
120 | || executeNumber<UInt64>(*inner_col, offsets, res_data, res_offsets, nullable_col) |
121 | || executeNumber<Int8>(*inner_col, offsets, res_data, res_offsets, nullable_col) |
122 | || executeNumber<Int16>(*inner_col, offsets, res_data, res_offsets, nullable_col) |
123 | || executeNumber<Int32>(*inner_col, offsets, res_data, res_offsets, nullable_col) |
124 | || executeNumber<Int64>(*inner_col, offsets, res_data, res_offsets, nullable_col) |
125 | || executeNumber<Float32>(*inner_col, offsets, res_data, res_offsets, nullable_col) |
126 | || executeNumber<Float64>(*inner_col, offsets, res_data, res_offsets, nullable_col) |
127 | || executeString(*inner_col, offsets, res_data, res_offsets, nullable_col))) |
128 | executeHashed(*inner_col, offsets, res_data, res_offsets, nullable_col); |
129 | |
130 | block.getByPosition(result).column = std::move(res_ptr); |
131 | } |
132 | |
133 | template <typename T> |
134 | bool FunctionArrayDistinct::executeNumber( |
135 | const IColumn & src_data, |
136 | const ColumnArray::Offsets & src_offsets, |
137 | IColumn & res_data_col, |
138 | ColumnArray::Offsets & res_offsets, |
139 | const ColumnNullable * nullable_col) |
140 | { |
141 | const ColumnVector<T> * src_data_concrete = checkAndGetColumn<ColumnVector<T>>(&src_data); |
142 | |
143 | if (!src_data_concrete) |
144 | { |
145 | return false; |
146 | } |
147 | |
148 | const PaddedPODArray<T> & values = src_data_concrete->getData(); |
149 | PaddedPODArray<T> & res_data = typeid_cast<ColumnVector<T> &>(res_data_col).getData(); |
150 | |
151 | const PaddedPODArray<UInt8> * src_null_map = nullptr; |
152 | |
153 | if (nullable_col) |
154 | src_null_map = &nullable_col->getNullMapData(); |
155 | |
156 | using Set = ClearableHashSet<T, |
157 | DefaultHash<T>, |
158 | HashTableGrower<INITIAL_SIZE_DEGREE>, |
159 | HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(T)>>; |
160 | |
161 | Set set; |
162 | |
163 | ColumnArray::Offset prev_src_offset = 0; |
164 | ColumnArray::Offset res_offset = 0; |
165 | |
166 | for (ColumnArray::Offset i = 0; i < src_offsets.size(); ++i) |
167 | { |
168 | set.clear(); |
169 | |
170 | ColumnArray::Offset curr_src_offset = src_offsets[i]; |
171 | for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j) |
172 | { |
173 | if (nullable_col && (*src_null_map)[j]) |
174 | continue; |
175 | |
176 | if (!set.find(values[j])) |
177 | { |
178 | res_data.emplace_back(values[j]); |
179 | set.insert(values[j]); |
180 | } |
181 | } |
182 | |
183 | res_offset += set.size(); |
184 | res_offsets.emplace_back(res_offset); |
185 | |
186 | prev_src_offset = curr_src_offset; |
187 | } |
188 | return true; |
189 | } |
190 | |
191 | bool FunctionArrayDistinct::executeString( |
192 | const IColumn & src_data, |
193 | const ColumnArray::Offsets & src_offsets, |
194 | IColumn & res_data_col, |
195 | ColumnArray::Offsets & res_offsets, |
196 | const ColumnNullable * nullable_col) |
197 | { |
198 | const ColumnString * src_data_concrete = checkAndGetColumn<ColumnString>(&src_data); |
199 | |
200 | if (!src_data_concrete) |
201 | return false; |
202 | |
203 | ColumnString & res_data_column_string = typeid_cast<ColumnString &>(res_data_col); |
204 | |
205 | using Set = ClearableHashSet<StringRef, |
206 | StringRefHash, |
207 | HashTableGrower<INITIAL_SIZE_DEGREE>, |
208 | HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(StringRef)>>; |
209 | |
210 | const PaddedPODArray<UInt8> * src_null_map = nullptr; |
211 | |
212 | if (nullable_col) |
213 | src_null_map = &nullable_col->getNullMapData(); |
214 | |
215 | Set set; |
216 | |
217 | ColumnArray::Offset prev_src_offset = 0; |
218 | ColumnArray::Offset res_offset = 0; |
219 | |
220 | for (ColumnArray::Offset i = 0; i < src_offsets.size(); ++i) |
221 | { |
222 | set.clear(); |
223 | |
224 | ColumnArray::Offset curr_src_offset = src_offsets[i]; |
225 | for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j) |
226 | { |
227 | if (nullable_col && (*src_null_map)[j]) |
228 | continue; |
229 | |
230 | StringRef str_ref = src_data_concrete->getDataAt(j); |
231 | |
232 | if (!set.find(str_ref)) |
233 | { |
234 | set.insert(str_ref); |
235 | res_data_column_string.insertData(str_ref.data, str_ref.size); |
236 | } |
237 | } |
238 | |
239 | res_offset += set.size(); |
240 | res_offsets.emplace_back(res_offset); |
241 | |
242 | prev_src_offset = curr_src_offset; |
243 | } |
244 | return true; |
245 | } |
246 | |
247 | void FunctionArrayDistinct::executeHashed( |
248 | const IColumn & src_data, |
249 | const ColumnArray::Offsets & src_offsets, |
250 | IColumn & res_data_col, |
251 | ColumnArray::Offsets & res_offsets, |
252 | const ColumnNullable * nullable_col) |
253 | { |
254 | using Set = ClearableHashSet<UInt128, UInt128TrivialHash, HashTableGrower<INITIAL_SIZE_DEGREE>, |
255 | HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(UInt128)>>; |
256 | |
257 | const PaddedPODArray<UInt8> * src_null_map = nullptr; |
258 | |
259 | if (nullable_col) |
260 | src_null_map = &nullable_col->getNullMapData(); |
261 | |
262 | Set set; |
263 | |
264 | ColumnArray::Offset prev_src_offset = 0; |
265 | ColumnArray::Offset res_offset = 0; |
266 | |
267 | for (ColumnArray::Offset i = 0; i < src_offsets.size(); ++i) |
268 | { |
269 | set.clear(); |
270 | |
271 | ColumnArray::Offset curr_src_offset = src_offsets[i]; |
272 | for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j) |
273 | { |
274 | if (nullable_col && (*src_null_map)[j]) |
275 | continue; |
276 | |
277 | UInt128 hash; |
278 | SipHash hash_function; |
279 | src_data.updateHashWithValue(j, hash_function); |
280 | hash_function.get128(reinterpret_cast<char *>(&hash)); |
281 | |
282 | if (!set.find(hash)) |
283 | { |
284 | set.insert(hash); |
285 | res_data_col.insertFrom(src_data, j); |
286 | } |
287 | } |
288 | |
289 | res_offset += set.size(); |
290 | res_offsets.emplace_back(res_offset); |
291 | |
292 | prev_src_offset = curr_src_offset; |
293 | } |
294 | } |
295 | |
296 | |
297 | void registerFunctionArrayDistinct(FunctionFactory & factory) |
298 | { |
299 | factory.registerFunction<FunctionArrayDistinct>(); |
300 | } |
301 | |
302 | } |
303 | |