1#pragma once
2
3#include <city.h>
4#include <type_traits>
5
6#include <ext/bit_cast.h>
7
8#include <IO/WriteHelpers.h>
9#include <IO/ReadHelpers.h>
10
11#include <DataTypes/DataTypesNumber.h>
12#include <DataTypes/DataTypeTuple.h>
13
14#include <Interpreters/AggregationCommon.h>
15
16#include <Common/HashTable/HashSet.h>
17#include <Common/HyperLogLogWithSmallSetOptimization.h>
18#include <Common/CombinedCardinalityEstimator.h>
19#include <Common/typeid_cast.h>
20#include <Common/assert_cast.h>
21
22#include <AggregateFunctions/UniquesHashSet.h>
23#include <AggregateFunctions/IAggregateFunction.h>
24#include <AggregateFunctions/UniqVariadicHash.h>
25
26
27namespace DB
28{
29
30/// uniq
31
32struct AggregateFunctionUniqUniquesHashSetData
33{
34 using Set = UniquesHashSet<DefaultHash<UInt64>>;
35 Set set;
36
37 static String getName() { return "uniq"; }
38};
39
40/// For a function that takes multiple arguments. Such a function pre-hashes them in advance, so TrivialHash is used here.
41struct AggregateFunctionUniqUniquesHashSetDataForVariadic
42{
43 using Set = UniquesHashSet<TrivialHash>;
44 Set set;
45
46 static String getName() { return "uniq"; }
47};
48
49
50/// uniqHLL12
51
52template <typename T>
53struct AggregateFunctionUniqHLL12Data
54{
55 using Set = HyperLogLogWithSmallSetOptimization<T, 16, 12>;
56 Set set;
57
58 static String getName() { return "uniqHLL12"; }
59};
60
61template <>
62struct AggregateFunctionUniqHLL12Data<String>
63{
64 using Set = HyperLogLogWithSmallSetOptimization<UInt64, 16, 12>;
65 Set set;
66
67 static String getName() { return "uniqHLL12"; }
68};
69
70template <>
71struct AggregateFunctionUniqHLL12Data<UInt128>
72{
73 using Set = HyperLogLogWithSmallSetOptimization<UInt64, 16, 12>;
74 Set set;
75
76 static String getName() { return "uniqHLL12"; }
77};
78
79struct AggregateFunctionUniqHLL12DataForVariadic
80{
81 using Set = HyperLogLogWithSmallSetOptimization<UInt64, 16, 12, TrivialHash>;
82 Set set;
83
84 static String getName() { return "uniqHLL12"; }
85};
86
87
88/// uniqExact
89
90template <typename T>
91struct AggregateFunctionUniqExactData
92{
93 using Key = T;
94
95 /// When creating, the hash table must be small.
96 using Set = HashSet<
97 Key,
98 HashCRC32<Key>,
99 HashTableGrower<4>,
100 HashTableAllocatorWithStackMemory<sizeof(Key) * (1 << 4)>>;
101
102 Set set;
103
104 static String getName() { return "uniqExact"; }
105};
106
107/// For rows, we put the SipHash values (128 bits) into the hash table.
108template <>
109struct AggregateFunctionUniqExactData<String>
110{
111 using Key = UInt128;
112
113 /// When creating, the hash table must be small.
114 using Set = HashSet<
115 Key,
116 UInt128TrivialHash,
117 HashTableGrower<3>,
118 HashTableAllocatorWithStackMemory<sizeof(Key) * (1 << 3)>>;
119
120 Set set;
121
122 static String getName() { return "uniqExact"; }
123};
124
125
126namespace detail
127{
128
129/** Hash function for uniq.
130 */
131template <typename T> struct AggregateFunctionUniqTraits
132{
133 static UInt64 hash(T x) { return x; }
134};
135
136template <> struct AggregateFunctionUniqTraits<UInt128>
137{
138 static UInt64 hash(UInt128 x)
139 {
140 return sipHash64(x);
141 }
142};
143
144template <> struct AggregateFunctionUniqTraits<Float32>
145{
146 static UInt64 hash(Float32 x)
147 {
148 return ext::bit_cast<UInt64>(x);
149 }
150};
151
152template <> struct AggregateFunctionUniqTraits<Float64>
153{
154 static UInt64 hash(Float64 x)
155 {
156 return ext::bit_cast<UInt64>(x);
157 }
158};
159
160
161/** The structure for the delegation work to add one element to the `uniq` aggregate functions.
162 * Used for partial specialization to add strings.
163 */
164template <typename T, typename Data>
165struct OneAdder
166{
167 static void ALWAYS_INLINE add(Data & data, const IColumn & column, size_t row_num)
168 {
169 if constexpr (std::is_same_v<Data, AggregateFunctionUniqUniquesHashSetData>
170 || std::is_same_v<Data, AggregateFunctionUniqHLL12Data<T>>)
171 {
172 if constexpr (!std::is_same_v<T, String>)
173 {
174 const auto & value = assert_cast<const ColumnVector<T> &>(column).getElement(row_num);
175 data.set.insert(AggregateFunctionUniqTraits<T>::hash(value));
176 }
177 else
178 {
179 StringRef value = column.getDataAt(row_num);
180 data.set.insert(CityHash_v1_0_2::CityHash64(value.data, value.size));
181 }
182 }
183 else if constexpr (std::is_same_v<Data, AggregateFunctionUniqExactData<T>>)
184 {
185 if constexpr (!std::is_same_v<T, String>)
186 {
187 data.set.insert(assert_cast<const ColumnVector<T> &>(column).getData()[row_num]);
188 }
189 else
190 {
191 StringRef value = column.getDataAt(row_num);
192
193 UInt128 key;
194 SipHash hash;
195 hash.update(value.data, value.size);
196 hash.get128(key.low, key.high);
197
198 data.set.insert(key);
199 }
200 }
201 }
202};
203
204}
205
206
207/// Calculates the number of different values approximately or exactly.
208template <typename T, typename Data>
209class AggregateFunctionUniq final : public IAggregateFunctionDataHelper<Data, AggregateFunctionUniq<T, Data>>
210{
211public:
212 AggregateFunctionUniq(const DataTypes & argument_types_)
213 : IAggregateFunctionDataHelper<Data, AggregateFunctionUniq<T, Data>>(argument_types_, {}) {}
214
215 String getName() const override { return Data::getName(); }
216
217 DataTypePtr getReturnType() const override
218 {
219 return std::make_shared<DataTypeUInt64>();
220 }
221
222 /// ALWAYS_INLINE is required to have better code layout for uniqHLL12 function
223 void ALWAYS_INLINE add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena *) const override
224 {
225 detail::OneAdder<T, Data>::add(this->data(place), *columns[0], row_num);
226 }
227
228 void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override
229 {
230 this->data(place).set.merge(this->data(rhs).set);
231 }
232
233 void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override
234 {
235 this->data(place).set.write(buf);
236 }
237
238 void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena *) const override
239 {
240 this->data(place).set.read(buf);
241 }
242
243 void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const override
244 {
245 assert_cast<ColumnUInt64 &>(to).getData().push_back(this->data(place).set.size());
246 }
247};
248
249
250/** For multiple arguments. To compute, hashes them.
251 * You can pass multiple arguments as is; You can also pass one argument - a tuple.
252 * But (for the possibility of efficient implementation), you can not pass several arguments, among which there are tuples.
253 */
254template <typename Data, bool is_exact, bool argument_is_tuple>
255class AggregateFunctionUniqVariadic final : public IAggregateFunctionDataHelper<Data, AggregateFunctionUniqVariadic<Data, is_exact, argument_is_tuple>>
256{
257private:
258 size_t num_args = 0;
259
260public:
261 AggregateFunctionUniqVariadic(const DataTypes & arguments)
262 : IAggregateFunctionDataHelper<Data, AggregateFunctionUniqVariadic<Data, is_exact, argument_is_tuple>>(arguments, {})
263 {
264 if (argument_is_tuple)
265 num_args = typeid_cast<const DataTypeTuple &>(*arguments[0]).getElements().size();
266 else
267 num_args = arguments.size();
268 }
269
270 String getName() const override { return Data::getName(); }
271
272 DataTypePtr getReturnType() const override
273 {
274 return std::make_shared<DataTypeUInt64>();
275 }
276
277 void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena *) const override
278 {
279 this->data(place).set.insert(typename Data::Set::value_type(UniqVariadicHash<is_exact, argument_is_tuple>::apply(num_args, columns, row_num)));
280 }
281
282 void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override
283 {
284 this->data(place).set.merge(this->data(rhs).set);
285 }
286
287 void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override
288 {
289 this->data(place).set.write(buf);
290 }
291
292 void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena *) const override
293 {
294 this->data(place).set.read(buf);
295 }
296
297 void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const override
298 {
299 assert_cast<ColumnUInt64 &>(to).getData().push_back(this->data(place).set.size());
300 }
301};
302
303}
304