1#include <iostream>
2#include <iomanip>
3#include <vector>
4
5#include <unordered_map>
6
7#include <sparsehash/dense_hash_map>
8#include <sparsehash/sparse_hash_map>
9
10#include <Common/Stopwatch.h>
11/*
12#define DBMS_HASH_MAP_COUNT_COLLISIONS
13*/
14#include <Core/Types.h>
15#include <Core/Row.h>
16#include <IO/ReadBufferFromFile.h>
17#include <Compression/CompressedReadBuffer.h>
18#include <Common/HashTable/HashMap.h>
19#include <AggregateFunctions/IAggregateFunction.h>
20#include <AggregateFunctions/AggregateFunctionFactory.h>
21#include <DataTypes/DataTypesNumber.h>
22
23
24/** The test checks the speed of hash tables, simulating their use for aggregation.
25 * The first argument specifies the number of elements to be inserted.
26 * The second argument can be a number from 1 to 4 - the number of the data structure being tested.
27 * This is important, because if you run all the tests one by one, the results will be incorrect.
28 * (Due to the peculiarities of the work of the allocator, the first test takes advantage.)
29 *
30 * Depending on USE_AUTO_ARRAY, one of the structures is selected as the value.
31 * USE_AUTO_ARRAY = 0 - uses std::vector (hard-copy structure, sizeof = 24 bytes).
32 * USE_AUTO_ARRAY = 1 - uses AutoArray (a structure specially designed for such cases, sizeof = 8 bytes).
33 *
34 * That is, the test also allows you to compare AutoArray and std::vector.
35 *
36 * If USE_AUTO_ARRAY = 0, then HashMap confidently overtakes all.
37 * If USE_AUTO_ARRAY = 1, then HashMap is slightly less serious (20%) ahead of google::dense_hash_map.
38 *
39 * When using HashMap, AutoArray has a rather serious (40%) advantage over std::vector.
40 * And when using other hash tables, AutoArray even more seriously overtakes std::vector
41 * (up to three and a half times in the case of std::unordered_map and google::sparse_hash_map).
42 *
43 * HashMap, unlike google::dense_hash_map, much more depends on the quality of the hash function.
44 *
45 * PS. Measure everything yourself, otherwise I'm almost confused.
46 *
47 * PPS. Now the aggregation does not use an array of aggregate functions as values.
48 * States of aggregate functions were separated from the interface to manipulate them, and put in the pool.
49 * But in this test, there was something similar to the old scenario of using hash tables in the aggregation.
50 */
51
52#define USE_AUTO_ARRAY 0
53
54
55struct AlternativeHash
56{
57 size_t operator() (UInt64 x) const
58 {
59 x ^= x >> 23;
60 x *= 0x2127599bf4325c37ULL;
61 x ^= x >> 47;
62
63 return x;
64 }
65};
66
67
68#if defined(__x86_64__)
69
70struct CRC32Hash_
71{
72 size_t operator() (UInt64 x) const
73 {
74 UInt64 crc = -1ULL;
75 asm("crc32q %[x], %[crc]\n" : [crc] "+r" (crc) : [x] "rm" (x));
76 return crc;
77 }
78};
79
80#endif
81
82
83int main(int argc, char ** argv)
84{
85 using namespace DB;
86
87 using Key = UInt64;
88
89#if USE_AUTO_ARRAY
90 using Value = AutoArray<IAggregateFunction*>;
91#else
92 using Value = std::vector<IAggregateFunction*>;
93#endif
94
95 size_t n = argc < 2 ? 10000000 : atoi(argv[1]);
96 //size_t m = atoi(argv[2]);
97
98 AggregateFunctionFactory factory;
99 DataTypes data_types_empty;
100 DataTypes data_types_uint64;
101 data_types_uint64.push_back(std::make_shared<DataTypeUInt64>());
102
103 std::vector<Key> data(n);
104 Value value;
105
106 AggregateFunctionPtr func_count = factory.get("count", data_types_empty);
107 AggregateFunctionPtr func_avg = factory.get("avg", data_types_uint64);
108 AggregateFunctionPtr func_uniq = factory.get("uniq", data_types_uint64);
109
110 #define INIT \
111 { \
112 value.resize(3); \
113 \
114 value[0] = func_count.get(); \
115 value[1] = func_avg.get(); \
116 value[2] = func_uniq.get(); \
117 }
118
119 INIT
120
121#ifndef USE_AUTO_ARRAY
122 #undef INIT
123 #define INIT
124#endif
125
126 Row row(1);
127 row[0] = UInt64(0);
128
129 std::cerr << "sizeof(Key) = " << sizeof(Key) << ", sizeof(Value) = " << sizeof(Value) << std::endl;
130
131 {
132 Stopwatch watch;
133 /* for (size_t i = 0; i < n; ++i)
134 data[i] = rand() % m;
135
136 for (size_t i = 0; i < n; i += 10)
137 data[i] = 0;*/
138
139 ReadBufferFromFile in1("UniqID.bin");
140 CompressedReadBuffer in2(in1);
141
142 in2.readStrict(reinterpret_cast<char*>(data.data()), sizeof(data[0]) * n);
143
144 watch.stop();
145 std::cerr << std::fixed << std::setprecision(2)
146 << "Vector. Size: " << n
147 << ", elapsed: " << watch.elapsedSeconds()
148 << " (" << n / watch.elapsedSeconds() << " elem/sec.)"
149 << std::endl;
150 }
151
152 if (argc < 3 || atoi(argv[2]) == 1)
153 {
154 Stopwatch watch;
155
156 HashMap<Key, Value> map;
157 HashMap<Key, Value>::LookupResult it;
158 bool inserted;
159
160 for (size_t i = 0; i < n; ++i)
161 {
162 map.emplace(data[i], it, inserted);
163 if (inserted)
164 {
165 new (&it->getMapped()) Value;
166 std::swap(it->getMapped(), value);
167 INIT
168 }
169 }
170
171 watch.stop();
172 std::cerr << std::fixed << std::setprecision(2)
173 << "HashMap. Size: " << map.size()
174 << ", elapsed: " << watch.elapsedSeconds()
175 << " (" << n / watch.elapsedSeconds() << " elem/sec.)"
176#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
177 << ", collisions: " << map.getCollisions()
178#endif
179 << std::endl;
180 }
181
182 if (argc < 3 || atoi(argv[2]) == 2)
183 {
184 Stopwatch watch;
185
186 using Map = HashMap<Key, Value, AlternativeHash>;
187 Map map;
188 Map::LookupResult it;
189 bool inserted;
190
191 for (size_t i = 0; i < n; ++i)
192 {
193 map.emplace(data[i], it, inserted);
194 if (inserted)
195 {
196 new (&it->getMapped()) Value;
197 std::swap(it->getMapped(), value);
198 INIT
199 }
200 }
201
202 watch.stop();
203 std::cerr << std::fixed << std::setprecision(2)
204 << "HashMap, AlternativeHash. Size: " << map.size()
205 << ", elapsed: " << watch.elapsedSeconds()
206 << " (" << n / watch.elapsedSeconds() << " elem/sec.)"
207#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
208 << ", collisions: " << map.getCollisions()
209#endif
210 << std::endl;
211 }
212
213#if defined(__x86_64__)
214 if (argc < 3 || atoi(argv[2]) == 3)
215 {
216 Stopwatch watch;
217
218 using Map = HashMap<Key, Value, CRC32Hash_>;
219 Map map;
220 Map::LookupResult it;
221 bool inserted;
222
223 for (size_t i = 0; i < n; ++i)
224 {
225 map.emplace(data[i], it, inserted);
226 if (inserted)
227 {
228 new (&it->getMapped()) Value;
229 std::swap(it->getMapped(), value);
230 INIT
231 }
232 }
233
234 watch.stop();
235 std::cerr << std::fixed << std::setprecision(2)
236 << "HashMap, CRC32Hash. Size: " << map.size()
237 << ", elapsed: " << watch.elapsedSeconds()
238 << " (" << n / watch.elapsedSeconds() << " elem/sec.)"
239#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
240 << ", collisions: " << map.getCollisions()
241#endif
242 << std::endl;
243 }
244#endif
245
246 if (argc < 3 || atoi(argv[2]) == 4)
247 {
248 Stopwatch watch;
249
250 std::unordered_map<Key, Value, DefaultHash<Key>> map;
251 std::unordered_map<Key, Value, DefaultHash<Key>>::iterator it;
252 for (size_t i = 0; i < n; ++i)
253 {
254 it = map.insert(std::make_pair(data[i], value)).first;
255 INIT
256 }
257
258 watch.stop();
259 std::cerr << std::fixed << std::setprecision(2)
260 << "std::unordered_map. Size: " << map.size()
261 << ", elapsed: " << watch.elapsedSeconds()
262 << " (" << n / watch.elapsedSeconds() << " elem/sec.)"
263 << std::endl;
264 }
265
266 if (argc < 3 || atoi(argv[2]) == 5)
267 {
268 Stopwatch watch;
269
270 ::google::dense_hash_map<Key, Value, DefaultHash<Key>> map;
271 ::google::dense_hash_map<Key, Value, DefaultHash<Key>>::iterator it;
272 map.set_empty_key(-1ULL);
273 for (size_t i = 0; i < n; ++i)
274 {
275 it = map.insert(std::make_pair(data[i], value)).first;
276 INIT
277 }
278
279 watch.stop();
280 std::cerr << std::fixed << std::setprecision(2)
281 << "google::dense_hash_map. Size: " << map.size()
282 << ", elapsed: " << watch.elapsedSeconds()
283 << " (" << n / watch.elapsedSeconds() << " elem/sec.)"
284 << std::endl;
285 }
286
287 if (argc < 3 || atoi(argv[2]) == 6)
288 {
289 Stopwatch watch;
290
291 ::google::sparse_hash_map<Key, Value, DefaultHash<Key>> map;
292 ::google::sparse_hash_map<Key, Value, DefaultHash<Key>>::iterator it;
293 for (size_t i = 0; i < n; ++i)
294 {
295 map.insert(std::make_pair(data[i], value));
296 INIT
297 }
298
299 watch.stop();
300 std::cerr << std::fixed << std::setprecision(2)
301 << "google::sparse_hash_map. Size: " << map.size()
302 << ", elapsed: " << watch.elapsedSeconds()
303 << " (" << n / watch.elapsedSeconds() << " elem/sec.)"
304 << std::endl;
305 }
306
307 return 0;
308}
309