hash_map.cpp source code [ClickHouse/dbms/src/Interpreters/tests/hash_map.cpp]

1	#include <iostream>
2	#include <iomanip>
3	#include <vector>
4
5	#include <unordered_map>
6
7	#include <sparsehash/dense_hash_map>
8	#include <sparsehash/sparse_hash_map>
9
10	#include <Common/Stopwatch.h>
11	/*
12	#define DBMS_HASH_MAP_COUNT_COLLISIONS
13	*/
14	#include <Core/Types.h>
15	#include <Core/Row.h>
16	#include <IO/ReadBufferFromFile.h>
17	#include <Compression/CompressedReadBuffer.h>
18	#include <Common/HashTable/HashMap.h>
19	#include <AggregateFunctions/IAggregateFunction.h>
20	#include <AggregateFunctions/AggregateFunctionFactory.h>
21	#include <DataTypes/DataTypesNumber.h>
22
23
24	/* The test checks the speed of hash tables, simulating their use for aggregation.*
25	* The first argument specifies the number of elements to be inserted.
26	* The second argument can be a number from 1 to 4 - the number of the data structure being tested.
27	* This is important, because if you run all the tests one by one, the results will be incorrect.
28	* (Due to the peculiarities of the work of the allocator, the first test takes advantage.)
29	*
30	* Depending on USE_AUTO_ARRAY, one of the structures is selected as the value.
31	* USE_AUTO_ARRAY = 0 - uses std::vector (hard-copy structure, sizeof = 24 bytes).
32	* USE_AUTO_ARRAY = 1 - uses AutoArray (a structure specially designed for such cases, sizeof = 8 bytes).
33	*
34	* That is, the test also allows you to compare AutoArray and std::vector.
35	*
36	* If USE_AUTO_ARRAY = 0, then HashMap confidently overtakes all.
37	* If USE_AUTO_ARRAY = 1, then HashMap is slightly less serious (20%) ahead of google::dense_hash_map.
38	*
39	* When using HashMap, AutoArray has a rather serious (40%) advantage over std::vector.
40	* And when using other hash tables, AutoArray even more seriously overtakes std::vector
41	* (up to three and a half times in the case of std::unordered_map and google::sparse_hash_map).
42	*
43	* HashMap, unlike google::dense_hash_map, much more depends on the quality of the hash function.
44	*
45	* PS. Measure everything yourself, otherwise I'm almost confused.
46	*
47	* PPS. Now the aggregation does not use an array of aggregate functions as values.
48	* States of aggregate functions were separated from the interface to manipulate them, and put in the pool.
49	* But in this test, there was something similar to the old scenario of using hash tables in the aggregation.
50	*/
51
52	#define USE_AUTO_ARRAY 0
53
54
55	struct AlternativeHash
56	{
57	size_t operator() (UInt64 x) const
58	{
59	x ^= x >> `23`;
60	x *= `0x2127599bf4325c37ULL`;
61	x ^= x >> `47`;
62
63	return x;
64	}
65	};
66
67
68	#if defined(__x86_64__)
69
70	struct CRC32Hash_
71	{
72	size_t operator() (UInt64 x) const
73	{
74	UInt64 crc = -`1ULL`;
75	asm("crc32q %[x], %[crc]\n" : [crc] "+r" (crc) : [x] "rm" (x));
76	return crc;
77	}
78	};
79
80	#endif
81
82
83	int main(int argc, char ** argv)
84	{
85	using namespace DB;
86
87	using Key = UInt64;
88
89	#if USE_AUTO_ARRAY
90	using Value = AutoArray<IAggregateFunction*>;
91	#else
92	using Value = std::vector<IAggregateFunction*>;
93	#endif
94
95	size_t n = argc < `2` ? `10000000` : atoi(argv[`1`]);
96	//size_t m = atoi(argv[2]);
97
98	AggregateFunctionFactory factory;
99	DataTypes data_types_empty;
100	DataTypes data_types_uint64;
101	data_types_uint64.push_back(std::make_shared<DataTypeUInt64>());
102
103	std::vector<Key> data(n);
104	Value value;
105
106	AggregateFunctionPtr func_count = factory.get("count", data_types_empty);
107	AggregateFunctionPtr func_avg = factory.get("avg", data_types_uint64);
108	AggregateFunctionPtr func_uniq = factory.get("uniq", data_types_uint64);
109
110	#define INIT \
111	{ \
112	value.resize(3); \
113	\
114	value[0] = func_count.get(); \
115	value[1] = func_avg.get(); \
116	value[2] = func_uniq.get(); \
117	}
118
119	INIT
120
121	#ifndef USE_AUTO_ARRAY
122	#undef INIT
123	#define INIT
124	#endif
125
126	Row row(`1`);
127	row [`0`] = UInt64(`0`);
128
129	std::cerr << "sizeof(Key) = " << sizeof(Key) << ", sizeof(Value) = " << sizeof(Value) << std::endl;
130
131	{
132	Stopwatch watch;
133	/ for (size_t i = 0; i < n; ++i)*
134	data[i] = rand() % m;
135
136	for (size_t i = 0; i < n; i += 10)
137	data[i] = 0;/*
138
139	ReadBufferFromFile in1("UniqID.bin");
140	CompressedReadBuffer in2(in1);
141
142	in2.readStrict(reinterpret_cast<char>(data.data()), sizeof(data [`0`]) n);
143
144	watch.stop();
145	std::cerr << std::fixed << std::setprecision(`2`)
146	<< "Vector. Size: " << n
147	<< ", elapsed: " << watch.elapsedSeconds()
148	<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"
149	<< std::endl;
150	}
151
152	if (argc < `3` \|\| atoi(argv[`2`]) == `1`)
153	{
154	Stopwatch watch;
155
156	HashMap<Key, Value> map;
157	HashMap<Key, Value>::LookupResult it;
158	bool inserted;
159
160	for (size_t i = `0`; i < n; ++i)
161	{
162	map.emplace(data [i], it, inserted);
163	if (inserted)
164	{
165	new (&it->getMapped()) Value;
166	std::swap(it->getMapped(), value);
167	INIT
168	}
169	}
170
171	watch.stop();
172	std::cerr << std::fixed << std::setprecision(`2`)
173	<< "HashMap. Size: " << map.size()
174	<< ", elapsed: " << watch.elapsedSeconds()
175	<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"
176	#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
177	<< ", collisions: " << map.getCollisions()
178	#endif
179	<< std::endl;
180	}
181
182	if (argc < `3` \|\| atoi(argv[`2`]) == `2`)
183	{
184	Stopwatch watch;
185
186	using Map = HashMap<Key, Value, AlternativeHash>;
187	Map map;
188	Map::LookupResult it;
189	bool inserted;
190
191	for (size_t i = `0`; i < n; ++i)
192	{
193	map.emplace(data [i], it, inserted);
194	if (inserted)
195	{
196	new (&it->getMapped()) Value;
197	std::swap(it->getMapped(), value);
198	INIT
199	}
200	}
201
202	watch.stop();
203	std::cerr << std::fixed << std::setprecision(`2`)
204	<< "HashMap, AlternativeHash. Size: " << map.size()
205	<< ", elapsed: " << watch.elapsedSeconds()
206	<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"
207	#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
208	<< ", collisions: " << map.getCollisions()
209	#endif
210	<< std::endl;
211	}
212
213	#if defined(__x86_64__)
214	if (argc < `3` \|\| atoi(argv[`2`]) == `3`)
215	{
216	Stopwatch watch;
217
218	using Map = HashMap<Key, Value, CRC32Hash_>;
219	Map map;
220	Map::LookupResult it;
221	bool inserted;
222
223	for (size_t i = `0`; i < n; ++i)
224	{
225	map.emplace(data [i], it, inserted);
226	if (inserted)
227	{
228	new (&it->getMapped()) Value;
229	std::swap(it->getMapped(), value);
230	INIT
231	}
232	}
233
234	watch.stop();
235	std::cerr << std::fixed << std::setprecision(`2`)
236	<< "HashMap, CRC32Hash. Size: " << map.size()
237	<< ", elapsed: " << watch.elapsedSeconds()
238	<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"
239	#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
240	<< ", collisions: " << map.getCollisions()
241	#endif
242	<< std::endl;
243	}
244	#endif
245
246	if (argc < `3` \|\| atoi(argv[`2`]) == `4`)
247	{
248	Stopwatch watch;
249
250	std::unordered_map<Key, Value, DefaultHash<Key>> map;
251	std::unordered_map<Key, Value, DefaultHash<Key>>::iterator it;
252	for (size_t i = `0`; i < n; ++i)
253	{
254	it = map.insert(std::make_pair(data [i], value)).first;
255	INIT
256	}
257
258	watch.stop();
259	std::cerr << std::fixed << std::setprecision(`2`)
260	<< "std::unordered_map. Size: " << map.size()
261	<< ", elapsed: " << watch.elapsedSeconds()
262	<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"
263	<< std::endl;
264	}
265
266	if (argc < `3` \|\| atoi(argv[`2`]) == `5`)
267	{
268	Stopwatch watch;
269
270	::google::dense_hash_map<Key, Value, DefaultHash<Key>> map;
271	::google::dense_hash_map<Key, Value, DefaultHash<Key>>::iterator it;
272	map.set_empty_key(-`1ULL`);
273	for (size_t i = `0`; i < n; ++i)
274	{
275	it = map.insert(std::make_pair(data [i], value)).first;
276	INIT
277	}
278
279	watch.stop();
280	std::cerr << std::fixed << std::setprecision(`2`)
281	<< "google::dense_hash_map. Size: " << map.size()
282	<< ", elapsed: " << watch.elapsedSeconds()
283	<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"
284	<< std::endl;
285	}
286
287	if (argc < `3` \|\| atoi(argv[`2`]) == `6`)
288	{
289	Stopwatch watch;
290
291	::google::sparse_hash_map<Key, Value, DefaultHash<Key>> map;
292	::google::sparse_hash_map<Key, Value, DefaultHash<Key>>::iterator it;
293	for (size_t i = `0`; i < n; ++i)
294	{
295	map.insert(std::make_pair(data [i], value));
296	INIT
297	}
298
299	watch.stop();
300	std::cerr << std::fixed << std::setprecision(`2`)
301	<< "google::sparse_hash_map. Size: " << map.size()
302	<< ", elapsed: " << watch.elapsedSeconds()
303	<< " (" << n / watch.elapsedSeconds() << " elem/sec.)"
304	<< std::endl;
305	}
306
307	return `0`;
308	}
309

Browse the source code of ClickHouse/dbms/src/Interpreters/tests/hash_map.cpp