1#include <iomanip>
2#include <iostream>
3#include <vector>
4#include <Compression/CompressedReadBuffer.h>
5#include <Core/Types.h>
6#include <IO/ReadBufferFromFile.h>
7#include <IO/ReadHelpers.h>
8#include <Interpreters/AggregationCommon.h>
9#include <Common/HashTable/HashMap.h>
10#include <Common/HashTable/HashTableKeyHolder.h>
11#include <Common/HashTable/StringHashMap.h>
12#include <Common/Stopwatch.h>
13#include <common/StringRef.h>
14
15/**
16
17#include <fstream>
18#include <random>
19
20using namespace std;
21
22int main()
23{
24 std::string s;
25 std::random_device dev;
26 std::mt19937 rng(dev());
27 std::uniform_int_distribution<std::mt19937::result_type> dist(0, 25);
28 std::binomial_distribution<std::mt19937::result_type> binomial1(100, 0.01);
29 std::binomial_distribution<std::mt19937::result_type> binomial2(100, 0.02);
30 std::binomial_distribution<std::mt19937::result_type> binomial4(100, 0.04);
31 std::binomial_distribution<std::mt19937::result_type> binomial8(100, 0.08);
32 std::binomial_distribution<std::mt19937::result_type> binomial16(100, 0.16);
33 std::binomial_distribution<std::mt19937::result_type> binomial24(100, 0.24);
34 std::binomial_distribution<std::mt19937::result_type> binomial48(100, 0.48);
35 // 11GB
36 std::ofstream f("/tmp/terms.csv");
37 size_t l1, l2, l4, l8, l16, l24, l48;
38 for (auto n = 0ul; n < 1e8; ++n)
39 {
40 l1 = binomial1(rng) + 1;
41 l2 = binomial2(rng) + l1 + 1;
42 l4 = binomial4(rng) + l2 + 1;
43 l8 = binomial8(rng) + l4 + 1;
44 l16 = binomial16(rng) + l8 + 1;
45 l24 = binomial24(rng) + l16 + 1;
46 l48 = binomial48(rng) + l24 + 1;
47 s.resize(l48);
48 for (auto i = 0ul; i < l48 - 1; ++i)
49 s[i] = 'a' + dist(rng);
50 s[l1 - 1] = ',';
51 s[l2 - 1] = ',';
52 s[l4 - 1] = ',';
53 s[l8 - 1] = ',';
54 s[l16 - 1] = ',';
55 s[l24 - 1] = ',';
56 s[l48 - 1] = '\n';
57 f << s;
58 }
59 f.close();
60 return 0;
61}
62
63create table terms (term1 String, term2 String, term4 String, term8 String, term16 String, term24 String, term48 String) engine TinyLog;
64insert into terms select * from file('/tmp/terms.csv', CSV, 'a String, b String, c String, d String, e String, f String, g String');
65
66NOTE: for reliable test results, try isolating cpu cores and do python -m perf tune. Also bind numa nodes if any.
67# isolate cpu 18
68dir=/home/amos/git/chorigin/data/data/default/terms
69for file in term1 term2 term4 term8 term16 term24 term48; do
70 for size in 30000000 50000000 80000000 100000000; do
71 BEST_METHOD=0
72 BEST_RESULT=0
73 for method in {1..2}; do
74 echo -ne $file $size $method ''
75 numactl --membind=0 taskset -c 18 ./string_hash_map $size $method <"$dir"/"$file".bin 2>&1 | perl -nE 'say /([0-9\.]+) elem/g if /HashMap/' | tee /tmp/string_hash_map_res
76 CUR_RESULT=$(cat /tmp/string_hash_map_res | tr -d '.')
77 if [[ $CUR_RESULT -gt $BEST_RESULT ]]; then
78 BEST_METHOD=$method
79 BEST_RESULT=$CUR_RESULT
80 fi
81 done
82 echo Best: $BEST_METHOD - $BEST_RESULT
83 done
84done
85
86---------------------------
87
88term1 30000000 1 68785770.85 term2 30000000 1 42531788.83 term4 30000000 1 14759901.41 term8 30000000 1 8072903.47
89term1 30000000 2 40812128.16 term2 30000000 2 21352402.10 term4 30000000 2 9008907.80 term8 30000000 2 5822641.82
90Best: 1 - 6878577085 Best: 1 - 4253178883 Best: 1 - 1475990141 Best: 1 - 807290347
91term1 50000000 1 68027542.41 term2 50000000 1 40493742.80 term4 50000000 1 16827650.85 term8 50000000 1 7405230.14
92term1 50000000 2 37589806.02 term2 50000000 2 19362975.09 term4 50000000 2 8278094.11 term8 50000000 2 5106810.80
93Best: 1 - 6802754241 Best: 1 - 4049374280 Best: 1 - 1682765085 Best: 1 - 740523014
94term1 80000000 1 68651875.88 term2 80000000 1 38253695.50 term4 80000000 1 15847177.93 term8 80000000 1 7536319.25
95term1 80000000 2 38092189.20 term2 80000000 2 20287003.01 term4 80000000 2 9322770.34 term8 80000000 2 4355572.15
96Best: 1 - 6865187588 Best: 1 - 3825369550 Best: 1 - 1584717793 Best: 1 - 753631925
97term1 100000000 1 68641941.59 term2 100000000 1 39120834.79 term4 100000000 1 16773904.90 term8 100000000 1 7147146.55
98term1 100000000 2 38358006.72 term2 100000000 2 20629363.17 term4 100000000 2 9665201.92 term8 100000000 2 4728255.07
99Best: 1 - 6864194159 Best: 1 - 3912083479 Best: 1 - 1677390490 Best: 1 - 714714655
100
101
102term16 30000000 1 6823029.35 term24 30000000 1 5706271.14 term48 30000000 1 4695716.47
103term16 30000000 2 5672283.33 term24 30000000 2 5498855.56 term48 30000000 2 4860537.26
104Best: 1 - 682302935 Best: 1 - 570627114 Best: 2 - 486053726
105term16 50000000 1 6214581.25 term24 50000000 1 5249785.66 term48 50000000 1 4282606.12
106term16 50000000 2 4990361.44 term24 50000000 2 4855552.24 term48 50000000 2 4348923.29
107Best: 1 - 621458125 Best: 1 - 524978566 Best: 2 - 434892329
108term16 80000000 1 5382855.70 term24 80000000 1 4580133.04 term48 80000000 1 3779436.15
109term16 80000000 2 4282192.79 term24 80000000 2 4178791.09 term48 80000000 2 3788409.72
110Best: 1 - 538285570 Best: 1 - 458013304 Best: 2 - 378840972
111term16 100000000 1 5930103.42 term24 100000000 1 5030621.52 term48 100000000 1 4084666.73
112term16 100000000 2 4621719.60 term24 100000000 2 4499866.83 term48 100000000 2 4067029.31
113Best: 1 - 593010342 Best: 1 - 503062152 Best: 1 - 408466673
114
115*/
116
117
118using Value = uint64_t;
119
120template <typename Map>
121void NO_INLINE bench(const std::vector<StringRef> & data, DB::Arena &, const char * name)
122{
123 // warm up
124 /*
125 {
126 Map map;
127 typename Map::LookupResult it;
128 bool inserted;
129
130 for (size_t i = 0, size = data.size(); i < size; ++i)
131 {
132 auto key_holder = DB::ArenaKeyHolder{data[i], pool};
133 map.emplace(key_holder, it, inserted);
134 if (inserted)
135 it->getSecond() = 0;
136 ++it->getSecond();
137 }
138 }
139 */
140
141 std::cerr << "method " << name << std::endl;
142 for (auto t = 0ul; t < 7; ++t)
143 {
144 DB::Arena pool(128 * 1024 * 1024);
145 Stopwatch watch;
146 Map map;
147 typename Map::LookupResult it;
148 bool inserted;
149
150 for (size_t i = 0, size = data.size(); i < size; ++i)
151 {
152 map.emplace(DB::ArenaKeyHolder{data[i], pool}, it, inserted);
153 if (inserted)
154 it->getMapped() = 0;
155 ++it->getMapped();
156 }
157 watch.stop();
158
159 std::cerr << "arena-memory " << pool.size() + map.getBufferSizeInBytes() << std::endl;
160 std::cerr << "single-run " << std::setprecision(3)
161 << watch.elapsedSeconds() << std::endl;
162 }
163}
164
165/*
166template <typename Map>
167runFromFile()
168{
169 DB::ReadBufferFromFileDescriptor in1(STDIN_FILENO);
170 DB::CompressedReadBuffer in2(in1);
171
172 Map map;
173 DB::Arena pool(128 * 1024 * 1024);
174 for (size_t i = 0; i < n && !in2.eof(); ++i)
175 {
176 auto key = DB::readStringBinaryInto(pool, in2);
177
178 bool inserted;
179 Map::LookupResult mapped;
180 map.emplaceKeyHolder(DB::SerializedKeyHolder(key, pool), mapped, inserted);
181 }
182}
183
184template <typename Map>
185benchFromFile()
186{
187 double best_time = -1.;
188 for (auto t = 0ul; t < 50; ++t)
189 {
190 Stopwatch watch;
191 runFromFile();
192 watch.stop();
193
194 if (best_time < 0 || best_time > watch.elapsedSeconds())
195 {
196 best_time = watch.elapsedSeconds();
197 }
198 }
199
200 std::cerr << std::fixed << std::setprecision(2) << "HashMap (" << name << "). Elapsed: " << best_time << " (" << data.size() / best_time
201 << " elem/sec.)" << std::endl;
202}
203*/
204
205
206int main(int argc, char ** argv)
207{
208 if (argc < 3)
209 {
210 std::cerr << "Usage: program n m\n";
211 return 1;
212 }
213
214 size_t n = atoi(argv[1]);
215 size_t m = atoi(argv[2]);
216
217 DB::Arena pool(128 * 1024 * 1024);
218 std::vector<StringRef> data(n);
219
220 std::cerr << "sizeof(Key) = " << sizeof(StringRef) << ", sizeof(Value) = " << sizeof(Value) << std::endl;
221
222 {
223 Stopwatch watch;
224 DB::ReadBufferFromFileDescriptor in1(STDIN_FILENO);
225 DB::CompressedReadBuffer in2(in1);
226
227 std::string tmp;
228 for (size_t i = 0; i < n && !in2.eof(); ++i)
229 {
230 DB::readStringBinary(tmp, in2);
231 data[i] = StringRef(pool.insert(tmp.data(), tmp.size()), tmp.size());
232 }
233
234 watch.stop();
235 std::cerr << std::fixed << std::setprecision(2) << "Vector. Size: " << n << ", elapsed: " << watch.elapsedSeconds() << " ("
236 << n / watch.elapsedSeconds() << " elem/sec.)" << std::endl;
237 }
238
239 if (!m || m == 1)
240 bench<StringHashMap<Value>>(data, pool, "StringHashMap");
241 if (!m || m == 2)
242 bench<HashMapWithSavedHash<StringRef, Value>>(data, pool, "HashMapWithSavedHash");
243 if (!m || m == 3)
244 bench<HashMap<StringRef, Value>>(data, pool, "HashMap");
245 return 0;
246}
247