1 | #include <iomanip> |
2 | #include <iostream> |
3 | #include <vector> |
4 | #include <Compression/CompressedReadBuffer.h> |
5 | #include <Core/Types.h> |
6 | #include <IO/ReadBufferFromFile.h> |
7 | #include <IO/ReadHelpers.h> |
8 | #include <Interpreters/AggregationCommon.h> |
9 | #include <Common/HashTable/HashMap.h> |
10 | #include <Common/HashTable/HashTableKeyHolder.h> |
11 | #include <Common/HashTable/StringHashMap.h> |
12 | #include <Common/Stopwatch.h> |
13 | #include <common/StringRef.h> |
14 | |
15 | /** |
16 | |
17 | #include <fstream> |
18 | #include <random> |
19 | |
20 | using namespace std; |
21 | |
22 | int main() |
23 | { |
24 | std::string s; |
25 | std::random_device dev; |
26 | std::mt19937 rng(dev()); |
27 | std::uniform_int_distribution<std::mt19937::result_type> dist(0, 25); |
28 | std::binomial_distribution<std::mt19937::result_type> binomial1(100, 0.01); |
29 | std::binomial_distribution<std::mt19937::result_type> binomial2(100, 0.02); |
30 | std::binomial_distribution<std::mt19937::result_type> binomial4(100, 0.04); |
31 | std::binomial_distribution<std::mt19937::result_type> binomial8(100, 0.08); |
32 | std::binomial_distribution<std::mt19937::result_type> binomial16(100, 0.16); |
33 | std::binomial_distribution<std::mt19937::result_type> binomial24(100, 0.24); |
34 | std::binomial_distribution<std::mt19937::result_type> binomial48(100, 0.48); |
35 | // 11GB |
36 | std::ofstream f("/tmp/terms.csv"); |
37 | size_t l1, l2, l4, l8, l16, l24, l48; |
38 | for (auto n = 0ul; n < 1e8; ++n) |
39 | { |
40 | l1 = binomial1(rng) + 1; |
41 | l2 = binomial2(rng) + l1 + 1; |
42 | l4 = binomial4(rng) + l2 + 1; |
43 | l8 = binomial8(rng) + l4 + 1; |
44 | l16 = binomial16(rng) + l8 + 1; |
45 | l24 = binomial24(rng) + l16 + 1; |
46 | l48 = binomial48(rng) + l24 + 1; |
47 | s.resize(l48); |
48 | for (auto i = 0ul; i < l48 - 1; ++i) |
49 | s[i] = 'a' + dist(rng); |
50 | s[l1 - 1] = ','; |
51 | s[l2 - 1] = ','; |
52 | s[l4 - 1] = ','; |
53 | s[l8 - 1] = ','; |
54 | s[l16 - 1] = ','; |
55 | s[l24 - 1] = ','; |
56 | s[l48 - 1] = '\n'; |
57 | f << s; |
58 | } |
59 | f.close(); |
60 | return 0; |
61 | } |
62 | |
63 | create table terms (term1 String, term2 String, term4 String, term8 String, term16 String, term24 String, term48 String) engine TinyLog; |
64 | insert into terms select * from file('/tmp/terms.csv', CSV, 'a String, b String, c String, d String, e String, f String, g String'); |
65 | |
66 | NOTE: for reliable test results, try isolating cpu cores and do python -m perf tune. Also bind numa nodes if any. |
67 | # isolate cpu 18 |
68 | dir=/home/amos/git/chorigin/data/data/default/terms |
69 | for file in term1 term2 term4 term8 term16 term24 term48; do |
70 | for size in 30000000 50000000 80000000 100000000; do |
71 | BEST_METHOD=0 |
72 | BEST_RESULT=0 |
73 | for method in {1..2}; do |
74 | echo -ne $file $size $method '' |
75 | numactl --membind=0 taskset -c 18 ./string_hash_map $size $method <"$dir"/"$file".bin 2>&1 | perl -nE 'say /([0-9\.]+) elem/g if /HashMap/' | tee /tmp/string_hash_map_res |
76 | CUR_RESULT=$(cat /tmp/string_hash_map_res | tr -d '.') |
77 | if [[ $CUR_RESULT -gt $BEST_RESULT ]]; then |
78 | BEST_METHOD=$method |
79 | BEST_RESULT=$CUR_RESULT |
80 | fi |
81 | done |
82 | echo Best: $BEST_METHOD - $BEST_RESULT |
83 | done |
84 | done |
85 | |
86 | --------------------------- |
87 | |
88 | term1 30000000 1 68785770.85 term2 30000000 1 42531788.83 term4 30000000 1 14759901.41 term8 30000000 1 8072903.47 |
89 | term1 30000000 2 40812128.16 term2 30000000 2 21352402.10 term4 30000000 2 9008907.80 term8 30000000 2 5822641.82 |
90 | Best: 1 - 6878577085 Best: 1 - 4253178883 Best: 1 - 1475990141 Best: 1 - 807290347 |
91 | term1 50000000 1 68027542.41 term2 50000000 1 40493742.80 term4 50000000 1 16827650.85 term8 50000000 1 7405230.14 |
92 | term1 50000000 2 37589806.02 term2 50000000 2 19362975.09 term4 50000000 2 8278094.11 term8 50000000 2 5106810.80 |
93 | Best: 1 - 6802754241 Best: 1 - 4049374280 Best: 1 - 1682765085 Best: 1 - 740523014 |
94 | term1 80000000 1 68651875.88 term2 80000000 1 38253695.50 term4 80000000 1 15847177.93 term8 80000000 1 7536319.25 |
95 | term1 80000000 2 38092189.20 term2 80000000 2 20287003.01 term4 80000000 2 9322770.34 term8 80000000 2 4355572.15 |
96 | Best: 1 - 6865187588 Best: 1 - 3825369550 Best: 1 - 1584717793 Best: 1 - 753631925 |
97 | term1 100000000 1 68641941.59 term2 100000000 1 39120834.79 term4 100000000 1 16773904.90 term8 100000000 1 7147146.55 |
98 | term1 100000000 2 38358006.72 term2 100000000 2 20629363.17 term4 100000000 2 9665201.92 term8 100000000 2 4728255.07 |
99 | Best: 1 - 6864194159 Best: 1 - 3912083479 Best: 1 - 1677390490 Best: 1 - 714714655 |
100 | |
101 | |
102 | term16 30000000 1 6823029.35 term24 30000000 1 5706271.14 term48 30000000 1 4695716.47 |
103 | term16 30000000 2 5672283.33 term24 30000000 2 5498855.56 term48 30000000 2 4860537.26 |
104 | Best: 1 - 682302935 Best: 1 - 570627114 Best: 2 - 486053726 |
105 | term16 50000000 1 6214581.25 term24 50000000 1 5249785.66 term48 50000000 1 4282606.12 |
106 | term16 50000000 2 4990361.44 term24 50000000 2 4855552.24 term48 50000000 2 4348923.29 |
107 | Best: 1 - 621458125 Best: 1 - 524978566 Best: 2 - 434892329 |
108 | term16 80000000 1 5382855.70 term24 80000000 1 4580133.04 term48 80000000 1 3779436.15 |
109 | term16 80000000 2 4282192.79 term24 80000000 2 4178791.09 term48 80000000 2 3788409.72 |
110 | Best: 1 - 538285570 Best: 1 - 458013304 Best: 2 - 378840972 |
111 | term16 100000000 1 5930103.42 term24 100000000 1 5030621.52 term48 100000000 1 4084666.73 |
112 | term16 100000000 2 4621719.60 term24 100000000 2 4499866.83 term48 100000000 2 4067029.31 |
113 | Best: 1 - 593010342 Best: 1 - 503062152 Best: 1 - 408466673 |
114 | |
115 | */ |
116 | |
117 | |
118 | using Value = uint64_t; |
119 | |
120 | template <typename Map> |
121 | void NO_INLINE bench(const std::vector<StringRef> & data, DB::Arena &, const char * name) |
122 | { |
123 | // warm up |
124 | /* |
125 | { |
126 | Map map; |
127 | typename Map::LookupResult it; |
128 | bool inserted; |
129 | |
130 | for (size_t i = 0, size = data.size(); i < size; ++i) |
131 | { |
132 | auto key_holder = DB::ArenaKeyHolder{data[i], pool}; |
133 | map.emplace(key_holder, it, inserted); |
134 | if (inserted) |
135 | it->getSecond() = 0; |
136 | ++it->getSecond(); |
137 | } |
138 | } |
139 | */ |
140 | |
141 | std::cerr << "method " << name << std::endl; |
142 | for (auto t = 0ul; t < 7; ++t) |
143 | { |
144 | DB::Arena pool(128 * 1024 * 1024); |
145 | Stopwatch watch; |
146 | Map map; |
147 | typename Map::LookupResult it; |
148 | bool inserted; |
149 | |
150 | for (size_t i = 0, size = data.size(); i < size; ++i) |
151 | { |
152 | map.emplace(DB::ArenaKeyHolder{data[i], pool}, it, inserted); |
153 | if (inserted) |
154 | it->getMapped() = 0; |
155 | ++it->getMapped(); |
156 | } |
157 | watch.stop(); |
158 | |
159 | std::cerr << "arena-memory " << pool.size() + map.getBufferSizeInBytes() << std::endl; |
160 | std::cerr << "single-run " << std::setprecision(3) |
161 | << watch.elapsedSeconds() << std::endl; |
162 | } |
163 | } |
164 | |
165 | /* |
166 | template <typename Map> |
167 | runFromFile() |
168 | { |
169 | DB::ReadBufferFromFileDescriptor in1(STDIN_FILENO); |
170 | DB::CompressedReadBuffer in2(in1); |
171 | |
172 | Map map; |
173 | DB::Arena pool(128 * 1024 * 1024); |
174 | for (size_t i = 0; i < n && !in2.eof(); ++i) |
175 | { |
176 | auto key = DB::readStringBinaryInto(pool, in2); |
177 | |
178 | bool inserted; |
179 | Map::LookupResult mapped; |
180 | map.emplaceKeyHolder(DB::SerializedKeyHolder(key, pool), mapped, inserted); |
181 | } |
182 | } |
183 | |
184 | template <typename Map> |
185 | benchFromFile() |
186 | { |
187 | double best_time = -1.; |
188 | for (auto t = 0ul; t < 50; ++t) |
189 | { |
190 | Stopwatch watch; |
191 | runFromFile(); |
192 | watch.stop(); |
193 | |
194 | if (best_time < 0 || best_time > watch.elapsedSeconds()) |
195 | { |
196 | best_time = watch.elapsedSeconds(); |
197 | } |
198 | } |
199 | |
200 | std::cerr << std::fixed << std::setprecision(2) << "HashMap (" << name << "). Elapsed: " << best_time << " (" << data.size() / best_time |
201 | << " elem/sec.)" << std::endl; |
202 | } |
203 | */ |
204 | |
205 | |
206 | int main(int argc, char ** argv) |
207 | { |
208 | if (argc < 3) |
209 | { |
210 | std::cerr << "Usage: program n m\n" ; |
211 | return 1; |
212 | } |
213 | |
214 | size_t n = atoi(argv[1]); |
215 | size_t m = atoi(argv[2]); |
216 | |
217 | DB::Arena pool(128 * 1024 * 1024); |
218 | std::vector<StringRef> data(n); |
219 | |
220 | std::cerr << "sizeof(Key) = " << sizeof(StringRef) << ", sizeof(Value) = " << sizeof(Value) << std::endl; |
221 | |
222 | { |
223 | Stopwatch watch; |
224 | DB::ReadBufferFromFileDescriptor in1(STDIN_FILENO); |
225 | DB::CompressedReadBuffer in2(in1); |
226 | |
227 | std::string tmp; |
228 | for (size_t i = 0; i < n && !in2.eof(); ++i) |
229 | { |
230 | DB::readStringBinary(tmp, in2); |
231 | data[i] = StringRef(pool.insert(tmp.data(), tmp.size()), tmp.size()); |
232 | } |
233 | |
234 | watch.stop(); |
235 | std::cerr << std::fixed << std::setprecision(2) << "Vector. Size: " << n << ", elapsed: " << watch.elapsedSeconds() << " (" |
236 | << n / watch.elapsedSeconds() << " elem/sec.)" << std::endl; |
237 | } |
238 | |
239 | if (!m || m == 1) |
240 | bench<StringHashMap<Value>>(data, pool, "StringHashMap" ); |
241 | if (!m || m == 2) |
242 | bench<HashMapWithSavedHash<StringRef, Value>>(data, pool, "HashMapWithSavedHash" ); |
243 | if (!m || m == 3) |
244 | bench<HashMap<StringRef, Value>>(data, pool, "HashMap" ); |
245 | return 0; |
246 | } |
247 | |