1 | #include <vector> |
2 | #include <string> |
3 | #include <iomanip> |
4 | |
5 | #include <Common/SipHash.h> |
6 | #include <IO/ReadBufferFromFileDescriptor.h> |
7 | #include <IO/ReadHelpers.h> |
8 | #include <Common/Stopwatch.h> |
9 | |
10 | |
11 | /** Test this way: |
12 | * |
13 | * clickhouse-client --query="SELECT SearchPhrase AS k FROM test.hits WHERE k != ''" > phrases.tsv |
14 | * clickhouse-client --query="SELECT URL AS k FROM test.hits" > urls.tsv |
15 | * clickhouse-client --query="SELECT SearchPhrase AS k FROM test.hits" > phrases_with_empty.tsv |
16 | * clickhouse-client --query="SELECT Title AS k FROM test.hits" > titles.tsv |
17 | * clickhouse-client --query="SELECT PageCharset AS k FROM test.hits" > charset.tsv |
18 | * |
19 | * for i in {1..1000}; do ./sip_hash_perf < titles.tsv 2>&1 | grep Processed | grep -oP '\d+\.\d+ rows/sec'; done | awk '{ if ($1 > x) { x = $1; print x } }' |
20 | */ |
21 | |
22 | |
23 | int main(int, char **) |
24 | { |
25 | std::vector<std::string> data; |
26 | DB::ReadBufferFromFileDescriptor in(STDIN_FILENO); |
27 | |
28 | std::cerr << std::fixed << std::setprecision(3); |
29 | |
30 | { |
31 | Stopwatch watch; |
32 | |
33 | while (!in.eof()) |
34 | { |
35 | data.emplace_back(); |
36 | DB::readEscapedString(data.back(), in); |
37 | DB::assertChar('\n', in); |
38 | } |
39 | |
40 | double seconds = watch.elapsedSeconds(); |
41 | std::cerr << "Read " |
42 | << data.size() << " rows, " |
43 | << (in.count() / 1048576.0) << " MiB " |
44 | << " in " << seconds << " sec., " |
45 | << (data.size() / seconds) << " rows/sec., " |
46 | << (in.count() / 1048576.0 / seconds) << " MiB/sec.\n" ; |
47 | } |
48 | |
49 | { |
50 | size_t res = 0; |
51 | Stopwatch watch; |
52 | |
53 | for (const auto & s : data) |
54 | { |
55 | SipHash hash; |
56 | hash.update(s.data(), s.size()); |
57 | res += hash.get64(); |
58 | } |
59 | |
60 | double seconds = watch.elapsedSeconds(); |
61 | std::cerr << "Processed " |
62 | << data.size() << " rows, " |
63 | << (in.count() / 1048576.0) << " MiB " |
64 | << " in " << seconds << " sec., " |
65 | << (data.size() / seconds) << " rows/sec., " |
66 | << (in.count() / 1048576.0 / seconds) << " MiB/sec. " |
67 | << "(res = " << res << ")\n" ; |
68 | } |
69 | |
70 | return 0; |
71 | } |
72 | |