1#include <vector>
2#include <string>
3#include <iomanip>
4
5#include <Common/SipHash.h>
6#include <IO/ReadBufferFromFileDescriptor.h>
7#include <IO/ReadHelpers.h>
8#include <Common/Stopwatch.h>
9
10
11/** Test this way:
12 *
13 * clickhouse-client --query="SELECT SearchPhrase AS k FROM test.hits WHERE k != ''" > phrases.tsv
14 * clickhouse-client --query="SELECT URL AS k FROM test.hits" > urls.tsv
15 * clickhouse-client --query="SELECT SearchPhrase AS k FROM test.hits" > phrases_with_empty.tsv
16 * clickhouse-client --query="SELECT Title AS k FROM test.hits" > titles.tsv
17 * clickhouse-client --query="SELECT PageCharset AS k FROM test.hits" > charset.tsv
18 *
19 * for i in {1..1000}; do ./sip_hash_perf < titles.tsv 2>&1 | grep Processed | grep -oP '\d+\.\d+ rows/sec'; done | awk '{ if ($1 > x) { x = $1; print x } }'
20 */
21
22
23int main(int, char **)
24{
25 std::vector<std::string> data;
26 DB::ReadBufferFromFileDescriptor in(STDIN_FILENO);
27
28 std::cerr << std::fixed << std::setprecision(3);
29
30 {
31 Stopwatch watch;
32
33 while (!in.eof())
34 {
35 data.emplace_back();
36 DB::readEscapedString(data.back(), in);
37 DB::assertChar('\n', in);
38 }
39
40 double seconds = watch.elapsedSeconds();
41 std::cerr << "Read "
42 << data.size() << " rows, "
43 << (in.count() / 1048576.0) << " MiB "
44 << " in " << seconds << " sec., "
45 << (data.size() / seconds) << " rows/sec., "
46 << (in.count() / 1048576.0 / seconds) << " MiB/sec.\n";
47 }
48
49 {
50 size_t res = 0;
51 Stopwatch watch;
52
53 for (const auto & s : data)
54 {
55 SipHash hash;
56 hash.update(s.data(), s.size());
57 res += hash.get64();
58 }
59
60 double seconds = watch.elapsedSeconds();
61 std::cerr << "Processed "
62 << data.size() << " rows, "
63 << (in.count() / 1048576.0) << " MiB "
64 << " in " << seconds << " sec., "
65 << (data.size() / seconds) << " rows/sec., "
66 << (in.count() / 1048576.0 / seconds) << " MiB/sec. "
67 << "(res = " << res << ")\n";
68 }
69
70 return 0;
71}
72