| 1 | #include <vector> |
| 2 | #include <string> |
| 3 | #include <iomanip> |
| 4 | |
| 5 | #include <Common/SipHash.h> |
| 6 | #include <IO/ReadBufferFromFileDescriptor.h> |
| 7 | #include <IO/ReadHelpers.h> |
| 8 | #include <Common/Stopwatch.h> |
| 9 | |
| 10 | |
| 11 | /** Test this way: |
| 12 | * |
| 13 | * clickhouse-client --query="SELECT SearchPhrase AS k FROM test.hits WHERE k != ''" > phrases.tsv |
| 14 | * clickhouse-client --query="SELECT URL AS k FROM test.hits" > urls.tsv |
| 15 | * clickhouse-client --query="SELECT SearchPhrase AS k FROM test.hits" > phrases_with_empty.tsv |
| 16 | * clickhouse-client --query="SELECT Title AS k FROM test.hits" > titles.tsv |
| 17 | * clickhouse-client --query="SELECT PageCharset AS k FROM test.hits" > charset.tsv |
| 18 | * |
| 19 | * for i in {1..1000}; do ./sip_hash_perf < titles.tsv 2>&1 | grep Processed | grep -oP '\d+\.\d+ rows/sec'; done | awk '{ if ($1 > x) { x = $1; print x } }' |
| 20 | */ |
| 21 | |
| 22 | |
| 23 | int main(int, char **) |
| 24 | { |
| 25 | std::vector<std::string> data; |
| 26 | DB::ReadBufferFromFileDescriptor in(STDIN_FILENO); |
| 27 | |
| 28 | std::cerr << std::fixed << std::setprecision(3); |
| 29 | |
| 30 | { |
| 31 | Stopwatch watch; |
| 32 | |
| 33 | while (!in.eof()) |
| 34 | { |
| 35 | data.emplace_back(); |
| 36 | DB::readEscapedString(data.back(), in); |
| 37 | DB::assertChar('\n', in); |
| 38 | } |
| 39 | |
| 40 | double seconds = watch.elapsedSeconds(); |
| 41 | std::cerr << "Read " |
| 42 | << data.size() << " rows, " |
| 43 | << (in.count() / 1048576.0) << " MiB " |
| 44 | << " in " << seconds << " sec., " |
| 45 | << (data.size() / seconds) << " rows/sec., " |
| 46 | << (in.count() / 1048576.0 / seconds) << " MiB/sec.\n" ; |
| 47 | } |
| 48 | |
| 49 | { |
| 50 | size_t res = 0; |
| 51 | Stopwatch watch; |
| 52 | |
| 53 | for (const auto & s : data) |
| 54 | { |
| 55 | SipHash hash; |
| 56 | hash.update(s.data(), s.size()); |
| 57 | res += hash.get64(); |
| 58 | } |
| 59 | |
| 60 | double seconds = watch.elapsedSeconds(); |
| 61 | std::cerr << "Processed " |
| 62 | << data.size() << " rows, " |
| 63 | << (in.count() / 1048576.0) << " MiB " |
| 64 | << " in " << seconds << " sec., " |
| 65 | << (data.size() / seconds) << " rows/sec., " |
| 66 | << (in.count() / 1048576.0 / seconds) << " MiB/sec. " |
| 67 | << "(res = " << res << ")\n" ; |
| 68 | } |
| 69 | |
| 70 | return 0; |
| 71 | } |
| 72 | |