1 | #include <iostream> |
2 | #include <iomanip> |
3 | #include <vector> |
4 | |
5 | #include <unordered_map> |
6 | |
7 | #include <sparsehash/dense_hash_map> |
8 | #include <sparsehash/sparse_hash_map> |
9 | |
10 | #include <Common/Stopwatch.h> |
11 | |
12 | //#define DBMS_HASH_MAP_COUNT_COLLISIONS |
13 | #define DBMS_HASH_MAP_DEBUG_RESIZES |
14 | |
15 | #include <Core/Types.h> |
16 | #include <IO/ReadBufferFromFile.h> |
17 | #include <IO/ReadHelpers.h> |
18 | #include <Compression/CompressedReadBuffer.h> |
19 | #include <common/StringRef.h> |
20 | #include <Common/HashTable/HashMap.h> |
21 | #include <Interpreters/AggregationCommon.h> |
22 | |
23 | |
24 | struct SmallStringRef |
25 | { |
26 | UInt32 size = 0; |
27 | |
28 | union |
29 | { |
30 | const char * data_big; |
31 | char data_small[12]; |
32 | }; |
33 | |
34 | bool isSmall() const { return size <= 12; } |
35 | |
36 | const char * data() const |
37 | { |
38 | return isSmall() ? data_small : data_big; |
39 | } |
40 | |
41 | SmallStringRef(const char * data_, size_t size_) |
42 | { |
43 | size = size_; |
44 | |
45 | if (isSmall()) |
46 | memcpy(data_small, data_, size_); |
47 | else |
48 | data_big = data_; |
49 | } |
50 | |
51 | SmallStringRef(const unsigned char * data_, size_t size_) : SmallStringRef(reinterpret_cast<const char *>(data_), size_) {} |
52 | explicit SmallStringRef(const std::string & s) : SmallStringRef(s.data(), s.size()) {} |
53 | SmallStringRef() {} |
54 | |
55 | std::string toString() const { return std::string(data(), size); } |
56 | }; |
57 | |
58 | |
59 | inline bool operator==(SmallStringRef lhs, SmallStringRef rhs) |
60 | { |
61 | if (lhs.size != rhs.size) |
62 | return false; |
63 | |
64 | if (lhs.size == 0) |
65 | return true; |
66 | |
67 | #ifdef __SSE2__ |
68 | return memequalSSE2Wide(lhs.data(), rhs.data(), lhs.size); |
69 | #else |
70 | return false; |
71 | #endif |
72 | } |
73 | |
74 | |
75 | namespace ZeroTraits |
76 | { |
77 | template <> |
78 | inline bool check<SmallStringRef>(SmallStringRef x) { return x.size == 0; } |
79 | |
80 | template <> |
81 | inline void set<SmallStringRef>(SmallStringRef & x) { x.size = 0; } |
82 | } |
83 | |
84 | template <> |
85 | struct DefaultHash<SmallStringRef> |
86 | { |
87 | size_t operator() (SmallStringRef x) const |
88 | { |
89 | return DefaultHash<StringRef>()(StringRef(x.data(), x.size)); |
90 | } |
91 | }; |
92 | |
93 | |
94 | using Value = UInt64; |
95 | |
96 | |
97 | int main(int argc, char ** argv) |
98 | { |
99 | if (argc < 3) |
100 | { |
101 | std::cerr << "Usage: program n m\n" ; |
102 | return 1; |
103 | } |
104 | |
105 | size_t n = atoi(argv[1]); |
106 | size_t m = atoi(argv[2]); |
107 | |
108 | DB::Arena pool; |
109 | std::vector<StringRef> data(n); |
110 | |
111 | std::cerr << "sizeof(Key) = " << sizeof(SmallStringRef) << ", sizeof(Value) = " << sizeof(Value) << std::endl; |
112 | |
113 | { |
114 | Stopwatch watch; |
115 | DB::ReadBufferFromFileDescriptor in1(STDIN_FILENO); |
116 | DB::CompressedReadBuffer in2(in1); |
117 | |
118 | std::string tmp; |
119 | for (size_t i = 0; i < n && !in2.eof(); ++i) |
120 | { |
121 | DB::readStringBinary(tmp, in2); |
122 | data[i] = StringRef(pool.insert(tmp.data(), tmp.size()), tmp.size()); |
123 | } |
124 | |
125 | watch.stop(); |
126 | std::cerr << std::fixed << std::setprecision(2) |
127 | << "Vector. Size: " << n |
128 | << ", elapsed: " << watch.elapsedSeconds() |
129 | << " (" << n / watch.elapsedSeconds() << " elem/sec.)" |
130 | << std::endl; |
131 | } |
132 | |
133 | if (!m || m == 1) |
134 | { |
135 | Stopwatch watch; |
136 | |
137 | using Map = HashMapWithSavedHash<StringRef, Value>; |
138 | |
139 | Map map; |
140 | Map::LookupResult it; |
141 | bool inserted; |
142 | |
143 | for (size_t i = 0; i < n; ++i) |
144 | { |
145 | map.emplace(data[i], it, inserted); |
146 | if (inserted) |
147 | it->getMapped() = 0; |
148 | ++it->getMapped(); |
149 | } |
150 | |
151 | watch.stop(); |
152 | std::cerr << std::fixed << std::setprecision(2) |
153 | << "HashMap (StringRef). Size: " << map.size() |
154 | << ", elapsed: " << watch.elapsedSeconds() |
155 | << " (" << n / watch.elapsedSeconds() << " elem/sec.)" |
156 | #ifdef DBMS_HASH_MAP_COUNT_COLLISIONS |
157 | << ", collisions: " << map.getCollisions() |
158 | #endif |
159 | << std::endl; |
160 | } |
161 | |
162 | if (!m || m == 2) |
163 | { |
164 | Stopwatch watch; |
165 | |
166 | using Map = HashMapWithSavedHash<SmallStringRef, Value>; |
167 | |
168 | Map map; |
169 | Map::LookupResult it; |
170 | bool inserted; |
171 | |
172 | for (size_t i = 0; i < n; ++i) |
173 | { |
174 | map.emplace(SmallStringRef(data[i].data, data[i].size), it, inserted); |
175 | if (inserted) |
176 | it->getMapped() = 0; |
177 | ++it->getMapped(); |
178 | } |
179 | |
180 | watch.stop(); |
181 | std::cerr << std::fixed << std::setprecision(2) |
182 | << "HashMap (SmallStringRef). Size: " << map.size() |
183 | << ", elapsed: " << watch.elapsedSeconds() |
184 | << " (" << n / watch.elapsedSeconds() << " elem/sec.)" |
185 | #ifdef DBMS_HASH_MAP_COUNT_COLLISIONS |
186 | << ", collisions: " << map.getCollisions() |
187 | #endif |
188 | << std::endl; |
189 | } |
190 | |
191 | return 0; |
192 | } |
193 | |