1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #include <cmath> |
19 | #include <cstdint> |
20 | #include <limits> |
21 | #include <memory> |
22 | #include <random> |
23 | #include <string> |
24 | #include <unordered_map> |
25 | #include <unordered_set> |
26 | #include <vector> |
27 | |
28 | #include <gtest/gtest.h> |
29 | |
30 | #include "arrow/test-util.h" |
31 | #include "arrow/util/bit-util.h" |
32 | #include "arrow/util/hashing.h" |
33 | #include "arrow/util/logging.h" |
34 | |
35 | namespace arrow { |
36 | namespace internal { |
37 | |
38 | template <typename Integer> |
39 | static std::unordered_set<Integer> MakeDistinctIntegers(int32_t n_values) { |
40 | std::default_random_engine gen(42); |
41 | std::uniform_int_distribution<Integer> values_dist(0, |
42 | std::numeric_limits<Integer>::max()); |
43 | |
44 | std::unordered_set<Integer> values; |
45 | values.reserve(n_values); |
46 | |
47 | while (values.size() < static_cast<uint32_t>(n_values)) { |
48 | values.insert(static_cast<Integer>(values_dist(gen))); |
49 | } |
50 | return values; |
51 | } |
52 | |
53 | template <typename Integer> |
54 | static std::unordered_set<Integer> MakeSequentialIntegers(int32_t n_values) { |
55 | std::unordered_set<Integer> values; |
56 | values.reserve(n_values); |
57 | |
58 | for (int32_t i = 0; i < n_values; ++i) { |
59 | values.insert(static_cast<Integer>(i)); |
60 | } |
61 | DCHECK_EQ(values.size(), static_cast<uint32_t>(n_values)); |
62 | return values; |
63 | } |
64 | |
65 | static std::unordered_set<std::string> MakeDistinctStrings(int32_t n_values) { |
66 | std::unordered_set<std::string> values; |
67 | values.reserve(n_values); |
68 | |
69 | // Generate strings between 0 and 24 bytes, with ASCII characters |
70 | std::default_random_engine gen(42); |
71 | std::uniform_int_distribution<int32_t> length_dist(0, 24); |
72 | std::uniform_int_distribution<uint32_t> char_dist('0', 'z'); |
73 | |
74 | while (values.size() < static_cast<uint32_t>(n_values)) { |
75 | auto length = length_dist(gen); |
76 | std::string s(length, 'X'); |
77 | for (int32_t i = 0; i < length; ++i) { |
78 | s[i] = static_cast<uint8_t>(char_dist(gen)); |
79 | } |
80 | values.insert(std::move(s)); |
81 | } |
82 | return values; |
83 | } |
84 | |
85 | template <typename T> |
86 | static void CheckScalarHashQuality(const std::unordered_set<T>& distinct_values) { |
87 | std::unordered_set<hash_t> hashes; |
88 | for (const auto v : distinct_values) { |
89 | hashes.insert(ScalarHelper<T, 0>::ComputeHash(v)); |
90 | hashes.insert(ScalarHelper<T, 1>::ComputeHash(v)); |
91 | } |
92 | ASSERT_GE(static_cast<double>(hashes.size()), |
93 | 0.96 * static_cast<double>(2 * distinct_values.size())); |
94 | } |
95 | |
96 | TEST(HashingQuality, Int64) { |
97 | #ifdef ARROW_VALGRIND |
98 | const int32_t n_values = 500; |
99 | #else |
100 | const int32_t n_values = 10000; |
101 | #endif |
102 | { |
103 | const auto values = MakeDistinctIntegers<int64_t>(n_values); |
104 | CheckScalarHashQuality<int64_t>(values); |
105 | } |
106 | { |
107 | const auto values = MakeSequentialIntegers<int64_t>(n_values); |
108 | CheckScalarHashQuality<int64_t>(values); |
109 | } |
110 | } |
111 | |
112 | TEST(HashingQuality, Strings) { |
113 | #ifdef ARROW_VALGRIND |
114 | const int32_t n_values = 500; |
115 | #else |
116 | const int32_t n_values = 10000; |
117 | #endif |
118 | const auto values = MakeDistinctStrings(n_values); |
119 | |
120 | std::unordered_set<hash_t> hashes; |
121 | for (const auto& v : values) { |
122 | hashes.insert(ComputeStringHash<0>(v.data(), static_cast<int64_t>(v.size()))); |
123 | hashes.insert(ComputeStringHash<1>(v.data(), static_cast<int64_t>(v.size()))); |
124 | } |
125 | ASSERT_GE(static_cast<double>(hashes.size()), |
126 | 0.96 * static_cast<double>(2 * values.size())); |
127 | } |
128 | |
129 | TEST(ScalarMemoTable, Int64) { |
130 | const int64_t A = 1234, B = 0, C = -98765321, D = 12345678901234LL, E = -1, F = 1, |
131 | G = 9223372036854775807LL, H = -9223372036854775807LL - 1; |
132 | |
133 | ScalarMemoTable<int64_t> table(0); |
134 | ASSERT_EQ(table.size(), 0); |
135 | ASSERT_EQ(table.Get(A), -1); |
136 | ASSERT_EQ(table.GetOrInsert(A), 0); |
137 | ASSERT_EQ(table.Get(B), -1); |
138 | ASSERT_EQ(table.GetOrInsert(B), 1); |
139 | ASSERT_EQ(table.GetOrInsert(C), 2); |
140 | ASSERT_EQ(table.GetOrInsert(D), 3); |
141 | ASSERT_EQ(table.GetOrInsert(E), 4); |
142 | |
143 | ASSERT_EQ(table.Get(A), 0); |
144 | ASSERT_EQ(table.GetOrInsert(A), 0); |
145 | ASSERT_EQ(table.Get(E), 4); |
146 | ASSERT_EQ(table.GetOrInsert(E), 4); |
147 | |
148 | ASSERT_EQ(table.GetOrInsert(F), 5); |
149 | ASSERT_EQ(table.GetOrInsert(G), 6); |
150 | ASSERT_EQ(table.GetOrInsert(H), 7); |
151 | |
152 | ASSERT_EQ(table.GetOrInsert(G), 6); |
153 | ASSERT_EQ(table.GetOrInsert(F), 5); |
154 | ASSERT_EQ(table.GetOrInsert(E), 4); |
155 | ASSERT_EQ(table.GetOrInsert(D), 3); |
156 | ASSERT_EQ(table.GetOrInsert(C), 2); |
157 | ASSERT_EQ(table.GetOrInsert(B), 1); |
158 | ASSERT_EQ(table.GetOrInsert(A), 0); |
159 | |
160 | ASSERT_EQ(table.size(), 8); |
161 | { |
162 | std::vector<int64_t> expected({A, B, C, D, E, F, G, H}); |
163 | std::vector<int64_t> values(expected.size()); |
164 | table.CopyValues(values.data()); |
165 | ASSERT_EQ(values, expected); |
166 | } |
167 | { |
168 | std::vector<int64_t> expected({D, E, F, G, H}); |
169 | std::vector<int64_t> values(expected.size()); |
170 | table.CopyValues(3 /* start offset */, values.data()); |
171 | ASSERT_EQ(values, expected); |
172 | } |
173 | } |
174 | |
175 | TEST(ScalarMemoTable, UInt16) { |
176 | const uint16_t A = 1234, B = 0, C = 65535, D = 32767, E = 1; |
177 | |
178 | ScalarMemoTable<uint16_t> table(0); |
179 | ASSERT_EQ(table.size(), 0); |
180 | ASSERT_EQ(table.Get(A), -1); |
181 | ASSERT_EQ(table.GetOrInsert(A), 0); |
182 | ASSERT_EQ(table.Get(B), -1); |
183 | ASSERT_EQ(table.GetOrInsert(B), 1); |
184 | ASSERT_EQ(table.GetOrInsert(C), 2); |
185 | ASSERT_EQ(table.GetOrInsert(D), 3); |
186 | ASSERT_EQ(table.GetOrInsert(E), 4); |
187 | |
188 | ASSERT_EQ(table.Get(A), 0); |
189 | ASSERT_EQ(table.GetOrInsert(A), 0); |
190 | ASSERT_EQ(table.GetOrInsert(B), 1); |
191 | ASSERT_EQ(table.GetOrInsert(C), 2); |
192 | ASSERT_EQ(table.GetOrInsert(D), 3); |
193 | ASSERT_EQ(table.Get(E), 4); |
194 | ASSERT_EQ(table.GetOrInsert(E), 4); |
195 | |
196 | ASSERT_EQ(table.size(), 5); |
197 | std::vector<uint16_t> expected({A, B, C, D, E}); |
198 | std::vector<uint16_t> values(table.size()); |
199 | table.CopyValues(values.data()); |
200 | ASSERT_EQ(values, expected); |
201 | } |
202 | |
203 | TEST(SmallScalarMemoTable, Int8) { |
204 | const int8_t A = 1, B = 0, C = -1, D = -128, E = 127; |
205 | |
206 | SmallScalarMemoTable<int8_t> table(0); |
207 | ASSERT_EQ(table.size(), 0); |
208 | ASSERT_EQ(table.Get(A), -1); |
209 | ASSERT_EQ(table.GetOrInsert(A), 0); |
210 | ASSERT_EQ(table.Get(B), -1); |
211 | ASSERT_EQ(table.GetOrInsert(B), 1); |
212 | ASSERT_EQ(table.GetOrInsert(C), 2); |
213 | ASSERT_EQ(table.GetOrInsert(D), 3); |
214 | ASSERT_EQ(table.GetOrInsert(E), 4); |
215 | |
216 | ASSERT_EQ(table.Get(A), 0); |
217 | ASSERT_EQ(table.GetOrInsert(A), 0); |
218 | ASSERT_EQ(table.GetOrInsert(B), 1); |
219 | ASSERT_EQ(table.GetOrInsert(C), 2); |
220 | ASSERT_EQ(table.GetOrInsert(D), 3); |
221 | ASSERT_EQ(table.Get(E), 4); |
222 | ASSERT_EQ(table.GetOrInsert(E), 4); |
223 | |
224 | ASSERT_EQ(table.size(), 5); |
225 | std::vector<int8_t> expected({A, B, C, D, E}); |
226 | std::vector<int8_t> values(table.size()); |
227 | table.CopyValues(values.data()); |
228 | ASSERT_EQ(values, expected); |
229 | } |
230 | |
231 | TEST(SmallScalarMemoTable, Bool) { |
232 | SmallScalarMemoTable<bool> table(0); |
233 | ASSERT_EQ(table.size(), 0); |
234 | ASSERT_EQ(table.Get(true), -1); |
235 | ASSERT_EQ(table.GetOrInsert(true), 0); |
236 | ASSERT_EQ(table.Get(false), -1); |
237 | ASSERT_EQ(table.GetOrInsert(false), 1); |
238 | |
239 | ASSERT_EQ(table.Get(true), 0); |
240 | ASSERT_EQ(table.GetOrInsert(true), 0); |
241 | ASSERT_EQ(table.Get(false), 1); |
242 | ASSERT_EQ(table.GetOrInsert(false), 1); |
243 | |
244 | ASSERT_EQ(table.size(), 2); |
245 | std::vector<bool> expected({true, false}); |
246 | ASSERT_EQ(table.values(), expected); |
247 | // NOTE std::vector<bool> doesn't have a data() method |
248 | } |
249 | |
250 | TEST(ScalarMemoTable, Float64) { |
251 | const double A = 0.0, B = 1.5, C = -0.0, D = std::numeric_limits<double>::infinity(), |
252 | E = -D, F = std::nan("" ); |
253 | |
254 | ScalarMemoTable<double> table(0); |
255 | ASSERT_EQ(table.size(), 0); |
256 | ASSERT_EQ(table.Get(A), -1); |
257 | ASSERT_EQ(table.GetOrInsert(A), 0); |
258 | ASSERT_EQ(table.Get(B), -1); |
259 | ASSERT_EQ(table.GetOrInsert(B), 1); |
260 | ASSERT_EQ(table.GetOrInsert(C), 2); |
261 | ASSERT_EQ(table.GetOrInsert(D), 3); |
262 | ASSERT_EQ(table.GetOrInsert(E), 4); |
263 | ASSERT_EQ(table.GetOrInsert(F), 5); |
264 | |
265 | ASSERT_EQ(table.Get(A), 0); |
266 | ASSERT_EQ(table.GetOrInsert(A), 0); |
267 | ASSERT_EQ(table.GetOrInsert(B), 1); |
268 | ASSERT_EQ(table.GetOrInsert(C), 2); |
269 | ASSERT_EQ(table.GetOrInsert(D), 3); |
270 | ASSERT_EQ(table.Get(E), 4); |
271 | ASSERT_EQ(table.GetOrInsert(E), 4); |
272 | ASSERT_EQ(table.Get(F), 5); |
273 | ASSERT_EQ(table.GetOrInsert(F), 5); |
274 | |
275 | ASSERT_EQ(table.size(), 6); |
276 | std::vector<double> expected({A, B, C, D, E, F}); |
277 | std::vector<double> values(table.size()); |
278 | table.CopyValues(values.data()); |
279 | for (uint32_t i = 0; i < expected.size(); ++i) { |
280 | auto u = expected[i]; |
281 | auto v = values[i]; |
282 | if (std::isnan(u)) { |
283 | ASSERT_TRUE(std::isnan(v)); |
284 | } else { |
285 | ASSERT_EQ(u, v); |
286 | } |
287 | } |
288 | } |
289 | |
290 | TEST(ScalarMemoTable, StressInt64) { |
291 | std::default_random_engine gen(42); |
292 | std::uniform_int_distribution<int64_t> value_dist(-50, 50); |
293 | #ifdef ARROW_VALGRIND |
294 | const int32_t n_repeats = 500; |
295 | #else |
296 | const int32_t n_repeats = 10000; |
297 | #endif |
298 | |
299 | ScalarMemoTable<int64_t> table(0); |
300 | std::unordered_map<int64_t, int32_t> map; |
301 | |
302 | for (int32_t i = 0; i < n_repeats; ++i) { |
303 | int64_t value = value_dist(gen); |
304 | int32_t expected; |
305 | auto it = map.find(value); |
306 | if (it == map.end()) { |
307 | expected = static_cast<int32_t>(map.size()); |
308 | map[value] = expected; |
309 | } else { |
310 | expected = it->second; |
311 | } |
312 | ASSERT_EQ(table.GetOrInsert(value), expected); |
313 | } |
314 | ASSERT_EQ(table.size(), map.size()); |
315 | } |
316 | |
317 | TEST(BinaryMemoTable, Basics) { |
318 | std::string A = "" , B = "a" , C = "foo" , D = "bar" , E, F; |
319 | E += '\0'; |
320 | F += '\0'; |
321 | F += "trailing" ; |
322 | |
323 | BinaryMemoTable table(0); |
324 | ASSERT_EQ(table.size(), 0); |
325 | ASSERT_EQ(table.Get(A), -1); |
326 | ASSERT_EQ(table.GetOrInsert(A), 0); |
327 | ASSERT_EQ(table.Get(B), -1); |
328 | ASSERT_EQ(table.GetOrInsert(B), 1); |
329 | ASSERT_EQ(table.GetOrInsert(C), 2); |
330 | ASSERT_EQ(table.GetOrInsert(D), 3); |
331 | ASSERT_EQ(table.GetOrInsert(E), 4); |
332 | ASSERT_EQ(table.GetOrInsert(F), 5); |
333 | |
334 | ASSERT_EQ(table.GetOrInsert(A), 0); |
335 | ASSERT_EQ(table.GetOrInsert(B), 1); |
336 | ASSERT_EQ(table.GetOrInsert(C), 2); |
337 | ASSERT_EQ(table.GetOrInsert(D), 3); |
338 | ASSERT_EQ(table.GetOrInsert(E), 4); |
339 | ASSERT_EQ(table.GetOrInsert(F), 5); |
340 | |
341 | ASSERT_EQ(table.size(), 6); |
342 | ASSERT_EQ(table.values_size(), 17); |
343 | |
344 | { |
345 | std::vector<int8_t> expected({0, 0, 1, 4, 7, 8, 17}); |
346 | std::vector<int8_t> offsets(expected.size()); |
347 | table.CopyOffsets(offsets.data()); |
348 | ASSERT_EQ(offsets, expected); |
349 | |
350 | std::string expected_values; |
351 | expected_values += "afoobar" ; |
352 | expected_values += '\0'; |
353 | expected_values += '\0'; |
354 | expected_values += "trailing" ; |
355 | std::string values(17, 'X'); |
356 | table.CopyValues(reinterpret_cast<uint8_t*>(&values[0])); |
357 | ASSERT_EQ(values, expected_values); |
358 | } |
359 | { |
360 | std::vector<int8_t> expected({0, 1, 10}); |
361 | std::vector<int8_t> offsets(expected.size()); |
362 | table.CopyOffsets(4 /* start offset */, offsets.data()); |
363 | ASSERT_EQ(offsets, expected); |
364 | |
365 | std::string expected_values; |
366 | expected_values += '\0'; |
367 | expected_values += '\0'; |
368 | expected_values += "trailing" ; |
369 | std::string values(10, 'X'); |
370 | table.CopyValues(4 /* start offset */, reinterpret_cast<uint8_t*>(&values[0])); |
371 | ASSERT_EQ(values, expected_values); |
372 | } |
373 | { |
374 | std::vector<std::string> expected({B, C, D, E, F}); |
375 | std::vector<std::string> actual; |
376 | table.VisitValues(1 /* start offset */, [&](const util::string_view& v) { |
377 | actual.emplace_back(v.data(), v.length()); |
378 | }); |
379 | ASSERT_EQ(actual, expected); |
380 | } |
381 | } |
382 | |
383 | TEST(BinaryMemoTable, Stress) { |
384 | #ifdef ARROW_VALGRIND |
385 | const int32_t n_values = 20; |
386 | const int32_t n_repeats = 20; |
387 | #else |
388 | const int32_t n_values = 100; |
389 | const int32_t n_repeats = 100; |
390 | #endif |
391 | |
392 | const auto values = MakeDistinctStrings(n_values); |
393 | |
394 | BinaryMemoTable table(0); |
395 | std::unordered_map<std::string, int32_t> map; |
396 | |
397 | for (int32_t i = 0; i < n_repeats; ++i) { |
398 | for (const auto& value : values) { |
399 | int32_t expected; |
400 | auto it = map.find(value); |
401 | if (it == map.end()) { |
402 | expected = static_cast<int32_t>(map.size()); |
403 | map[value] = expected; |
404 | } else { |
405 | expected = it->second; |
406 | } |
407 | ASSERT_EQ(table.GetOrInsert(value), expected); |
408 | } |
409 | } |
410 | ASSERT_EQ(table.size(), map.size()); |
411 | } |
412 | |
413 | } // namespace internal |
414 | } // namespace arrow |
415 | |