1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include <cmath>
19#include <cstdint>
20#include <limits>
21#include <memory>
22#include <random>
23#include <string>
24#include <unordered_map>
25#include <unordered_set>
26#include <vector>
27
28#include <gtest/gtest.h>
29
30#include "arrow/test-util.h"
31#include "arrow/util/bit-util.h"
32#include "arrow/util/hashing.h"
33#include "arrow/util/logging.h"
34
35namespace arrow {
36namespace internal {
37
38template <typename Integer>
39static std::unordered_set<Integer> MakeDistinctIntegers(int32_t n_values) {
40 std::default_random_engine gen(42);
41 std::uniform_int_distribution<Integer> values_dist(0,
42 std::numeric_limits<Integer>::max());
43
44 std::unordered_set<Integer> values;
45 values.reserve(n_values);
46
47 while (values.size() < static_cast<uint32_t>(n_values)) {
48 values.insert(static_cast<Integer>(values_dist(gen)));
49 }
50 return values;
51}
52
53template <typename Integer>
54static std::unordered_set<Integer> MakeSequentialIntegers(int32_t n_values) {
55 std::unordered_set<Integer> values;
56 values.reserve(n_values);
57
58 for (int32_t i = 0; i < n_values; ++i) {
59 values.insert(static_cast<Integer>(i));
60 }
61 DCHECK_EQ(values.size(), static_cast<uint32_t>(n_values));
62 return values;
63}
64
65static std::unordered_set<std::string> MakeDistinctStrings(int32_t n_values) {
66 std::unordered_set<std::string> values;
67 values.reserve(n_values);
68
69 // Generate strings between 0 and 24 bytes, with ASCII characters
70 std::default_random_engine gen(42);
71 std::uniform_int_distribution<int32_t> length_dist(0, 24);
72 std::uniform_int_distribution<uint32_t> char_dist('0', 'z');
73
74 while (values.size() < static_cast<uint32_t>(n_values)) {
75 auto length = length_dist(gen);
76 std::string s(length, 'X');
77 for (int32_t i = 0; i < length; ++i) {
78 s[i] = static_cast<uint8_t>(char_dist(gen));
79 }
80 values.insert(std::move(s));
81 }
82 return values;
83}
84
85template <typename T>
86static void CheckScalarHashQuality(const std::unordered_set<T>& distinct_values) {
87 std::unordered_set<hash_t> hashes;
88 for (const auto v : distinct_values) {
89 hashes.insert(ScalarHelper<T, 0>::ComputeHash(v));
90 hashes.insert(ScalarHelper<T, 1>::ComputeHash(v));
91 }
92 ASSERT_GE(static_cast<double>(hashes.size()),
93 0.96 * static_cast<double>(2 * distinct_values.size()));
94}
95
96TEST(HashingQuality, Int64) {
97#ifdef ARROW_VALGRIND
98 const int32_t n_values = 500;
99#else
100 const int32_t n_values = 10000;
101#endif
102 {
103 const auto values = MakeDistinctIntegers<int64_t>(n_values);
104 CheckScalarHashQuality<int64_t>(values);
105 }
106 {
107 const auto values = MakeSequentialIntegers<int64_t>(n_values);
108 CheckScalarHashQuality<int64_t>(values);
109 }
110}
111
112TEST(HashingQuality, Strings) {
113#ifdef ARROW_VALGRIND
114 const int32_t n_values = 500;
115#else
116 const int32_t n_values = 10000;
117#endif
118 const auto values = MakeDistinctStrings(n_values);
119
120 std::unordered_set<hash_t> hashes;
121 for (const auto& v : values) {
122 hashes.insert(ComputeStringHash<0>(v.data(), static_cast<int64_t>(v.size())));
123 hashes.insert(ComputeStringHash<1>(v.data(), static_cast<int64_t>(v.size())));
124 }
125 ASSERT_GE(static_cast<double>(hashes.size()),
126 0.96 * static_cast<double>(2 * values.size()));
127}
128
129TEST(ScalarMemoTable, Int64) {
130 const int64_t A = 1234, B = 0, C = -98765321, D = 12345678901234LL, E = -1, F = 1,
131 G = 9223372036854775807LL, H = -9223372036854775807LL - 1;
132
133 ScalarMemoTable<int64_t> table(0);
134 ASSERT_EQ(table.size(), 0);
135 ASSERT_EQ(table.Get(A), -1);
136 ASSERT_EQ(table.GetOrInsert(A), 0);
137 ASSERT_EQ(table.Get(B), -1);
138 ASSERT_EQ(table.GetOrInsert(B), 1);
139 ASSERT_EQ(table.GetOrInsert(C), 2);
140 ASSERT_EQ(table.GetOrInsert(D), 3);
141 ASSERT_EQ(table.GetOrInsert(E), 4);
142
143 ASSERT_EQ(table.Get(A), 0);
144 ASSERT_EQ(table.GetOrInsert(A), 0);
145 ASSERT_EQ(table.Get(E), 4);
146 ASSERT_EQ(table.GetOrInsert(E), 4);
147
148 ASSERT_EQ(table.GetOrInsert(F), 5);
149 ASSERT_EQ(table.GetOrInsert(G), 6);
150 ASSERT_EQ(table.GetOrInsert(H), 7);
151
152 ASSERT_EQ(table.GetOrInsert(G), 6);
153 ASSERT_EQ(table.GetOrInsert(F), 5);
154 ASSERT_EQ(table.GetOrInsert(E), 4);
155 ASSERT_EQ(table.GetOrInsert(D), 3);
156 ASSERT_EQ(table.GetOrInsert(C), 2);
157 ASSERT_EQ(table.GetOrInsert(B), 1);
158 ASSERT_EQ(table.GetOrInsert(A), 0);
159
160 ASSERT_EQ(table.size(), 8);
161 {
162 std::vector<int64_t> expected({A, B, C, D, E, F, G, H});
163 std::vector<int64_t> values(expected.size());
164 table.CopyValues(values.data());
165 ASSERT_EQ(values, expected);
166 }
167 {
168 std::vector<int64_t> expected({D, E, F, G, H});
169 std::vector<int64_t> values(expected.size());
170 table.CopyValues(3 /* start offset */, values.data());
171 ASSERT_EQ(values, expected);
172 }
173}
174
175TEST(ScalarMemoTable, UInt16) {
176 const uint16_t A = 1234, B = 0, C = 65535, D = 32767, E = 1;
177
178 ScalarMemoTable<uint16_t> table(0);
179 ASSERT_EQ(table.size(), 0);
180 ASSERT_EQ(table.Get(A), -1);
181 ASSERT_EQ(table.GetOrInsert(A), 0);
182 ASSERT_EQ(table.Get(B), -1);
183 ASSERT_EQ(table.GetOrInsert(B), 1);
184 ASSERT_EQ(table.GetOrInsert(C), 2);
185 ASSERT_EQ(table.GetOrInsert(D), 3);
186 ASSERT_EQ(table.GetOrInsert(E), 4);
187
188 ASSERT_EQ(table.Get(A), 0);
189 ASSERT_EQ(table.GetOrInsert(A), 0);
190 ASSERT_EQ(table.GetOrInsert(B), 1);
191 ASSERT_EQ(table.GetOrInsert(C), 2);
192 ASSERT_EQ(table.GetOrInsert(D), 3);
193 ASSERT_EQ(table.Get(E), 4);
194 ASSERT_EQ(table.GetOrInsert(E), 4);
195
196 ASSERT_EQ(table.size(), 5);
197 std::vector<uint16_t> expected({A, B, C, D, E});
198 std::vector<uint16_t> values(table.size());
199 table.CopyValues(values.data());
200 ASSERT_EQ(values, expected);
201}
202
203TEST(SmallScalarMemoTable, Int8) {
204 const int8_t A = 1, B = 0, C = -1, D = -128, E = 127;
205
206 SmallScalarMemoTable<int8_t> table(0);
207 ASSERT_EQ(table.size(), 0);
208 ASSERT_EQ(table.Get(A), -1);
209 ASSERT_EQ(table.GetOrInsert(A), 0);
210 ASSERT_EQ(table.Get(B), -1);
211 ASSERT_EQ(table.GetOrInsert(B), 1);
212 ASSERT_EQ(table.GetOrInsert(C), 2);
213 ASSERT_EQ(table.GetOrInsert(D), 3);
214 ASSERT_EQ(table.GetOrInsert(E), 4);
215
216 ASSERT_EQ(table.Get(A), 0);
217 ASSERT_EQ(table.GetOrInsert(A), 0);
218 ASSERT_EQ(table.GetOrInsert(B), 1);
219 ASSERT_EQ(table.GetOrInsert(C), 2);
220 ASSERT_EQ(table.GetOrInsert(D), 3);
221 ASSERT_EQ(table.Get(E), 4);
222 ASSERT_EQ(table.GetOrInsert(E), 4);
223
224 ASSERT_EQ(table.size(), 5);
225 std::vector<int8_t> expected({A, B, C, D, E});
226 std::vector<int8_t> values(table.size());
227 table.CopyValues(values.data());
228 ASSERT_EQ(values, expected);
229}
230
231TEST(SmallScalarMemoTable, Bool) {
232 SmallScalarMemoTable<bool> table(0);
233 ASSERT_EQ(table.size(), 0);
234 ASSERT_EQ(table.Get(true), -1);
235 ASSERT_EQ(table.GetOrInsert(true), 0);
236 ASSERT_EQ(table.Get(false), -1);
237 ASSERT_EQ(table.GetOrInsert(false), 1);
238
239 ASSERT_EQ(table.Get(true), 0);
240 ASSERT_EQ(table.GetOrInsert(true), 0);
241 ASSERT_EQ(table.Get(false), 1);
242 ASSERT_EQ(table.GetOrInsert(false), 1);
243
244 ASSERT_EQ(table.size(), 2);
245 std::vector<bool> expected({true, false});
246 ASSERT_EQ(table.values(), expected);
247 // NOTE std::vector<bool> doesn't have a data() method
248}
249
250TEST(ScalarMemoTable, Float64) {
251 const double A = 0.0, B = 1.5, C = -0.0, D = std::numeric_limits<double>::infinity(),
252 E = -D, F = std::nan("");
253
254 ScalarMemoTable<double> table(0);
255 ASSERT_EQ(table.size(), 0);
256 ASSERT_EQ(table.Get(A), -1);
257 ASSERT_EQ(table.GetOrInsert(A), 0);
258 ASSERT_EQ(table.Get(B), -1);
259 ASSERT_EQ(table.GetOrInsert(B), 1);
260 ASSERT_EQ(table.GetOrInsert(C), 2);
261 ASSERT_EQ(table.GetOrInsert(D), 3);
262 ASSERT_EQ(table.GetOrInsert(E), 4);
263 ASSERT_EQ(table.GetOrInsert(F), 5);
264
265 ASSERT_EQ(table.Get(A), 0);
266 ASSERT_EQ(table.GetOrInsert(A), 0);
267 ASSERT_EQ(table.GetOrInsert(B), 1);
268 ASSERT_EQ(table.GetOrInsert(C), 2);
269 ASSERT_EQ(table.GetOrInsert(D), 3);
270 ASSERT_EQ(table.Get(E), 4);
271 ASSERT_EQ(table.GetOrInsert(E), 4);
272 ASSERT_EQ(table.Get(F), 5);
273 ASSERT_EQ(table.GetOrInsert(F), 5);
274
275 ASSERT_EQ(table.size(), 6);
276 std::vector<double> expected({A, B, C, D, E, F});
277 std::vector<double> values(table.size());
278 table.CopyValues(values.data());
279 for (uint32_t i = 0; i < expected.size(); ++i) {
280 auto u = expected[i];
281 auto v = values[i];
282 if (std::isnan(u)) {
283 ASSERT_TRUE(std::isnan(v));
284 } else {
285 ASSERT_EQ(u, v);
286 }
287 }
288}
289
290TEST(ScalarMemoTable, StressInt64) {
291 std::default_random_engine gen(42);
292 std::uniform_int_distribution<int64_t> value_dist(-50, 50);
293#ifdef ARROW_VALGRIND
294 const int32_t n_repeats = 500;
295#else
296 const int32_t n_repeats = 10000;
297#endif
298
299 ScalarMemoTable<int64_t> table(0);
300 std::unordered_map<int64_t, int32_t> map;
301
302 for (int32_t i = 0; i < n_repeats; ++i) {
303 int64_t value = value_dist(gen);
304 int32_t expected;
305 auto it = map.find(value);
306 if (it == map.end()) {
307 expected = static_cast<int32_t>(map.size());
308 map[value] = expected;
309 } else {
310 expected = it->second;
311 }
312 ASSERT_EQ(table.GetOrInsert(value), expected);
313 }
314 ASSERT_EQ(table.size(), map.size());
315}
316
317TEST(BinaryMemoTable, Basics) {
318 std::string A = "", B = "a", C = "foo", D = "bar", E, F;
319 E += '\0';
320 F += '\0';
321 F += "trailing";
322
323 BinaryMemoTable table(0);
324 ASSERT_EQ(table.size(), 0);
325 ASSERT_EQ(table.Get(A), -1);
326 ASSERT_EQ(table.GetOrInsert(A), 0);
327 ASSERT_EQ(table.Get(B), -1);
328 ASSERT_EQ(table.GetOrInsert(B), 1);
329 ASSERT_EQ(table.GetOrInsert(C), 2);
330 ASSERT_EQ(table.GetOrInsert(D), 3);
331 ASSERT_EQ(table.GetOrInsert(E), 4);
332 ASSERT_EQ(table.GetOrInsert(F), 5);
333
334 ASSERT_EQ(table.GetOrInsert(A), 0);
335 ASSERT_EQ(table.GetOrInsert(B), 1);
336 ASSERT_EQ(table.GetOrInsert(C), 2);
337 ASSERT_EQ(table.GetOrInsert(D), 3);
338 ASSERT_EQ(table.GetOrInsert(E), 4);
339 ASSERT_EQ(table.GetOrInsert(F), 5);
340
341 ASSERT_EQ(table.size(), 6);
342 ASSERT_EQ(table.values_size(), 17);
343
344 {
345 std::vector<int8_t> expected({0, 0, 1, 4, 7, 8, 17});
346 std::vector<int8_t> offsets(expected.size());
347 table.CopyOffsets(offsets.data());
348 ASSERT_EQ(offsets, expected);
349
350 std::string expected_values;
351 expected_values += "afoobar";
352 expected_values += '\0';
353 expected_values += '\0';
354 expected_values += "trailing";
355 std::string values(17, 'X');
356 table.CopyValues(reinterpret_cast<uint8_t*>(&values[0]));
357 ASSERT_EQ(values, expected_values);
358 }
359 {
360 std::vector<int8_t> expected({0, 1, 10});
361 std::vector<int8_t> offsets(expected.size());
362 table.CopyOffsets(4 /* start offset */, offsets.data());
363 ASSERT_EQ(offsets, expected);
364
365 std::string expected_values;
366 expected_values += '\0';
367 expected_values += '\0';
368 expected_values += "trailing";
369 std::string values(10, 'X');
370 table.CopyValues(4 /* start offset */, reinterpret_cast<uint8_t*>(&values[0]));
371 ASSERT_EQ(values, expected_values);
372 }
373 {
374 std::vector<std::string> expected({B, C, D, E, F});
375 std::vector<std::string> actual;
376 table.VisitValues(1 /* start offset */, [&](const util::string_view& v) {
377 actual.emplace_back(v.data(), v.length());
378 });
379 ASSERT_EQ(actual, expected);
380 }
381}
382
383TEST(BinaryMemoTable, Stress) {
384#ifdef ARROW_VALGRIND
385 const int32_t n_values = 20;
386 const int32_t n_repeats = 20;
387#else
388 const int32_t n_values = 100;
389 const int32_t n_repeats = 100;
390#endif
391
392 const auto values = MakeDistinctStrings(n_values);
393
394 BinaryMemoTable table(0);
395 std::unordered_map<std::string, int32_t> map;
396
397 for (int32_t i = 0; i < n_repeats; ++i) {
398 for (const auto& value : values) {
399 int32_t expected;
400 auto it = map.find(value);
401 if (it == map.end()) {
402 expected = static_cast<int32_t>(map.size());
403 map[value] = expected;
404 } else {
405 expected = it->second;
406 }
407 ASSERT_EQ(table.GetOrInsert(value), expected);
408 }
409 }
410 ASSERT_EQ(table.size(), map.size());
411}
412
413} // namespace internal
414} // namespace arrow
415