1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #include <algorithm> |
19 | #include <cstdint> |
20 | #include <cstring> |
21 | #include <string> |
22 | #include <unordered_map> |
23 | #include <utility> |
24 | #include <vector> |
25 | |
26 | #include <gtest/gtest.h> |
27 | |
28 | #include "arrow/test-util.h" |
29 | #include "arrow/util/trie.h" |
30 | |
31 | namespace arrow { |
32 | namespace internal { |
33 | |
34 | TEST(SmallString, Basics) { |
35 | using SS = SmallString<5>; |
36 | { |
37 | SS s; |
38 | ASSERT_EQ(s.length(), 0); |
39 | ASSERT_EQ(util::string_view(s), util::string_view("" )); |
40 | ASSERT_EQ(s, "" ); |
41 | ASSERT_NE(s, "x" ); |
42 | ASSERT_EQ(sizeof(s), 6); |
43 | } |
44 | { |
45 | SS s("abc" ); |
46 | ASSERT_EQ(s.length(), 3); |
47 | ASSERT_EQ(util::string_view(s), util::string_view("abc" )); |
48 | ASSERT_EQ(std::memcmp(s.data(), "abc" , 3), 0); |
49 | ASSERT_EQ(s, "abc" ); |
50 | ASSERT_NE(s, "ab" ); |
51 | } |
52 | } |
53 | |
54 | TEST(SmallString, Assign) { |
55 | using SS = SmallString<5>; |
56 | auto s = SS(); |
57 | |
58 | s = util::string_view("abc" ); |
59 | ASSERT_EQ(s.length(), 3); |
60 | ASSERT_EQ(util::string_view(s), util::string_view("abc" )); |
61 | ASSERT_EQ(std::memcmp(s.data(), "abc" , 3), 0); |
62 | ASSERT_EQ(s, "abc" ); |
63 | ASSERT_NE(s, "ab" ); |
64 | |
65 | s = std::string("ghijk" ); |
66 | ASSERT_EQ(s.length(), 5); |
67 | ASSERT_EQ(util::string_view(s), util::string_view("ghijk" )); |
68 | ASSERT_EQ(std::memcmp(s.data(), "ghijk" , 5), 0); |
69 | ASSERT_EQ(s, "ghijk" ); |
70 | ASSERT_NE(s, "" ); |
71 | |
72 | s = SS("xy" ); |
73 | ASSERT_EQ(s.length(), 2); |
74 | ASSERT_EQ(util::string_view(s), util::string_view("xy" )); |
75 | ASSERT_EQ(std::memcmp(s.data(), "xy" , 2), 0); |
76 | ASSERT_EQ(s, "xy" ); |
77 | ASSERT_NE(s, "xyz" ); |
78 | } |
79 | |
80 | TEST(SmallString, Substr) { |
81 | using SS = SmallString<5>; |
82 | { |
83 | auto s = SS(); |
84 | ASSERT_EQ(s.substr(0), "" ); |
85 | ASSERT_EQ(s.substr(0, 2), "" ); |
86 | } |
87 | { |
88 | auto s = SS("abcd" ); |
89 | ASSERT_EQ(s.substr(0), "abcd" ); |
90 | ASSERT_EQ(s.substr(1), "bcd" ); |
91 | ASSERT_EQ(s.substr(4), "" ); |
92 | ASSERT_EQ(s.substr(0, 0), "" ); |
93 | ASSERT_EQ(s.substr(0, 3), "abc" ); |
94 | ASSERT_EQ(s.substr(0, 4), "abcd" ); |
95 | ASSERT_EQ(s.substr(1, 0), "" ); |
96 | ASSERT_EQ(s.substr(1, 2), "bc" ); |
97 | ASSERT_EQ(s.substr(4, 0), "" ); |
98 | ASSERT_EQ(s.substr(4, 1), "" ); |
99 | } |
100 | } |
101 | |
102 | static std::vector<std::string> AllNulls() { |
103 | return {"#N/A" , "#N/A N/A" , "#NA" , "-1.#IND" , "-1.#QNAN" , "-NaN" , "-nan" , "1.#IND" , |
104 | "1.#QNAN" , "N/A" , "NA" , "NULL" , "NaN" , "n/a" , "nan" , "null" }; |
105 | } |
106 | |
107 | static void TestTrieContents(const Trie& trie, const std::vector<std::string>& entries) { |
108 | std::unordered_map<std::string, int32_t> control; |
109 | auto n_entries = static_cast<int32_t>(entries.size()); |
110 | |
111 | // Build control container |
112 | for (int32_t i = 0; i < n_entries; ++i) { |
113 | auto p = control.insert({entries[i], i}); |
114 | ASSERT_TRUE(p.second); |
115 | } |
116 | |
117 | // Check all existing entries in trie |
118 | for (int32_t i = 0; i < n_entries; ++i) { |
119 | ASSERT_EQ(i, trie.Find(entries[i])) << "for string '" << entries[i] << "'" ; |
120 | } |
121 | |
122 | auto CheckNotExists = [&control, &trie](const std::string& s) { |
123 | auto p = control.find(s); |
124 | if (p == control.end()) { |
125 | ASSERT_EQ(-1, trie.Find(s)) << "for string '" << s << "'" ; |
126 | } |
127 | }; |
128 | |
129 | // Check potentially non-existing strings |
130 | CheckNotExists("" ); |
131 | CheckNotExists("X" ); |
132 | CheckNotExists("abcdefxxxxxxxxxxxxxxx" ); |
133 | |
134 | // Check potentially non-existing variations of existing entries |
135 | for (const auto& e : entries) { |
136 | CheckNotExists(e + "X" ); |
137 | if (e.size() > 0) { |
138 | CheckNotExists(e.substr(0, 1)); |
139 | auto prefix = e.substr(0, e.size() - 1); |
140 | CheckNotExists(prefix); |
141 | CheckNotExists(prefix + "X" ); |
142 | auto split_at = e.size() / 2; |
143 | CheckNotExists(e.substr(0, split_at) + 'x' + e.substr(split_at + 1)); |
144 | } |
145 | } |
146 | } |
147 | |
148 | static void TestTrieContents(const std::vector<std::string>& entries) { |
149 | TrieBuilder builder; |
150 | for (const auto& s : entries) { |
151 | ASSERT_OK(builder.Append(s)); |
152 | } |
153 | const Trie trie = builder.Finish(); |
154 | ASSERT_OK(trie.Validate()); |
155 | |
156 | TestTrieContents(trie, entries); |
157 | } |
158 | |
159 | TEST(Trie, Empty) { |
160 | TrieBuilder builder; |
161 | const Trie trie = builder.Finish(); |
162 | ASSERT_OK(trie.Validate()); |
163 | |
164 | ASSERT_EQ(-1, trie.Find("" )); |
165 | ASSERT_EQ(-1, trie.Find("x" )); |
166 | } |
167 | |
168 | TEST(Trie, EmptyString) { |
169 | TrieBuilder builder; |
170 | ASSERT_OK(builder.Append("" )); |
171 | const Trie trie = builder.Finish(); |
172 | ASSERT_OK(trie.Validate()); |
173 | |
174 | ASSERT_EQ(0, trie.Find("" )); |
175 | ASSERT_EQ(-1, trie.Find("x" )); |
176 | } |
177 | |
178 | TEST(Trie, Basics1) { |
179 | TestTrieContents({"abc" , "de" , "f" }); |
180 | TestTrieContents({"abc" , "de" , "f" , "" }); |
181 | } |
182 | |
183 | TEST(Trie, Basics2) { |
184 | TestTrieContents({"a" , "abc" , "abcd" , "abcdef" }); |
185 | TestTrieContents({"" , "a" , "abc" , "abcd" , "abcdef" }); |
186 | } |
187 | |
188 | TEST(Trie, Basics3) { |
189 | TestTrieContents({"abcd" , "ab" , "a" }); |
190 | TestTrieContents({"abcd" , "ab" , "a" , "" }); |
191 | } |
192 | |
193 | TEST(Trie, LongStrings) { |
194 | TestTrieContents({"abcdefghijklmnopqr" , "abcdefghijklmnoprq" , "defghijklmnopqrst" }); |
195 | TestTrieContents({"abcdefghijklmnopqr" , "abcdefghijklmnoprq" , "abcde" }); |
196 | } |
197 | |
198 | TEST(Trie, NullChars) { |
199 | const std::string empty; |
200 | const std::string nul(1, '\x00'); |
201 | std::string a, b, c, d; |
202 | a = "x" + nul + "y" ; |
203 | b = "x" + nul + "z" ; |
204 | c = nul + "y" ; |
205 | d = nul; |
206 | ASSERT_EQ(a.length(), 3); |
207 | ASSERT_EQ(d.length(), 1); |
208 | |
209 | TestTrieContents({a, b, c, d}); |
210 | TestTrieContents({a, b, c}); |
211 | TestTrieContents({a, b, c, d, "" }); |
212 | TestTrieContents({a, b, c, "" }); |
213 | TestTrieContents({d, c, b, a}); |
214 | TestTrieContents({c, b, a}); |
215 | TestTrieContents({d, c, b, a, "" }); |
216 | TestTrieContents({c, b, a, "" }); |
217 | } |
218 | |
219 | TEST(Trie, NegativeChars) { |
220 | // Test with characters >= 0x80 (to check the absence of sign issues) |
221 | TestTrieContents({"\x7f\x80\x81\xff" , "\x7f\x80\x81" , "\x7f\xff\x81" , "\xff\x80\x81" }); |
222 | } |
223 | |
224 | TEST(Trie, CSVNulls) { TestTrieContents(AllNulls()); } |
225 | |
226 | TEST(Trie, Duplicates) { |
227 | { |
228 | TrieBuilder builder; |
229 | ASSERT_OK(builder.Append("ab" )); |
230 | ASSERT_OK(builder.Append("abc" )); |
231 | ASSERT_RAISES(Invalid, builder.Append("abc" )); |
232 | ASSERT_OK(builder.Append("abcd" )); |
233 | ASSERT_RAISES(Invalid, builder.Append("ab" )); |
234 | ASSERT_OK(builder.Append("abcde" )); |
235 | const Trie trie = builder.Finish(); |
236 | |
237 | TestTrieContents(trie, {"ab" , "abc" , "abcd" , "abcde" }); |
238 | } |
239 | { |
240 | // With allow_duplicates = true |
241 | TrieBuilder builder; |
242 | ASSERT_OK(builder.Append("ab" , true)); |
243 | ASSERT_OK(builder.Append("abc" , true)); |
244 | ASSERT_OK(builder.Append("abc" , true)); |
245 | ASSERT_OK(builder.Append("abcd" , true)); |
246 | ASSERT_OK(builder.Append("ab" , true)); |
247 | ASSERT_OK(builder.Append("abcde" , true)); |
248 | const Trie trie = builder.Finish(); |
249 | |
250 | TestTrieContents(trie, {"ab" , "abc" , "abcd" , "abcde" }); |
251 | } |
252 | } |
253 | |
254 | TEST(Trie, CapacityError) { |
255 | // A trie uses 16-bit indices into various internal structures and |
256 | // therefore has limited size available. |
257 | TrieBuilder builder; |
258 | uint8_t first, second, third; |
259 | bool had_capacity_error = false; |
260 | uint8_t s[] = "\x00\x00\x00\x00" ; |
261 | |
262 | for (first = 1; first < 125; ++first) { |
263 | s[0] = first; |
264 | for (second = 1; second < 125; ++second) { |
265 | s[1] = second; |
266 | for (third = 1; third < 125; ++third) { |
267 | s[2] = third; |
268 | auto st = builder.Append(reinterpret_cast<const char*>(s)); |
269 | if (st.IsCapacityError()) { |
270 | DCHECK_GE(first, 2); |
271 | had_capacity_error = true; |
272 | break; |
273 | } else { |
274 | ASSERT_OK(st); |
275 | } |
276 | } |
277 | } |
278 | } |
279 | ASSERT_TRUE(had_capacity_error) << "Should have produced CapacityError" ; |
280 | } |
281 | |
282 | } // namespace internal |
283 | } // namespace arrow |
284 | |