1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include <algorithm>
19#include <cstdint>
20#include <cstring>
21#include <string>
22#include <unordered_map>
23#include <utility>
24#include <vector>
25
26#include <gtest/gtest.h>
27
28#include "arrow/test-util.h"
29#include "arrow/util/trie.h"
30
31namespace arrow {
32namespace internal {
33
34TEST(SmallString, Basics) {
35 using SS = SmallString<5>;
36 {
37 SS s;
38 ASSERT_EQ(s.length(), 0);
39 ASSERT_EQ(util::string_view(s), util::string_view(""));
40 ASSERT_EQ(s, "");
41 ASSERT_NE(s, "x");
42 ASSERT_EQ(sizeof(s), 6);
43 }
44 {
45 SS s("abc");
46 ASSERT_EQ(s.length(), 3);
47 ASSERT_EQ(util::string_view(s), util::string_view("abc"));
48 ASSERT_EQ(std::memcmp(s.data(), "abc", 3), 0);
49 ASSERT_EQ(s, "abc");
50 ASSERT_NE(s, "ab");
51 }
52}
53
54TEST(SmallString, Assign) {
55 using SS = SmallString<5>;
56 auto s = SS();
57
58 s = util::string_view("abc");
59 ASSERT_EQ(s.length(), 3);
60 ASSERT_EQ(util::string_view(s), util::string_view("abc"));
61 ASSERT_EQ(std::memcmp(s.data(), "abc", 3), 0);
62 ASSERT_EQ(s, "abc");
63 ASSERT_NE(s, "ab");
64
65 s = std::string("ghijk");
66 ASSERT_EQ(s.length(), 5);
67 ASSERT_EQ(util::string_view(s), util::string_view("ghijk"));
68 ASSERT_EQ(std::memcmp(s.data(), "ghijk", 5), 0);
69 ASSERT_EQ(s, "ghijk");
70 ASSERT_NE(s, "");
71
72 s = SS("xy");
73 ASSERT_EQ(s.length(), 2);
74 ASSERT_EQ(util::string_view(s), util::string_view("xy"));
75 ASSERT_EQ(std::memcmp(s.data(), "xy", 2), 0);
76 ASSERT_EQ(s, "xy");
77 ASSERT_NE(s, "xyz");
78}
79
80TEST(SmallString, Substr) {
81 using SS = SmallString<5>;
82 {
83 auto s = SS();
84 ASSERT_EQ(s.substr(0), "");
85 ASSERT_EQ(s.substr(0, 2), "");
86 }
87 {
88 auto s = SS("abcd");
89 ASSERT_EQ(s.substr(0), "abcd");
90 ASSERT_EQ(s.substr(1), "bcd");
91 ASSERT_EQ(s.substr(4), "");
92 ASSERT_EQ(s.substr(0, 0), "");
93 ASSERT_EQ(s.substr(0, 3), "abc");
94 ASSERT_EQ(s.substr(0, 4), "abcd");
95 ASSERT_EQ(s.substr(1, 0), "");
96 ASSERT_EQ(s.substr(1, 2), "bc");
97 ASSERT_EQ(s.substr(4, 0), "");
98 ASSERT_EQ(s.substr(4, 1), "");
99 }
100}
101
102static std::vector<std::string> AllNulls() {
103 return {"#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND",
104 "1.#QNAN", "N/A", "NA", "NULL", "NaN", "n/a", "nan", "null"};
105}
106
107static void TestTrieContents(const Trie& trie, const std::vector<std::string>& entries) {
108 std::unordered_map<std::string, int32_t> control;
109 auto n_entries = static_cast<int32_t>(entries.size());
110
111 // Build control container
112 for (int32_t i = 0; i < n_entries; ++i) {
113 auto p = control.insert({entries[i], i});
114 ASSERT_TRUE(p.second);
115 }
116
117 // Check all existing entries in trie
118 for (int32_t i = 0; i < n_entries; ++i) {
119 ASSERT_EQ(i, trie.Find(entries[i])) << "for string '" << entries[i] << "'";
120 }
121
122 auto CheckNotExists = [&control, &trie](const std::string& s) {
123 auto p = control.find(s);
124 if (p == control.end()) {
125 ASSERT_EQ(-1, trie.Find(s)) << "for string '" << s << "'";
126 }
127 };
128
129 // Check potentially non-existing strings
130 CheckNotExists("");
131 CheckNotExists("X");
132 CheckNotExists("abcdefxxxxxxxxxxxxxxx");
133
134 // Check potentially non-existing variations of existing entries
135 for (const auto& e : entries) {
136 CheckNotExists(e + "X");
137 if (e.size() > 0) {
138 CheckNotExists(e.substr(0, 1));
139 auto prefix = e.substr(0, e.size() - 1);
140 CheckNotExists(prefix);
141 CheckNotExists(prefix + "X");
142 auto split_at = e.size() / 2;
143 CheckNotExists(e.substr(0, split_at) + 'x' + e.substr(split_at + 1));
144 }
145 }
146}
147
148static void TestTrieContents(const std::vector<std::string>& entries) {
149 TrieBuilder builder;
150 for (const auto& s : entries) {
151 ASSERT_OK(builder.Append(s));
152 }
153 const Trie trie = builder.Finish();
154 ASSERT_OK(trie.Validate());
155
156 TestTrieContents(trie, entries);
157}
158
159TEST(Trie, Empty) {
160 TrieBuilder builder;
161 const Trie trie = builder.Finish();
162 ASSERT_OK(trie.Validate());
163
164 ASSERT_EQ(-1, trie.Find(""));
165 ASSERT_EQ(-1, trie.Find("x"));
166}
167
168TEST(Trie, EmptyString) {
169 TrieBuilder builder;
170 ASSERT_OK(builder.Append(""));
171 const Trie trie = builder.Finish();
172 ASSERT_OK(trie.Validate());
173
174 ASSERT_EQ(0, trie.Find(""));
175 ASSERT_EQ(-1, trie.Find("x"));
176}
177
178TEST(Trie, Basics1) {
179 TestTrieContents({"abc", "de", "f"});
180 TestTrieContents({"abc", "de", "f", ""});
181}
182
183TEST(Trie, Basics2) {
184 TestTrieContents({"a", "abc", "abcd", "abcdef"});
185 TestTrieContents({"", "a", "abc", "abcd", "abcdef"});
186}
187
188TEST(Trie, Basics3) {
189 TestTrieContents({"abcd", "ab", "a"});
190 TestTrieContents({"abcd", "ab", "a", ""});
191}
192
193TEST(Trie, LongStrings) {
194 TestTrieContents({"abcdefghijklmnopqr", "abcdefghijklmnoprq", "defghijklmnopqrst"});
195 TestTrieContents({"abcdefghijklmnopqr", "abcdefghijklmnoprq", "abcde"});
196}
197
198TEST(Trie, NullChars) {
199 const std::string empty;
200 const std::string nul(1, '\x00');
201 std::string a, b, c, d;
202 a = "x" + nul + "y";
203 b = "x" + nul + "z";
204 c = nul + "y";
205 d = nul;
206 ASSERT_EQ(a.length(), 3);
207 ASSERT_EQ(d.length(), 1);
208
209 TestTrieContents({a, b, c, d});
210 TestTrieContents({a, b, c});
211 TestTrieContents({a, b, c, d, ""});
212 TestTrieContents({a, b, c, ""});
213 TestTrieContents({d, c, b, a});
214 TestTrieContents({c, b, a});
215 TestTrieContents({d, c, b, a, ""});
216 TestTrieContents({c, b, a, ""});
217}
218
219TEST(Trie, NegativeChars) {
220 // Test with characters >= 0x80 (to check the absence of sign issues)
221 TestTrieContents({"\x7f\x80\x81\xff", "\x7f\x80\x81", "\x7f\xff\x81", "\xff\x80\x81"});
222}
223
224TEST(Trie, CSVNulls) { TestTrieContents(AllNulls()); }
225
226TEST(Trie, Duplicates) {
227 {
228 TrieBuilder builder;
229 ASSERT_OK(builder.Append("ab"));
230 ASSERT_OK(builder.Append("abc"));
231 ASSERT_RAISES(Invalid, builder.Append("abc"));
232 ASSERT_OK(builder.Append("abcd"));
233 ASSERT_RAISES(Invalid, builder.Append("ab"));
234 ASSERT_OK(builder.Append("abcde"));
235 const Trie trie = builder.Finish();
236
237 TestTrieContents(trie, {"ab", "abc", "abcd", "abcde"});
238 }
239 {
240 // With allow_duplicates = true
241 TrieBuilder builder;
242 ASSERT_OK(builder.Append("ab", true));
243 ASSERT_OK(builder.Append("abc", true));
244 ASSERT_OK(builder.Append("abc", true));
245 ASSERT_OK(builder.Append("abcd", true));
246 ASSERT_OK(builder.Append("ab", true));
247 ASSERT_OK(builder.Append("abcde", true));
248 const Trie trie = builder.Finish();
249
250 TestTrieContents(trie, {"ab", "abc", "abcd", "abcde"});
251 }
252}
253
254TEST(Trie, CapacityError) {
255 // A trie uses 16-bit indices into various internal structures and
256 // therefore has limited size available.
257 TrieBuilder builder;
258 uint8_t first, second, third;
259 bool had_capacity_error = false;
260 uint8_t s[] = "\x00\x00\x00\x00";
261
262 for (first = 1; first < 125; ++first) {
263 s[0] = first;
264 for (second = 1; second < 125; ++second) {
265 s[1] = second;
266 for (third = 1; third < 125; ++third) {
267 s[2] = third;
268 auto st = builder.Append(reinterpret_cast<const char*>(s));
269 if (st.IsCapacityError()) {
270 DCHECK_GE(first, 2);
271 had_capacity_error = true;
272 break;
273 } else {
274 ASSERT_OK(st);
275 }
276 }
277 }
278 }
279 ASSERT_TRUE(had_capacity_error) << "Should have produced CapacityError";
280}
281
282} // namespace internal
283} // namespace arrow
284