1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #ifndef ARROW_UTIL_TRIE_H |
19 | #define ARROW_UTIL_TRIE_H |
20 | |
21 | #include <cassert> |
22 | #include <cstdint> |
23 | #include <cstring> |
24 | #include <limits> |
25 | #include <string> |
26 | #include <utility> |
27 | #include <vector> |
28 | |
29 | #include "arrow/status.h" |
30 | #include "arrow/util/macros.h" |
31 | #include "arrow/util/string_view.h" |
32 | #include "arrow/util/visibility.h" |
33 | |
34 | namespace arrow { |
35 | namespace internal { |
36 | |
37 | // A non-zero-terminated small string class. |
38 | // std::string usually has a small string optimization |
39 | // (see review at https://shaharmike.com/cpp/std-string/) |
40 | // but this one allows tight control and optimization of memory layout. |
41 | template <uint8_t N> |
42 | class SmallString { |
43 | public: |
44 | SmallString() : length_(0) {} |
45 | |
46 | template <typename T> |
47 | SmallString(const T& v) { // NOLINT implicit constructor |
48 | *this = util::string_view(v); |
49 | } |
50 | |
51 | SmallString& operator=(const util::string_view s) { |
52 | #ifndef NDEBUG |
53 | CheckSize(s.size()); |
54 | #endif |
55 | length_ = static_cast<uint8_t>(s.size()); |
56 | std::memcpy(data_, s.data(), length_); |
57 | return *this; |
58 | } |
59 | |
60 | SmallString& operator=(const std::string& s) { |
61 | *this = util::string_view(s); |
62 | return *this; |
63 | } |
64 | |
65 | SmallString& operator=(const char* s) { |
66 | *this = util::string_view(s); |
67 | return *this; |
68 | } |
69 | |
70 | explicit operator util::string_view() const { |
71 | return util::string_view(data_, length_); |
72 | } |
73 | |
74 | const char* data() const { return data_; } |
75 | size_t length() const { return length_; } |
76 | bool empty() const { return length_ == 0; } |
77 | char operator[](size_t pos) const { |
78 | #ifdef NDEBUG |
79 | assert(pos <= length_); |
80 | #endif |
81 | return data_[pos]; |
82 | } |
83 | |
84 | SmallString substr(size_t pos) const { |
85 | return SmallString(util::string_view(*this).substr(pos)); |
86 | } |
87 | |
88 | SmallString substr(size_t pos, size_t count) const { |
89 | return SmallString(util::string_view(*this).substr(pos, count)); |
90 | } |
91 | |
92 | template <typename T> |
93 | bool operator==(T&& other) const { |
94 | return util::string_view(*this) == util::string_view(std::forward<T>(other)); |
95 | } |
96 | |
97 | template <typename T> |
98 | bool operator!=(T&& other) const { |
99 | return util::string_view(*this) != util::string_view(std::forward<T>(other)); |
100 | } |
101 | |
102 | protected: |
103 | uint8_t length_; |
104 | char data_[N]; |
105 | |
106 | #ifndef NDEBUG |
107 | void CheckSize(size_t n) { assert(n <= N); } |
108 | #endif |
109 | }; |
110 | |
111 | template <uint8_t N> |
112 | std::ostream& operator<<(std::ostream& os, const SmallString<N>& str) { |
113 | return os << util::string_view(str); |
114 | } |
115 | |
116 | // A trie class for byte strings, optimized for small sets of short strings. |
117 | // This class is immutable by design, use a TrieBuilder to construct it. |
118 | class ARROW_EXPORT Trie { |
119 | using index_type = int16_t; |
120 | using fast_index_type = int_fast16_t; |
121 | |
122 | public: |
123 | Trie() : size_(0) {} |
124 | Trie(Trie&&) = default; |
125 | Trie& operator=(Trie&&) = default; |
126 | |
127 | int32_t Find(util::string_view s) const { |
128 | const Node* node = &nodes_[0]; |
129 | fast_index_type pos = 0; |
130 | fast_index_type remaining = static_cast<fast_index_type>(s.length()); |
131 | |
132 | while (remaining > 0) { |
133 | auto substring_length = node->substring_length(); |
134 | if (substring_length > 0) { |
135 | auto substring_data = node->substring_data(); |
136 | if (remaining < substring_length) { |
137 | // Input too short |
138 | return -1; |
139 | } |
140 | for (fast_index_type i = 0; i < substring_length; ++i) { |
141 | if (s[pos++] != substring_data[i]) { |
142 | // Mismatching substring |
143 | return -1; |
144 | } |
145 | --remaining; |
146 | } |
147 | if (remaining == 0) { |
148 | // Matched node exactly |
149 | return node->found_index_; |
150 | } |
151 | } |
152 | // Lookup child using next input character |
153 | if (node->child_lookup_ == -1) { |
154 | // Input too long |
155 | return -1; |
156 | } |
157 | auto c = static_cast<uint8_t>(s[pos++]); |
158 | --remaining; |
159 | auto child_index = lookup_table_[node->child_lookup_ * 256 + c]; |
160 | if (child_index == -1) { |
161 | // Child not found |
162 | return -1; |
163 | } |
164 | node = &nodes_[child_index]; |
165 | } |
166 | |
167 | // Input exhausted |
168 | if (node->substring_.empty()) { |
169 | // Matched node exactly |
170 | return node->found_index_; |
171 | } else { |
172 | return -1; |
173 | } |
174 | } |
175 | |
176 | Status Validate() const; |
177 | |
178 | void Dump() const; |
179 | |
180 | protected: |
181 | static constexpr size_t kNodeSize = 16; |
182 | static constexpr auto kMaxSubstringLength = |
183 | kNodeSize - 2 * sizeof(index_type) - sizeof(int8_t); |
184 | |
185 | struct Node { |
186 | // If this node is a valid end of string, index of found string, otherwise -1 |
187 | index_type found_index_; |
188 | // Base index for child lookup in lookup_table_ (-1 if no child nodes) |
189 | index_type child_lookup_; |
190 | // The substring for this node. |
191 | SmallString<kMaxSubstringLength> substring_; |
192 | |
193 | fast_index_type substring_length() const { |
194 | return static_cast<fast_index_type>(substring_.length()); |
195 | } |
196 | const char* substring_data() const { return substring_.data(); } |
197 | }; |
198 | |
199 | static_assert(sizeof(Node) == kNodeSize, "Unexpected node size" ); |
200 | |
201 | ARROW_DISALLOW_COPY_AND_ASSIGN(Trie); |
202 | |
203 | void Dump(const Node* node, const std::string& indent) const; |
204 | |
205 | // Node table: entry 0 is the root node |
206 | std::vector<Node> nodes_; |
207 | |
208 | // Indexed lookup structure: gives index in node table, or -1 if not found |
209 | std::vector<index_type> lookup_table_; |
210 | |
211 | // Number of entries |
212 | index_type size_; |
213 | |
214 | friend class TrieBuilder; |
215 | }; |
216 | |
217 | class ARROW_EXPORT TrieBuilder { |
218 | using index_type = Trie::index_type; |
219 | using fast_index_type = Trie::fast_index_type; |
220 | |
221 | public: |
222 | TrieBuilder(); |
223 | Status Append(util::string_view s, bool allow_duplicate = false); |
224 | Trie Finish(); |
225 | |
226 | protected: |
227 | // Extend the lookup table by 256 entries, return the index of the new span |
228 | Status ExtendLookupTable(index_type* out_lookup_index); |
229 | // Split the node given by the index at the substring index `split_at` |
230 | Status SplitNode(fast_index_type node_index, fast_index_type split_at); |
231 | // Append an already constructed child node to the parent |
232 | Status AppendChildNode(Trie::Node* parent, uint8_t ch, Trie::Node&& node); |
233 | // Create a matching child node from this parent |
234 | Status CreateChildNode(Trie::Node* parent, uint8_t ch, util::string_view substring); |
235 | Status CreateChildNode(Trie::Node* parent, char ch, util::string_view substring); |
236 | |
237 | Trie trie_; |
238 | |
239 | static constexpr auto kMaxIndex = std::numeric_limits<index_type>::max(); |
240 | }; |
241 | |
242 | } // namespace internal |
243 | } // namespace arrow |
244 | |
245 | #endif // ARROW_UTIL_TRIE_H |
246 | |