1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#ifndef ARROW_UTIL_TRIE_H
19#define ARROW_UTIL_TRIE_H
20
21#include <cassert>
22#include <cstdint>
23#include <cstring>
24#include <limits>
25#include <string>
26#include <utility>
27#include <vector>
28
29#include "arrow/status.h"
30#include "arrow/util/macros.h"
31#include "arrow/util/string_view.h"
32#include "arrow/util/visibility.h"
33
34namespace arrow {
35namespace internal {
36
37// A non-zero-terminated small string class.
38// std::string usually has a small string optimization
39// (see review at https://shaharmike.com/cpp/std-string/)
40// but this one allows tight control and optimization of memory layout.
41template <uint8_t N>
42class SmallString {
43 public:
44 SmallString() : length_(0) {}
45
46 template <typename T>
47 SmallString(const T& v) { // NOLINT implicit constructor
48 *this = util::string_view(v);
49 }
50
51 SmallString& operator=(const util::string_view s) {
52#ifndef NDEBUG
53 CheckSize(s.size());
54#endif
55 length_ = static_cast<uint8_t>(s.size());
56 std::memcpy(data_, s.data(), length_);
57 return *this;
58 }
59
60 SmallString& operator=(const std::string& s) {
61 *this = util::string_view(s);
62 return *this;
63 }
64
65 SmallString& operator=(const char* s) {
66 *this = util::string_view(s);
67 return *this;
68 }
69
70 explicit operator util::string_view() const {
71 return util::string_view(data_, length_);
72 }
73
74 const char* data() const { return data_; }
75 size_t length() const { return length_; }
76 bool empty() const { return length_ == 0; }
77 char operator[](size_t pos) const {
78#ifdef NDEBUG
79 assert(pos <= length_);
80#endif
81 return data_[pos];
82 }
83
84 SmallString substr(size_t pos) const {
85 return SmallString(util::string_view(*this).substr(pos));
86 }
87
88 SmallString substr(size_t pos, size_t count) const {
89 return SmallString(util::string_view(*this).substr(pos, count));
90 }
91
92 template <typename T>
93 bool operator==(T&& other) const {
94 return util::string_view(*this) == util::string_view(std::forward<T>(other));
95 }
96
97 template <typename T>
98 bool operator!=(T&& other) const {
99 return util::string_view(*this) != util::string_view(std::forward<T>(other));
100 }
101
102 protected:
103 uint8_t length_;
104 char data_[N];
105
106#ifndef NDEBUG
107 void CheckSize(size_t n) { assert(n <= N); }
108#endif
109};
110
111template <uint8_t N>
112std::ostream& operator<<(std::ostream& os, const SmallString<N>& str) {
113 return os << util::string_view(str);
114}
115
116// A trie class for byte strings, optimized for small sets of short strings.
117// This class is immutable by design, use a TrieBuilder to construct it.
118class ARROW_EXPORT Trie {
119 using index_type = int16_t;
120 using fast_index_type = int_fast16_t;
121
122 public:
123 Trie() : size_(0) {}
124 Trie(Trie&&) = default;
125 Trie& operator=(Trie&&) = default;
126
127 int32_t Find(util::string_view s) const {
128 const Node* node = &nodes_[0];
129 fast_index_type pos = 0;
130 fast_index_type remaining = static_cast<fast_index_type>(s.length());
131
132 while (remaining > 0) {
133 auto substring_length = node->substring_length();
134 if (substring_length > 0) {
135 auto substring_data = node->substring_data();
136 if (remaining < substring_length) {
137 // Input too short
138 return -1;
139 }
140 for (fast_index_type i = 0; i < substring_length; ++i) {
141 if (s[pos++] != substring_data[i]) {
142 // Mismatching substring
143 return -1;
144 }
145 --remaining;
146 }
147 if (remaining == 0) {
148 // Matched node exactly
149 return node->found_index_;
150 }
151 }
152 // Lookup child using next input character
153 if (node->child_lookup_ == -1) {
154 // Input too long
155 return -1;
156 }
157 auto c = static_cast<uint8_t>(s[pos++]);
158 --remaining;
159 auto child_index = lookup_table_[node->child_lookup_ * 256 + c];
160 if (child_index == -1) {
161 // Child not found
162 return -1;
163 }
164 node = &nodes_[child_index];
165 }
166
167 // Input exhausted
168 if (node->substring_.empty()) {
169 // Matched node exactly
170 return node->found_index_;
171 } else {
172 return -1;
173 }
174 }
175
176 Status Validate() const;
177
178 void Dump() const;
179
180 protected:
181 static constexpr size_t kNodeSize = 16;
182 static constexpr auto kMaxSubstringLength =
183 kNodeSize - 2 * sizeof(index_type) - sizeof(int8_t);
184
185 struct Node {
186 // If this node is a valid end of string, index of found string, otherwise -1
187 index_type found_index_;
188 // Base index for child lookup in lookup_table_ (-1 if no child nodes)
189 index_type child_lookup_;
190 // The substring for this node.
191 SmallString<kMaxSubstringLength> substring_;
192
193 fast_index_type substring_length() const {
194 return static_cast<fast_index_type>(substring_.length());
195 }
196 const char* substring_data() const { return substring_.data(); }
197 };
198
199 static_assert(sizeof(Node) == kNodeSize, "Unexpected node size");
200
201 ARROW_DISALLOW_COPY_AND_ASSIGN(Trie);
202
203 void Dump(const Node* node, const std::string& indent) const;
204
205 // Node table: entry 0 is the root node
206 std::vector<Node> nodes_;
207
208 // Indexed lookup structure: gives index in node table, or -1 if not found
209 std::vector<index_type> lookup_table_;
210
211 // Number of entries
212 index_type size_;
213
214 friend class TrieBuilder;
215};
216
217class ARROW_EXPORT TrieBuilder {
218 using index_type = Trie::index_type;
219 using fast_index_type = Trie::fast_index_type;
220
221 public:
222 TrieBuilder();
223 Status Append(util::string_view s, bool allow_duplicate = false);
224 Trie Finish();
225
226 protected:
227 // Extend the lookup table by 256 entries, return the index of the new span
228 Status ExtendLookupTable(index_type* out_lookup_index);
229 // Split the node given by the index at the substring index `split_at`
230 Status SplitNode(fast_index_type node_index, fast_index_type split_at);
231 // Append an already constructed child node to the parent
232 Status AppendChildNode(Trie::Node* parent, uint8_t ch, Trie::Node&& node);
233 // Create a matching child node from this parent
234 Status CreateChildNode(Trie::Node* parent, uint8_t ch, util::string_view substring);
235 Status CreateChildNode(Trie::Node* parent, char ch, util::string_view substring);
236
237 Trie trie_;
238
239 static constexpr auto kMaxIndex = std::numeric_limits<index_type>::max();
240};
241
242} // namespace internal
243} // namespace arrow
244
245#endif // ARROW_UTIL_TRIE_H
246