1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #include "arrow/util/trie.h" |
19 | |
20 | #include <iostream> |
21 | #include <utility> |
22 | |
23 | #include "arrow/util/logging.h" |
24 | |
25 | namespace arrow { |
26 | namespace internal { |
27 | |
28 | Status Trie::Validate() const { |
29 | const auto n_nodes = static_cast<fast_index_type>(nodes_.size()); |
30 | if (size_ > n_nodes) { |
31 | return Status::Invalid("Number of entries larger than number of nodes" ); |
32 | } |
33 | for (const auto& node : nodes_) { |
34 | if (node.found_index_ >= size_) { |
35 | return Status::Invalid("Found index >= size" ); |
36 | } |
37 | if (node.child_lookup_ != -1 && |
38 | node.child_lookup_ * 256 > |
39 | static_cast<fast_index_type>(lookup_table_.size() - 256)) { |
40 | return Status::Invalid("Child lookup base doesn't point to 256 valid indices" ); |
41 | } |
42 | } |
43 | for (const auto index : lookup_table_) { |
44 | if (index >= n_nodes) { |
45 | return Status::Invalid("Child lookup index out of bounds" ); |
46 | } |
47 | } |
48 | return Status::OK(); |
49 | } |
50 | |
51 | void Trie::Dump(const Node* node, const std::string& indent) const { |
52 | std::cerr << "[\"" << node->substring_ << "\"]" ; |
53 | if (node->found_index_ >= 0) { |
54 | std::cerr << " *" ; |
55 | } |
56 | std::cerr << "\n" ; |
57 | if (node->child_lookup_ >= 0) { |
58 | auto child_indent = indent + " " ; |
59 | std::cerr << child_indent << "|\n" ; |
60 | for (fast_index_type i = 0; i < 256; ++i) { |
61 | auto child_index = lookup_table_[node->child_lookup_ * 256 + i]; |
62 | if (child_index >= 0) { |
63 | const Node* child = &nodes_[child_index]; |
64 | std::cerr << child_indent << "|-> '" << static_cast<char>(i) << "' (" << i |
65 | << ") -> " ; |
66 | Dump(child, child_indent); |
67 | } |
68 | } |
69 | } |
70 | } |
71 | |
72 | void Trie::Dump() const { Dump(&nodes_[0], "" ); } |
73 | |
74 | TrieBuilder::TrieBuilder() { trie_.nodes_.push_back(Trie::Node{-1, -1, "" }); } |
75 | |
76 | Status TrieBuilder::AppendChildNode(Trie::Node* parent, uint8_t ch, Trie::Node&& node) { |
77 | if (parent->child_lookup_ == -1) { |
78 | RETURN_NOT_OK(ExtendLookupTable(&parent->child_lookup_)); |
79 | } |
80 | auto parent_lookup = parent->child_lookup_ * 256 + ch; |
81 | |
82 | DCHECK_EQ(trie_.lookup_table_[parent_lookup], -1); |
83 | if (trie_.nodes_.size() >= static_cast<size_t>(kMaxIndex)) { |
84 | return Status::CapacityError("Trie out of bounds" ); |
85 | } |
86 | trie_.nodes_.push_back(std::move(node)); |
87 | trie_.lookup_table_[parent_lookup] = static_cast<index_type>(trie_.nodes_.size() - 1); |
88 | return Status::OK(); |
89 | } |
90 | |
91 | Status TrieBuilder::CreateChildNode(Trie::Node* parent, uint8_t ch, |
92 | util::string_view substring) { |
93 | const auto kMaxSubstringLength = Trie::kMaxSubstringLength; |
94 | |
95 | while (substring.length() > kMaxSubstringLength) { |
96 | // Substring doesn't fit in node => create intermediate node |
97 | auto mid_node = Trie::Node{-1, -1, substring.substr(0, kMaxSubstringLength)}; |
98 | RETURN_NOT_OK(AppendChildNode(parent, ch, std::move(mid_node))); |
99 | // Recurse |
100 | parent = &trie_.nodes_.back(); |
101 | ch = static_cast<uint8_t>(substring[kMaxSubstringLength]); |
102 | substring = substring.substr(kMaxSubstringLength + 1); |
103 | } |
104 | |
105 | // Create final matching node |
106 | auto child_node = Trie::Node{trie_.size_, -1, substring}; |
107 | RETURN_NOT_OK(AppendChildNode(parent, ch, std::move(child_node))); |
108 | ++trie_.size_; |
109 | return Status::OK(); |
110 | } |
111 | |
112 | Status TrieBuilder::CreateChildNode(Trie::Node* parent, char ch, |
113 | util::string_view substring) { |
114 | return CreateChildNode(parent, static_cast<uint8_t>(ch), substring); |
115 | } |
116 | |
117 | Status TrieBuilder::ExtendLookupTable(index_type* out_index) { |
118 | auto cur_size = trie_.lookup_table_.size(); |
119 | auto cur_index = cur_size / 256; |
120 | if (cur_index > static_cast<size_t>(kMaxIndex)) { |
121 | return Status::CapacityError("Trie out of bounds" ); |
122 | } |
123 | trie_.lookup_table_.resize(cur_size + 256, -1); |
124 | *out_index = static_cast<index_type>(cur_index); |
125 | return Status::OK(); |
126 | } |
127 | |
128 | Status TrieBuilder::SplitNode(fast_index_type node_index, fast_index_type split_at) { |
129 | Trie::Node* node = &trie_.nodes_[node_index]; |
130 | |
131 | DCHECK_LT(split_at, node->substring_length()); |
132 | |
133 | // Before: |
134 | // {node} -> [...] |
135 | // After: |
136 | // {node} -> [c] -> {out_node} -> [...] |
137 | auto child_node = Trie::Node{node->found_index_, node->child_lookup_, |
138 | node->substring_.substr(split_at + 1)}; |
139 | auto ch = node->substring_[split_at]; |
140 | node->child_lookup_ = -1; |
141 | node->found_index_ = -1; |
142 | node->substring_ = node->substring_.substr(0, split_at); |
143 | RETURN_NOT_OK(AppendChildNode(node, ch, std::move(child_node))); |
144 | |
145 | return Status::OK(); |
146 | } |
147 | |
148 | Status TrieBuilder::Append(util::string_view s, bool allow_duplicate) { |
149 | // Find or create node for string |
150 | fast_index_type node_index = 0; |
151 | fast_index_type pos = 0; |
152 | fast_index_type remaining = static_cast<fast_index_type>(s.length()); |
153 | |
154 | while (true) { |
155 | Trie::Node* node = &trie_.nodes_[node_index]; |
156 | const auto substring_length = node->substring_length(); |
157 | const auto substring_data = node->substring_data(); |
158 | |
159 | for (fast_index_type i = 0; i < substring_length; ++i) { |
160 | if (remaining == 0) { |
161 | // New string too short => need to split node |
162 | RETURN_NOT_OK(SplitNode(node_index, i)); |
163 | // Current node matches exactly |
164 | node = &trie_.nodes_[node_index]; |
165 | node->found_index_ = trie_.size_++; |
166 | return Status::OK(); |
167 | } |
168 | if (s[pos] != substring_data[i]) { |
169 | // Mismatching substring => need to split node |
170 | RETURN_NOT_OK(SplitNode(node_index, i)); |
171 | // Create new node for mismatching char |
172 | node = &trie_.nodes_[node_index]; |
173 | return CreateChildNode(node, s[pos], s.substr(pos + 1)); |
174 | } |
175 | ++pos; |
176 | --remaining; |
177 | } |
178 | if (remaining == 0) { |
179 | // Node matches exactly |
180 | if (node->found_index_ >= 0) { |
181 | if (allow_duplicate) { |
182 | return Status::OK(); |
183 | } else { |
184 | return Status::Invalid("Duplicate entry in trie" ); |
185 | } |
186 | } |
187 | node->found_index_ = trie_.size_++; |
188 | return Status::OK(); |
189 | } |
190 | // Lookup child using next input character |
191 | if (node->child_lookup_ == -1) { |
192 | // Need to extend lookup table for this node |
193 | RETURN_NOT_OK(ExtendLookupTable(&node->child_lookup_)); |
194 | } |
195 | auto c = static_cast<uint8_t>(s[pos++]); |
196 | --remaining; |
197 | node_index = trie_.lookup_table_[node->child_lookup_ * 256 + c]; |
198 | if (node_index == -1) { |
199 | // Child not found => need to create child node |
200 | return CreateChildNode(node, c, s.substr(pos)); |
201 | } |
202 | node = &trie_.nodes_[node_index]; |
203 | } |
204 | } |
205 | |
206 | Trie TrieBuilder::Finish() { return std::move(trie_); } |
207 | |
208 | } // namespace internal |
209 | } // namespace arrow |
210 | |