1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include "arrow/util/trie.h"
19
20#include <iostream>
21#include <utility>
22
23#include "arrow/util/logging.h"
24
25namespace arrow {
26namespace internal {
27
28Status Trie::Validate() const {
29 const auto n_nodes = static_cast<fast_index_type>(nodes_.size());
30 if (size_ > n_nodes) {
31 return Status::Invalid("Number of entries larger than number of nodes");
32 }
33 for (const auto& node : nodes_) {
34 if (node.found_index_ >= size_) {
35 return Status::Invalid("Found index >= size");
36 }
37 if (node.child_lookup_ != -1 &&
38 node.child_lookup_ * 256 >
39 static_cast<fast_index_type>(lookup_table_.size() - 256)) {
40 return Status::Invalid("Child lookup base doesn't point to 256 valid indices");
41 }
42 }
43 for (const auto index : lookup_table_) {
44 if (index >= n_nodes) {
45 return Status::Invalid("Child lookup index out of bounds");
46 }
47 }
48 return Status::OK();
49}
50
51void Trie::Dump(const Node* node, const std::string& indent) const {
52 std::cerr << "[\"" << node->substring_ << "\"]";
53 if (node->found_index_ >= 0) {
54 std::cerr << " *";
55 }
56 std::cerr << "\n";
57 if (node->child_lookup_ >= 0) {
58 auto child_indent = indent + " ";
59 std::cerr << child_indent << "|\n";
60 for (fast_index_type i = 0; i < 256; ++i) {
61 auto child_index = lookup_table_[node->child_lookup_ * 256 + i];
62 if (child_index >= 0) {
63 const Node* child = &nodes_[child_index];
64 std::cerr << child_indent << "|-> '" << static_cast<char>(i) << "' (" << i
65 << ") -> ";
66 Dump(child, child_indent);
67 }
68 }
69 }
70}
71
72void Trie::Dump() const { Dump(&nodes_[0], ""); }
73
74TrieBuilder::TrieBuilder() { trie_.nodes_.push_back(Trie::Node{-1, -1, ""}); }
75
76Status TrieBuilder::AppendChildNode(Trie::Node* parent, uint8_t ch, Trie::Node&& node) {
77 if (parent->child_lookup_ == -1) {
78 RETURN_NOT_OK(ExtendLookupTable(&parent->child_lookup_));
79 }
80 auto parent_lookup = parent->child_lookup_ * 256 + ch;
81
82 DCHECK_EQ(trie_.lookup_table_[parent_lookup], -1);
83 if (trie_.nodes_.size() >= static_cast<size_t>(kMaxIndex)) {
84 return Status::CapacityError("Trie out of bounds");
85 }
86 trie_.nodes_.push_back(std::move(node));
87 trie_.lookup_table_[parent_lookup] = static_cast<index_type>(trie_.nodes_.size() - 1);
88 return Status::OK();
89}
90
91Status TrieBuilder::CreateChildNode(Trie::Node* parent, uint8_t ch,
92 util::string_view substring) {
93 const auto kMaxSubstringLength = Trie::kMaxSubstringLength;
94
95 while (substring.length() > kMaxSubstringLength) {
96 // Substring doesn't fit in node => create intermediate node
97 auto mid_node = Trie::Node{-1, -1, substring.substr(0, kMaxSubstringLength)};
98 RETURN_NOT_OK(AppendChildNode(parent, ch, std::move(mid_node)));
99 // Recurse
100 parent = &trie_.nodes_.back();
101 ch = static_cast<uint8_t>(substring[kMaxSubstringLength]);
102 substring = substring.substr(kMaxSubstringLength + 1);
103 }
104
105 // Create final matching node
106 auto child_node = Trie::Node{trie_.size_, -1, substring};
107 RETURN_NOT_OK(AppendChildNode(parent, ch, std::move(child_node)));
108 ++trie_.size_;
109 return Status::OK();
110}
111
112Status TrieBuilder::CreateChildNode(Trie::Node* parent, char ch,
113 util::string_view substring) {
114 return CreateChildNode(parent, static_cast<uint8_t>(ch), substring);
115}
116
117Status TrieBuilder::ExtendLookupTable(index_type* out_index) {
118 auto cur_size = trie_.lookup_table_.size();
119 auto cur_index = cur_size / 256;
120 if (cur_index > static_cast<size_t>(kMaxIndex)) {
121 return Status::CapacityError("Trie out of bounds");
122 }
123 trie_.lookup_table_.resize(cur_size + 256, -1);
124 *out_index = static_cast<index_type>(cur_index);
125 return Status::OK();
126}
127
128Status TrieBuilder::SplitNode(fast_index_type node_index, fast_index_type split_at) {
129 Trie::Node* node = &trie_.nodes_[node_index];
130
131 DCHECK_LT(split_at, node->substring_length());
132
133 // Before:
134 // {node} -> [...]
135 // After:
136 // {node} -> [c] -> {out_node} -> [...]
137 auto child_node = Trie::Node{node->found_index_, node->child_lookup_,
138 node->substring_.substr(split_at + 1)};
139 auto ch = node->substring_[split_at];
140 node->child_lookup_ = -1;
141 node->found_index_ = -1;
142 node->substring_ = node->substring_.substr(0, split_at);
143 RETURN_NOT_OK(AppendChildNode(node, ch, std::move(child_node)));
144
145 return Status::OK();
146}
147
148Status TrieBuilder::Append(util::string_view s, bool allow_duplicate) {
149 // Find or create node for string
150 fast_index_type node_index = 0;
151 fast_index_type pos = 0;
152 fast_index_type remaining = static_cast<fast_index_type>(s.length());
153
154 while (true) {
155 Trie::Node* node = &trie_.nodes_[node_index];
156 const auto substring_length = node->substring_length();
157 const auto substring_data = node->substring_data();
158
159 for (fast_index_type i = 0; i < substring_length; ++i) {
160 if (remaining == 0) {
161 // New string too short => need to split node
162 RETURN_NOT_OK(SplitNode(node_index, i));
163 // Current node matches exactly
164 node = &trie_.nodes_[node_index];
165 node->found_index_ = trie_.size_++;
166 return Status::OK();
167 }
168 if (s[pos] != substring_data[i]) {
169 // Mismatching substring => need to split node
170 RETURN_NOT_OK(SplitNode(node_index, i));
171 // Create new node for mismatching char
172 node = &trie_.nodes_[node_index];
173 return CreateChildNode(node, s[pos], s.substr(pos + 1));
174 }
175 ++pos;
176 --remaining;
177 }
178 if (remaining == 0) {
179 // Node matches exactly
180 if (node->found_index_ >= 0) {
181 if (allow_duplicate) {
182 return Status::OK();
183 } else {
184 return Status::Invalid("Duplicate entry in trie");
185 }
186 }
187 node->found_index_ = trie_.size_++;
188 return Status::OK();
189 }
190 // Lookup child using next input character
191 if (node->child_lookup_ == -1) {
192 // Need to extend lookup table for this node
193 RETURN_NOT_OK(ExtendLookupTable(&node->child_lookup_));
194 }
195 auto c = static_cast<uint8_t>(s[pos++]);
196 --remaining;
197 node_index = trie_.lookup_table_[node->child_lookup_ * 256 + c];
198 if (node_index == -1) {
199 // Child not found => need to create child node
200 return CreateChildNode(node, c, s.substr(pos));
201 }
202 node = &trie_.nodes_[node_index];
203 }
204}
205
206Trie TrieBuilder::Finish() { return std::move(trie_); }
207
208} // namespace internal
209} // namespace arrow
210