1#ifndef DAWGDIC_DAWG_BUILDER_H
2#define DAWGDIC_DAWG_BUILDER_H
3
4#include <algorithm>
5#include <stack>
6#include <vector>
7
8#include "dawg.h"
9#include "dawg-unit.h"
10
11namespace dawgdic {
12
13// DAWG builder.
14class DawgBuilder {
15 public:
16 explicit DawgBuilder(SizeType initial_hash_table_size =
17 DEFAULT_INITIAL_HASH_TABLE_SIZE)
18 : initial_hash_table_size_(initial_hash_table_size),
19 base_pool_(), label_pool_(), flag_pool_(), unit_pool_(),
20 hash_table_(), unfixed_units_(), unused_units_(), num_of_states_(1),
21 num_of_merged_transitions_(0), num_of_merging_states_(0) {}
22
23 // Number of units.
24 SizeType size() const {
25 return base_pool_.size();
26 }
27 // Number of transitions.
28 SizeType num_of_transitions() const {
29 return base_pool_.size() - 1;
30 }
31 // Number of states.
32 SizeType num_of_states() const {
33 return num_of_states_;
34 }
35 // Number of merged transitions.
36 SizeType num_of_merged_transitions() const {
37 return num_of_merged_transitions_;
38 }
39 // Number of merged states.
40 SizeType num_of_merged_states() const {
41 return num_of_transitions()
42 + num_of_merged_transitions() + 1 - num_of_states();
43 }
44 // Number of merging states.
45 SizeType num_of_merging_states() const {
46 return num_of_merging_states_;
47 }
48
49 // Initializes a builder.
50 void Clear() {
51 base_pool_.Clear();
52 label_pool_.Clear();
53 flag_pool_.Clear();
54 unit_pool_.Clear();
55
56 std::vector<BaseType>(0).swap(hash_table_);
57 while (!unfixed_units_.empty()) {
58 unfixed_units_.pop();
59 }
60 while (!unused_units_.empty()) {
61 unused_units_.pop();
62 }
63
64 num_of_states_ = 1;
65 num_of_merged_transitions_ = 0;
66 num_of_merging_states_ = 0;
67 }
68
69 // Inserts a key.
70 bool Insert(const CharType *key, ValueType value = 0) {
71 if (key == NULL || *key == '\0' || value < 0) {
72 return false;
73 }
74 SizeType length = 1;
75 while (key[length]) {
76 ++length;
77 }
78 return InsertKey(key, length, value);
79 }
80
81 // Inserts a key.
82 bool Insert(const CharType *key, SizeType length, ValueType value) {
83 if (key == NULL || length <= 0 || value < 0) {
84 return false;
85 }
86 for (SizeType i = 0; i < length; ++i) {
87 if (key[i] == '\0') {
88 return false;
89 }
90 }
91 return InsertKey(key, length, value);
92 }
93
94 // Finishes building a dawg.
95 bool Finish(Dawg *dawg) {
96 // Initializes a builder if not initialized.
97 if (hash_table_.empty()) {
98 Init();
99 }
100
101 FixUnits(0);
102 base_pool_[0].set_base(unit_pool_[0].base());
103 label_pool_[0] = unit_pool_[0].label();
104
105 dawg->set_num_of_states(num_of_states_);
106 dawg->set_num_of_merged_transitions(num_of_merged_transitions_);
107 dawg->set_num_of_merged_states(num_of_merged_states());
108 dawg->set_num_of_merging_states(num_of_merging_states_);
109
110 dawg->SwapBasePool(&base_pool_);
111 dawg->SwapLabelPool(&label_pool_);
112 dawg->SwapFlagPool(&flag_pool_);
113
114 Clear();
115 return true;
116 }
117
118 private:
119 enum {
120 DEFAULT_INITIAL_HASH_TABLE_SIZE = 1 << 8
121 };
122
123 const SizeType initial_hash_table_size_;
124 ObjectPool<BaseUnit> base_pool_;
125 ObjectPool<UCharType> label_pool_;
126 BitPool<> flag_pool_;
127 ObjectPool<DawgUnit> unit_pool_;
128 std::vector<BaseType> hash_table_;
129 std::stack<BaseType> unfixed_units_;
130 std::stack<BaseType> unused_units_;
131 SizeType num_of_states_;
132 SizeType num_of_merged_transitions_;
133 SizeType num_of_merging_states_;
134
135 // Disallows copies.
136 DawgBuilder(const DawgBuilder &);
137 DawgBuilder &operator=(const DawgBuilder &);
138
139 // Inserts a key.
140 bool InsertKey(const CharType *key, SizeType length, ValueType value) {
141 // Initializes a builder if not initialized.
142 if (hash_table_.empty()) {
143 Init();
144 }
145
146 BaseType index = 0;
147 SizeType key_pos = 0;
148
149 // Finds a separate unit.
150 for ( ; key_pos <= length; ++key_pos) {
151 BaseType child_index = unit_pool_[index].child();
152 if (!child_index) {
153 break;
154 }
155
156 UCharType key_label = static_cast<UCharType>(
157 (key_pos < length) ? key[key_pos] : '\0');
158 UCharType unit_label = unit_pool_[child_index].label();
159
160 // Checks the order of keys.
161 if (key_label < unit_label) {
162 return false;
163 } else if (key_label > unit_label) {
164 unit_pool_[child_index].set_has_sibling(true);
165 FixUnits(child_index);
166 break;
167 }
168
169 index = child_index;
170 }
171
172 // Adds new units.
173 for ( ; key_pos <= length; ++key_pos) {
174 UCharType key_label = static_cast<UCharType>(
175 (key_pos < length) ? key[key_pos] : '\0');
176 BaseType child_index = AllocateUnit();
177
178 if (!unit_pool_[index].child()) {
179 unit_pool_[child_index].set_is_state(true);
180 }
181 unit_pool_[child_index].set_sibling(unit_pool_[index].child());
182 unit_pool_[child_index].set_label(key_label);
183 unit_pool_[index].set_child(child_index);
184 unfixed_units_.push(child_index);
185
186 index = child_index;
187 }
188 unit_pool_[index].set_value(value);
189 return true;
190 }
191
192 // Initializes an object.
193 void Init() {
194 hash_table_.resize(initial_hash_table_size_, 0);
195 AllocateUnit();
196 AllocateTransition();
197 unit_pool_[0].set_label(0xFF);
198 unfixed_units_.push(0);
199 }
200
201 // Fixes units corresponding to the last inserted key.
202 // Also, some of units are merged into equivalent transitions.
203 void FixUnits(BaseType index) {
204 while (unfixed_units_.top() != index) {
205 BaseType unfixed_index = unfixed_units_.top();
206 unfixed_units_.pop();
207
208 if (num_of_states_ >= hash_table_.size() - (hash_table_.size() >> 2)) {
209 ExpandHashTable();
210 }
211
212 BaseType num_of_siblings = 0;
213 for (BaseType i = unfixed_index; i != 0; i = unit_pool_[i].sibling()) {
214 ++num_of_siblings;
215 }
216
217 BaseType hash_id;
218 BaseType matched_index = FindUnit(unfixed_index, &hash_id);
219 if (matched_index != 0) {
220 num_of_merged_transitions_ += num_of_siblings;
221
222 // Records a merging state.
223 if (flag_pool_.get(matched_index) == false) {
224 ++num_of_merging_states_;
225 flag_pool_.set(matched_index, true);
226 }
227 } else {
228 // Fixes units into pairs of base values and labels.
229 BaseType transition_index = 0;
230 for (BaseType i = 0; i < num_of_siblings; ++i) {
231 transition_index = AllocateTransition();
232 }
233 for (BaseType i = unfixed_index; i != 0; i = unit_pool_[i].sibling()) {
234 base_pool_[transition_index].set_base(unit_pool_[i].base());
235 label_pool_[transition_index] = unit_pool_[i].label();
236 --transition_index;
237 }
238 matched_index = transition_index + 1;
239 hash_table_[hash_id] = matched_index;
240 ++num_of_states_;
241 }
242
243 // Deletes fixed units.
244 for (BaseType current = unfixed_index, next;
245 current != 0; current = next) {
246 next = unit_pool_[current].sibling();
247 FreeUnit(current);
248 }
249
250 unit_pool_[unfixed_units_.top()].set_child(matched_index);
251 }
252 unfixed_units_.pop();
253 }
254
255 // Expands a hash table.
256 void ExpandHashTable() {
257 SizeType hash_table_size = hash_table_.size() << 1;
258 std::vector<BaseType>(0).swap(hash_table_);
259 hash_table_.resize(hash_table_size, 0);
260
261 // Builds a new hash table.
262 BaseType count = 0;
263 for (SizeType i = 1; i < base_pool_.size(); ++i) {
264 BaseType index = static_cast<BaseType>(i);
265 if (label_pool_[index] == '\0' || base_pool_[index].is_state()) {
266 BaseType hash_id;
267 FindTransition(index, &hash_id);
268 hash_table_[hash_id] = index;
269 ++count;
270 }
271 }
272 }
273
274 // Finds a transition from a hash table.
275 BaseType FindTransition(BaseType index, BaseType *hash_id) const {
276 *hash_id = HashTransition(index) % hash_table_.size();
277 for ( ; ; *hash_id = (*hash_id + 1) % hash_table_.size()) {
278 BaseType transition_id = hash_table_[*hash_id];
279 if (transition_id == 0) {
280 break;
281 }
282
283 // There must not be the same base value.
284 }
285 return 0;
286 }
287
288 // Finds a unit from a hash table.
289 BaseType FindUnit(BaseType unit_index, BaseType *hash_id) const {
290 *hash_id = HashUnit(unit_index) % hash_table_.size();
291 for ( ; ; *hash_id = (*hash_id + 1) % hash_table_.size()) {
292 BaseType transition_id = hash_table_[*hash_id];
293 if (transition_id == 0) {
294 break;
295 }
296
297 if (AreEqual(unit_index, transition_id)) {
298 return transition_id;
299 }
300 }
301 return 0;
302 }
303
304 // Compares a unit and a transition.
305 bool AreEqual(BaseType unit_index, BaseType transition_index) const {
306 // Compares the numbers of transitions.
307 for (BaseType i = unit_pool_[unit_index].sibling(); i != 0;
308 i = unit_pool_[i].sibling()) {
309 if (base_pool_[transition_index].has_sibling() == false) {
310 return false;
311 }
312 ++transition_index;
313 }
314 if (base_pool_[transition_index].has_sibling() == true) {
315 return false;
316 }
317
318 // Compares out-transitions.
319 for (BaseType i = unit_index; i;
320 i = unit_pool_[i].sibling(), --transition_index) {
321 if (unit_pool_[i].base() != base_pool_[transition_index].base() ||
322 unit_pool_[i].label() != label_pool_[transition_index]) {
323 return false;
324 }
325 }
326 return true;
327 }
328
329 // Calculates a hash value from a transition.
330 BaseType HashTransition(BaseType index) const {
331 BaseType hash_value = 0;
332 for ( ; index != 0; ++index) {
333 BaseType base = base_pool_[index].base();
334 UCharType label = label_pool_[index];
335 hash_value ^= Hash((label << 24) ^ base);
336
337 if (base_pool_[index].has_sibling() == false) {
338 break;
339 }
340 }
341 return hash_value;
342 }
343
344 // Calculates a hash value from a unit.
345 BaseType HashUnit(BaseType index) const {
346 BaseType hash_value = 0;
347 for ( ; index != 0; index = unit_pool_[index].sibling()) {
348 BaseType base = unit_pool_[index].base();
349 UCharType label = unit_pool_[index].label();
350 hash_value ^= Hash((label << 24) ^ base);
351 }
352 return hash_value;
353 }
354
355 // 32-bit mix function.
356 // http://www.concentric.net/~Ttwang/tech/inthash.htm
357 static BaseType Hash(BaseType key) {
358 key = ~key + (key << 15); // key = (key << 15) - key - 1;
359 key = key ^ (key >> 12);
360 key = key + (key << 2);
361 key = key ^ (key >> 4);
362 key = key * 2057; // key = (key + (key << 3)) + (key << 11);
363 key = key ^ (key >> 16);
364 return key;
365 }
366
367 // Gets a transition from object pools.
368 BaseType AllocateTransition() {
369 flag_pool_.Allocate();
370 base_pool_.Allocate();
371 return static_cast<BaseType>(label_pool_.Allocate());
372 }
373
374 // Gets a unit from an object pool.
375 BaseType AllocateUnit() {
376 BaseType index = 0;
377 if (unused_units_.empty()) {
378 index = static_cast<BaseType>(unit_pool_.Allocate());
379 } else {
380 index = unused_units_.top();
381 unused_units_.pop();
382 }
383 unit_pool_[index].Clear();
384 return index;
385 }
386
387 // Returns a unit to an object pool.
388 void FreeUnit(BaseType index) {
389 unused_units_.push(index);
390 }
391};
392
393} // namespace dawgdic
394
395#endif // DAWGDIC_DAWG_BUILDER_H
396