1 | #ifndef DAWGDIC_DAWG_BUILDER_H |
2 | #define DAWGDIC_DAWG_BUILDER_H |
3 | |
4 | #include <algorithm> |
5 | #include <stack> |
6 | #include <vector> |
7 | |
8 | #include "dawg.h" |
9 | #include "dawg-unit.h" |
10 | |
11 | namespace dawgdic { |
12 | |
13 | // DAWG builder. |
14 | class DawgBuilder { |
15 | public: |
16 | explicit DawgBuilder(SizeType initial_hash_table_size = |
17 | DEFAULT_INITIAL_HASH_TABLE_SIZE) |
18 | : initial_hash_table_size_(initial_hash_table_size), |
19 | base_pool_(), label_pool_(), flag_pool_(), unit_pool_(), |
20 | hash_table_(), unfixed_units_(), unused_units_(), num_of_states_(1), |
21 | num_of_merged_transitions_(0), num_of_merging_states_(0) {} |
22 | |
23 | // Number of units. |
24 | SizeType size() const { |
25 | return base_pool_.size(); |
26 | } |
27 | // Number of transitions. |
28 | SizeType num_of_transitions() const { |
29 | return base_pool_.size() - 1; |
30 | } |
31 | // Number of states. |
32 | SizeType num_of_states() const { |
33 | return num_of_states_; |
34 | } |
35 | // Number of merged transitions. |
36 | SizeType num_of_merged_transitions() const { |
37 | return num_of_merged_transitions_; |
38 | } |
39 | // Number of merged states. |
40 | SizeType num_of_merged_states() const { |
41 | return num_of_transitions() |
42 | + num_of_merged_transitions() + 1 - num_of_states(); |
43 | } |
44 | // Number of merging states. |
45 | SizeType num_of_merging_states() const { |
46 | return num_of_merging_states_; |
47 | } |
48 | |
49 | // Initializes a builder. |
50 | void Clear() { |
51 | base_pool_.Clear(); |
52 | label_pool_.Clear(); |
53 | flag_pool_.Clear(); |
54 | unit_pool_.Clear(); |
55 | |
56 | std::vector<BaseType>(0).swap(hash_table_); |
57 | while (!unfixed_units_.empty()) { |
58 | unfixed_units_.pop(); |
59 | } |
60 | while (!unused_units_.empty()) { |
61 | unused_units_.pop(); |
62 | } |
63 | |
64 | num_of_states_ = 1; |
65 | num_of_merged_transitions_ = 0; |
66 | num_of_merging_states_ = 0; |
67 | } |
68 | |
69 | // Inserts a key. |
70 | bool Insert(const CharType *key, ValueType value = 0) { |
71 | if (key == NULL || *key == '\0' || value < 0) { |
72 | return false; |
73 | } |
74 | SizeType length = 1; |
75 | while (key[length]) { |
76 | ++length; |
77 | } |
78 | return InsertKey(key, length, value); |
79 | } |
80 | |
81 | // Inserts a key. |
82 | bool Insert(const CharType *key, SizeType length, ValueType value) { |
83 | if (key == NULL || length <= 0 || value < 0) { |
84 | return false; |
85 | } |
86 | for (SizeType i = 0; i < length; ++i) { |
87 | if (key[i] == '\0') { |
88 | return false; |
89 | } |
90 | } |
91 | return InsertKey(key, length, value); |
92 | } |
93 | |
94 | // Finishes building a dawg. |
95 | bool Finish(Dawg *dawg) { |
96 | // Initializes a builder if not initialized. |
97 | if (hash_table_.empty()) { |
98 | Init(); |
99 | } |
100 | |
101 | FixUnits(0); |
102 | base_pool_[0].set_base(unit_pool_[0].base()); |
103 | label_pool_[0] = unit_pool_[0].label(); |
104 | |
105 | dawg->set_num_of_states(num_of_states_); |
106 | dawg->set_num_of_merged_transitions(num_of_merged_transitions_); |
107 | dawg->set_num_of_merged_states(num_of_merged_states()); |
108 | dawg->set_num_of_merging_states(num_of_merging_states_); |
109 | |
110 | dawg->SwapBasePool(&base_pool_); |
111 | dawg->SwapLabelPool(&label_pool_); |
112 | dawg->SwapFlagPool(&flag_pool_); |
113 | |
114 | Clear(); |
115 | return true; |
116 | } |
117 | |
118 | private: |
119 | enum { |
120 | DEFAULT_INITIAL_HASH_TABLE_SIZE = 1 << 8 |
121 | }; |
122 | |
123 | const SizeType initial_hash_table_size_; |
124 | ObjectPool<BaseUnit> base_pool_; |
125 | ObjectPool<UCharType> label_pool_; |
126 | BitPool<> flag_pool_; |
127 | ObjectPool<DawgUnit> unit_pool_; |
128 | std::vector<BaseType> hash_table_; |
129 | std::stack<BaseType> unfixed_units_; |
130 | std::stack<BaseType> unused_units_; |
131 | SizeType num_of_states_; |
132 | SizeType num_of_merged_transitions_; |
133 | SizeType num_of_merging_states_; |
134 | |
135 | // Disallows copies. |
136 | DawgBuilder(const DawgBuilder &); |
137 | DawgBuilder &operator=(const DawgBuilder &); |
138 | |
139 | // Inserts a key. |
140 | bool InsertKey(const CharType *key, SizeType length, ValueType value) { |
141 | // Initializes a builder if not initialized. |
142 | if (hash_table_.empty()) { |
143 | Init(); |
144 | } |
145 | |
146 | BaseType index = 0; |
147 | SizeType key_pos = 0; |
148 | |
149 | // Finds a separate unit. |
150 | for ( ; key_pos <= length; ++key_pos) { |
151 | BaseType child_index = unit_pool_[index].child(); |
152 | if (!child_index) { |
153 | break; |
154 | } |
155 | |
156 | UCharType key_label = static_cast<UCharType>( |
157 | (key_pos < length) ? key[key_pos] : '\0'); |
158 | UCharType unit_label = unit_pool_[child_index].label(); |
159 | |
160 | // Checks the order of keys. |
161 | if (key_label < unit_label) { |
162 | return false; |
163 | } else if (key_label > unit_label) { |
164 | unit_pool_[child_index].set_has_sibling(true); |
165 | FixUnits(child_index); |
166 | break; |
167 | } |
168 | |
169 | index = child_index; |
170 | } |
171 | |
172 | // Adds new units. |
173 | for ( ; key_pos <= length; ++key_pos) { |
174 | UCharType key_label = static_cast<UCharType>( |
175 | (key_pos < length) ? key[key_pos] : '\0'); |
176 | BaseType child_index = AllocateUnit(); |
177 | |
178 | if (!unit_pool_[index].child()) { |
179 | unit_pool_[child_index].set_is_state(true); |
180 | } |
181 | unit_pool_[child_index].set_sibling(unit_pool_[index].child()); |
182 | unit_pool_[child_index].set_label(key_label); |
183 | unit_pool_[index].set_child(child_index); |
184 | unfixed_units_.push(child_index); |
185 | |
186 | index = child_index; |
187 | } |
188 | unit_pool_[index].set_value(value); |
189 | return true; |
190 | } |
191 | |
192 | // Initializes an object. |
193 | void Init() { |
194 | hash_table_.resize(initial_hash_table_size_, 0); |
195 | AllocateUnit(); |
196 | AllocateTransition(); |
197 | unit_pool_[0].set_label(0xFF); |
198 | unfixed_units_.push(0); |
199 | } |
200 | |
201 | // Fixes units corresponding to the last inserted key. |
202 | // Also, some of units are merged into equivalent transitions. |
203 | void FixUnits(BaseType index) { |
204 | while (unfixed_units_.top() != index) { |
205 | BaseType unfixed_index = unfixed_units_.top(); |
206 | unfixed_units_.pop(); |
207 | |
208 | if (num_of_states_ >= hash_table_.size() - (hash_table_.size() >> 2)) { |
209 | ExpandHashTable(); |
210 | } |
211 | |
212 | BaseType num_of_siblings = 0; |
213 | for (BaseType i = unfixed_index; i != 0; i = unit_pool_[i].sibling()) { |
214 | ++num_of_siblings; |
215 | } |
216 | |
217 | BaseType hash_id; |
218 | BaseType matched_index = FindUnit(unfixed_index, &hash_id); |
219 | if (matched_index != 0) { |
220 | num_of_merged_transitions_ += num_of_siblings; |
221 | |
222 | // Records a merging state. |
223 | if (flag_pool_.get(matched_index) == false) { |
224 | ++num_of_merging_states_; |
225 | flag_pool_.set(matched_index, true); |
226 | } |
227 | } else { |
228 | // Fixes units into pairs of base values and labels. |
229 | BaseType transition_index = 0; |
230 | for (BaseType i = 0; i < num_of_siblings; ++i) { |
231 | transition_index = AllocateTransition(); |
232 | } |
233 | for (BaseType i = unfixed_index; i != 0; i = unit_pool_[i].sibling()) { |
234 | base_pool_[transition_index].set_base(unit_pool_[i].base()); |
235 | label_pool_[transition_index] = unit_pool_[i].label(); |
236 | --transition_index; |
237 | } |
238 | matched_index = transition_index + 1; |
239 | hash_table_[hash_id] = matched_index; |
240 | ++num_of_states_; |
241 | } |
242 | |
243 | // Deletes fixed units. |
244 | for (BaseType current = unfixed_index, next; |
245 | current != 0; current = next) { |
246 | next = unit_pool_[current].sibling(); |
247 | FreeUnit(current); |
248 | } |
249 | |
250 | unit_pool_[unfixed_units_.top()].set_child(matched_index); |
251 | } |
252 | unfixed_units_.pop(); |
253 | } |
254 | |
255 | // Expands a hash table. |
256 | void ExpandHashTable() { |
257 | SizeType hash_table_size = hash_table_.size() << 1; |
258 | std::vector<BaseType>(0).swap(hash_table_); |
259 | hash_table_.resize(hash_table_size, 0); |
260 | |
261 | // Builds a new hash table. |
262 | BaseType count = 0; |
263 | for (SizeType i = 1; i < base_pool_.size(); ++i) { |
264 | BaseType index = static_cast<BaseType>(i); |
265 | if (label_pool_[index] == '\0' || base_pool_[index].is_state()) { |
266 | BaseType hash_id; |
267 | FindTransition(index, &hash_id); |
268 | hash_table_[hash_id] = index; |
269 | ++count; |
270 | } |
271 | } |
272 | } |
273 | |
274 | // Finds a transition from a hash table. |
275 | BaseType FindTransition(BaseType index, BaseType *hash_id) const { |
276 | *hash_id = HashTransition(index) % hash_table_.size(); |
277 | for ( ; ; *hash_id = (*hash_id + 1) % hash_table_.size()) { |
278 | BaseType transition_id = hash_table_[*hash_id]; |
279 | if (transition_id == 0) { |
280 | break; |
281 | } |
282 | |
283 | // There must not be the same base value. |
284 | } |
285 | return 0; |
286 | } |
287 | |
288 | // Finds a unit from a hash table. |
289 | BaseType FindUnit(BaseType unit_index, BaseType *hash_id) const { |
290 | *hash_id = HashUnit(unit_index) % hash_table_.size(); |
291 | for ( ; ; *hash_id = (*hash_id + 1) % hash_table_.size()) { |
292 | BaseType transition_id = hash_table_[*hash_id]; |
293 | if (transition_id == 0) { |
294 | break; |
295 | } |
296 | |
297 | if (AreEqual(unit_index, transition_id)) { |
298 | return transition_id; |
299 | } |
300 | } |
301 | return 0; |
302 | } |
303 | |
304 | // Compares a unit and a transition. |
305 | bool AreEqual(BaseType unit_index, BaseType transition_index) const { |
306 | // Compares the numbers of transitions. |
307 | for (BaseType i = unit_pool_[unit_index].sibling(); i != 0; |
308 | i = unit_pool_[i].sibling()) { |
309 | if (base_pool_[transition_index].has_sibling() == false) { |
310 | return false; |
311 | } |
312 | ++transition_index; |
313 | } |
314 | if (base_pool_[transition_index].has_sibling() == true) { |
315 | return false; |
316 | } |
317 | |
318 | // Compares out-transitions. |
319 | for (BaseType i = unit_index; i; |
320 | i = unit_pool_[i].sibling(), --transition_index) { |
321 | if (unit_pool_[i].base() != base_pool_[transition_index].base() || |
322 | unit_pool_[i].label() != label_pool_[transition_index]) { |
323 | return false; |
324 | } |
325 | } |
326 | return true; |
327 | } |
328 | |
329 | // Calculates a hash value from a transition. |
330 | BaseType HashTransition(BaseType index) const { |
331 | BaseType hash_value = 0; |
332 | for ( ; index != 0; ++index) { |
333 | BaseType base = base_pool_[index].base(); |
334 | UCharType label = label_pool_[index]; |
335 | hash_value ^= Hash((label << 24) ^ base); |
336 | |
337 | if (base_pool_[index].has_sibling() == false) { |
338 | break; |
339 | } |
340 | } |
341 | return hash_value; |
342 | } |
343 | |
344 | // Calculates a hash value from a unit. |
345 | BaseType HashUnit(BaseType index) const { |
346 | BaseType hash_value = 0; |
347 | for ( ; index != 0; index = unit_pool_[index].sibling()) { |
348 | BaseType base = unit_pool_[index].base(); |
349 | UCharType label = unit_pool_[index].label(); |
350 | hash_value ^= Hash((label << 24) ^ base); |
351 | } |
352 | return hash_value; |
353 | } |
354 | |
355 | // 32-bit mix function. |
356 | // http://www.concentric.net/~Ttwang/tech/inthash.htm |
357 | static BaseType Hash(BaseType key) { |
358 | key = ~key + (key << 15); // key = (key << 15) - key - 1; |
359 | key = key ^ (key >> 12); |
360 | key = key + (key << 2); |
361 | key = key ^ (key >> 4); |
362 | key = key * 2057; // key = (key + (key << 3)) + (key << 11); |
363 | key = key ^ (key >> 16); |
364 | return key; |
365 | } |
366 | |
367 | // Gets a transition from object pools. |
368 | BaseType AllocateTransition() { |
369 | flag_pool_.Allocate(); |
370 | base_pool_.Allocate(); |
371 | return static_cast<BaseType>(label_pool_.Allocate()); |
372 | } |
373 | |
374 | // Gets a unit from an object pool. |
375 | BaseType AllocateUnit() { |
376 | BaseType index = 0; |
377 | if (unused_units_.empty()) { |
378 | index = static_cast<BaseType>(unit_pool_.Allocate()); |
379 | } else { |
380 | index = unused_units_.top(); |
381 | unused_units_.pop(); |
382 | } |
383 | unit_pool_[index].Clear(); |
384 | return index; |
385 | } |
386 | |
387 | // Returns a unit to an object pool. |
388 | void FreeUnit(BaseType index) { |
389 | unused_units_.push(index); |
390 | } |
391 | }; |
392 | |
393 | } // namespace dawgdic |
394 | |
395 | #endif // DAWGDIC_DAWG_BUILDER_H |
396 | |