1 | #ifndef DAWGDIC_DICTIONARY_BUILDER_H |
2 | #define DAWGDIC_DICTIONARY_BUILDER_H |
3 | |
4 | #include <vector> |
5 | |
6 | #include "dawg.h" |
7 | #include "dictionary.h" |
8 | #include "dictionary-extra-unit.h" |
9 | #include "link-table.h" |
10 | |
11 | namespace dawgdic { |
12 | |
13 | class DictionaryBuilder { |
14 | public: |
15 | enum { |
16 | // Number of units in a block. |
17 | BLOCK_SIZE = 256, |
18 | // Number of blocks kept unfixed. |
19 | NUM_OF_UNFIXED_BLOCKS = 16, |
20 | // Number of units kept unfixed. |
21 | UNFIXED_SIZE = BLOCK_SIZE * NUM_OF_UNFIXED_BLOCKS |
22 | }; |
23 | |
24 | // Builds a dictionary from a list-form dawg. |
25 | static bool Build(const Dawg &dawg, Dictionary *dic, |
26 | BaseType *num_of_unused_units = NULL) { |
27 | DictionaryBuilder builder(dawg, dic); |
28 | if (!builder.BuildDictionary()) { |
29 | return false; |
30 | } |
31 | if (num_of_unused_units != NULL) { |
32 | *num_of_unused_units = builder.num_of_unused_units_; |
33 | } |
34 | return true; |
35 | } |
36 | |
37 | private: |
38 | const Dawg &dawg_; |
39 | Dictionary *dic_; |
40 | |
41 | std::vector<DictionaryUnit> units_; |
42 | std::vector<DictionaryExtraUnit *> ; |
43 | std::vector<UCharType> labels_; |
44 | LinkTable link_table_; |
45 | BaseType unfixed_index_; |
46 | BaseType num_of_unused_units_; |
47 | |
48 | // Masks for offsets. |
49 | static const BaseType UPPER_MASK = ~(DictionaryUnit::OFFSET_MAX - 1); |
50 | static const BaseType LOWER_MASK = 0xFF; |
51 | |
52 | // Disallows copies. |
53 | DictionaryBuilder(const DictionaryBuilder &); |
54 | DictionaryBuilder &operator=(const DictionaryBuilder &); |
55 | |
56 | DictionaryBuilder(const Dawg &dawg, Dictionary *dic) |
57 | : dawg_(dawg), dic_(dic), units_(), extras_(), labels_(), |
58 | link_table_(), unfixed_index_(), num_of_unused_units_(0) {} |
59 | ~DictionaryBuilder() { |
60 | for (SizeType i = 0; i < extras_.size(); ++i) { |
61 | delete [] extras_[i]; |
62 | } |
63 | } |
64 | |
65 | // Accesses units. |
66 | DictionaryUnit &units(BaseType index) { |
67 | return units_[index]; |
68 | } |
69 | const DictionaryUnit &units(BaseType index) const { |
70 | return units_[index]; |
71 | } |
72 | DictionaryExtraUnit &(BaseType index) { |
73 | return extras_[index / BLOCK_SIZE][index % BLOCK_SIZE]; |
74 | } |
75 | const DictionaryExtraUnit &(BaseType index) const { |
76 | return extras_[index / BLOCK_SIZE][index % BLOCK_SIZE]; |
77 | } |
78 | |
79 | // Number of units. |
80 | BaseType num_of_units() const { |
81 | return static_cast<BaseType>(units_.size()); |
82 | } |
83 | // Number of blocks. |
84 | BaseType num_of_blocks() const { |
85 | return static_cast<BaseType>(extras_.size()); |
86 | } |
87 | |
88 | // Builds a dictionary from a list-form dawg. |
89 | bool BuildDictionary() { |
90 | link_table_.Init(dawg_.num_of_merging_states() + |
91 | (dawg_.num_of_merging_states() >> 1)); |
92 | |
93 | ReserveUnit(0); |
94 | extras(0).set_is_used(); |
95 | units(0).set_offset(1); |
96 | units(0).set_label('\0'); |
97 | |
98 | if (dawg_.size() > 1) { |
99 | if (!BuildDictionary(dawg_.root(), 0)) { |
100 | return false; |
101 | } |
102 | } |
103 | |
104 | FixAllBlocks(); |
105 | |
106 | dic_->SwapUnitsBuf(&units_); |
107 | return true; |
108 | } |
109 | |
110 | // Builds a dictionary from a dawg. |
111 | bool BuildDictionary(BaseType dawg_index, BaseType dic_index) { |
112 | if (dawg_.is_leaf(dawg_index)) { |
113 | return true; |
114 | } |
115 | |
116 | // Uses an existing offset if available. |
117 | BaseType dawg_child_index = dawg_.child(dawg_index); |
118 | if (dawg_.is_merging(dawg_child_index)) { |
119 | BaseType offset = link_table_.Find(dawg_child_index); |
120 | if (offset != 0) { |
121 | offset ^= dic_index; |
122 | if (!(offset & UPPER_MASK) || !(offset & LOWER_MASK)) { |
123 | if (dawg_.is_leaf(dawg_child_index)) { |
124 | units(dic_index).set_has_leaf(); |
125 | } |
126 | units(dic_index).set_offset(offset); |
127 | return true; |
128 | } |
129 | } |
130 | } |
131 | |
132 | // Finds a good offset and arranges child nodes. |
133 | BaseType offset = ArrangeChildNodes(dawg_index, dic_index); |
134 | if (offset == 0) { |
135 | return false; |
136 | } |
137 | |
138 | if (dawg_.is_merging(dawg_child_index)) |
139 | link_table_.Insert(dawg_child_index, offset); { |
140 | } |
141 | |
142 | // Builds a double-array in depth-first order. |
143 | do { |
144 | BaseType dic_child_index = offset ^ dawg_.label(dawg_child_index); |
145 | if (!BuildDictionary(dawg_child_index, dic_child_index)) { |
146 | return false; |
147 | } |
148 | dawg_child_index = dawg_.sibling(dawg_child_index); |
149 | } while (dawg_child_index != 0); |
150 | |
151 | return true; |
152 | } |
153 | |
154 | // Arranges child nodes. |
155 | BaseType ArrangeChildNodes(BaseType dawg_index, BaseType dic_index) { |
156 | labels_.clear(); |
157 | |
158 | BaseType dawg_child_index = dawg_.child(dawg_index); |
159 | while (dawg_child_index != 0) { |
160 | labels_.push_back(dawg_.label(dawg_child_index)); |
161 | dawg_child_index = dawg_.sibling(dawg_child_index); |
162 | } |
163 | |
164 | // Finds a good offset. |
165 | BaseType offset = FindGoodOffset(dic_index); |
166 | if (!units(dic_index).set_offset(dic_index ^ offset)) { |
167 | return 0; |
168 | } |
169 | |
170 | dawg_child_index = dawg_.child(dawg_index); |
171 | for (SizeType i = 0; i < labels_.size(); ++i) { |
172 | BaseType dic_child_index = offset ^ labels_[i]; |
173 | ReserveUnit(dic_child_index); |
174 | |
175 | if (dawg_.is_leaf(dawg_child_index)) { |
176 | units(dic_index).set_has_leaf(); |
177 | units(dic_child_index).set_value(dawg_.value(dawg_child_index)); |
178 | } else { |
179 | units(dic_child_index).set_label(labels_[i]); |
180 | } |
181 | |
182 | dawg_child_index = dawg_.sibling(dawg_child_index); |
183 | } |
184 | extras(offset).set_is_used(); |
185 | |
186 | return offset; |
187 | } |
188 | |
189 | // Finds a good offset. |
190 | BaseType FindGoodOffset(BaseType index) const { |
191 | if (unfixed_index_ >= num_of_units()) { |
192 | return num_of_units() | (index & 0xFF); |
193 | } |
194 | |
195 | // Scans unused units to find a good offset. |
196 | BaseType unfixed_index = unfixed_index_; |
197 | do { |
198 | BaseType offset = unfixed_index ^ labels_[0]; |
199 | if (IsGoodOffset(index, offset)) { |
200 | return offset; |
201 | } |
202 | unfixed_index = extras(unfixed_index).next(); |
203 | } while (unfixed_index != unfixed_index_); |
204 | |
205 | return num_of_units() | (index & 0xFF); |
206 | } |
207 | |
208 | // Checks if a given offset is valid or not. |
209 | bool IsGoodOffset(BaseType index, BaseType offset) const { |
210 | if (extras(offset).is_used()) { |
211 | return false; |
212 | } |
213 | |
214 | BaseType relative_offset = index ^ offset; |
215 | if ((relative_offset & LOWER_MASK) && (relative_offset & UPPER_MASK)) { |
216 | return false; |
217 | } |
218 | |
219 | // Finds a collision. |
220 | for (SizeType i = 1; i < labels_.size(); ++i) { |
221 | if (extras(offset ^ labels_[i]).is_fixed()) { |
222 | return false; |
223 | } |
224 | } |
225 | |
226 | return true; |
227 | } |
228 | |
229 | // Reserves an unused unit. |
230 | void ReserveUnit(BaseType index) { |
231 | if (index >= num_of_units()) { |
232 | ExpandDictionary(); |
233 | } |
234 | |
235 | // Removes an unused unit from a circular linked list. |
236 | if (index == unfixed_index_) { |
237 | unfixed_index_ = extras(index).next(); |
238 | if (unfixed_index_ == index) { |
239 | unfixed_index_ = num_of_units(); |
240 | } |
241 | } |
242 | extras(extras(index).prev()).set_next(extras(index).next()); |
243 | extras(extras(index).next()).set_prev(extras(index).prev()); |
244 | extras(index).set_is_fixed(); |
245 | } |
246 | |
247 | // Expands a dictionary. |
248 | void ExpandDictionary() { |
249 | BaseType src_num_of_units = num_of_units(); |
250 | BaseType src_num_of_blocks = num_of_blocks(); |
251 | |
252 | BaseType dest_num_of_units = src_num_of_units + BLOCK_SIZE; |
253 | BaseType dest_num_of_blocks = src_num_of_blocks + 1; |
254 | |
255 | // Fixes an old block. |
256 | if (dest_num_of_blocks > NUM_OF_UNFIXED_BLOCKS) { |
257 | FixBlock(src_num_of_blocks - NUM_OF_UNFIXED_BLOCKS); |
258 | } |
259 | |
260 | units_.resize(dest_num_of_units); |
261 | extras_.resize(dest_num_of_blocks, 0); |
262 | |
263 | // Allocates memory to a new block. |
264 | if (dest_num_of_blocks > NUM_OF_UNFIXED_BLOCKS) { |
265 | BaseType block_id = src_num_of_blocks - NUM_OF_UNFIXED_BLOCKS; |
266 | std::swap(extras_[block_id], extras_.back()); |
267 | for (BaseType i = src_num_of_units; i < dest_num_of_units; ++i) { |
268 | extras(i).clear(); |
269 | } |
270 | } else { |
271 | extras_.back() = new DictionaryExtraUnit[BLOCK_SIZE]; |
272 | } |
273 | |
274 | // Creates a circular linked list for a new block. |
275 | for (BaseType i = src_num_of_units + 1; i < dest_num_of_units; ++i) { |
276 | extras(i - 1).set_next(i); |
277 | extras(i).set_prev(i - 1); |
278 | } |
279 | |
280 | extras(src_num_of_units).set_prev(dest_num_of_units - 1); |
281 | extras(dest_num_of_units - 1).set_next(src_num_of_units); |
282 | |
283 | // Merges 2 circular linked lists. |
284 | extras(src_num_of_units).set_prev(extras(unfixed_index_).prev()); |
285 | extras(dest_num_of_units - 1).set_next(unfixed_index_); |
286 | |
287 | extras(extras(unfixed_index_).prev()).set_next(src_num_of_units); |
288 | extras(unfixed_index_).set_prev(dest_num_of_units - 1); |
289 | } |
290 | |
291 | // Fixes all blocks to avoid invalid transitions. |
292 | void FixAllBlocks() { |
293 | BaseType begin = 0; |
294 | if (num_of_blocks() > NUM_OF_UNFIXED_BLOCKS) { |
295 | begin = num_of_blocks() - NUM_OF_UNFIXED_BLOCKS; |
296 | } |
297 | BaseType end = num_of_blocks(); |
298 | |
299 | for (BaseType block_id = begin; block_id != end; ++block_id) { |
300 | FixBlock(block_id); |
301 | } |
302 | } |
303 | |
304 | // Adjusts labels of unused units in a given block. |
305 | void FixBlock(BaseType block_id) { |
306 | BaseType begin = block_id * BLOCK_SIZE; |
307 | BaseType end = begin + BLOCK_SIZE; |
308 | |
309 | // Finds an unused offset. |
310 | BaseType unused_offset_for_label = 0; |
311 | for (BaseType offset = begin; offset != end; ++offset) { |
312 | if (!extras(offset).is_used()) { |
313 | unused_offset_for_label = offset; |
314 | break; |
315 | } |
316 | } |
317 | |
318 | // Labels of unused units are modified. |
319 | for (BaseType index = begin; index != end; ++index) { |
320 | if (!extras(index).is_fixed()) { |
321 | ReserveUnit(index); |
322 | units(index).set_label( |
323 | static_cast<UCharType>(index ^ unused_offset_for_label)); |
324 | ++num_of_unused_units_; |
325 | } |
326 | } |
327 | } |
328 | }; |
329 | |
330 | } // namespace dawgdic |
331 | |
332 | #endif // DAWGDIC_DICTIONARY_BUILDER_H |
333 | |