1#ifndef DAWGDIC_DICTIONARY_BUILDER_H
2#define DAWGDIC_DICTIONARY_BUILDER_H
3
4#include <vector>
5
6#include "dawg.h"
7#include "dictionary.h"
8#include "dictionary-extra-unit.h"
9#include "link-table.h"
10
11namespace dawgdic {
12
13class DictionaryBuilder {
14 public:
15 enum {
16 // Number of units in a block.
17 BLOCK_SIZE = 256,
18 // Number of blocks kept unfixed.
19 NUM_OF_UNFIXED_BLOCKS = 16,
20 // Number of units kept unfixed.
21 UNFIXED_SIZE = BLOCK_SIZE * NUM_OF_UNFIXED_BLOCKS
22 };
23
24 // Builds a dictionary from a list-form dawg.
25 static bool Build(const Dawg &dawg, Dictionary *dic,
26 BaseType *num_of_unused_units = NULL) {
27 DictionaryBuilder builder(dawg, dic);
28 if (!builder.BuildDictionary()) {
29 return false;
30 }
31 if (num_of_unused_units != NULL) {
32 *num_of_unused_units = builder.num_of_unused_units_;
33 }
34 return true;
35 }
36
37 private:
38 const Dawg &dawg_;
39 Dictionary *dic_;
40
41 std::vector<DictionaryUnit> units_;
42 std::vector<DictionaryExtraUnit *> extras_;
43 std::vector<UCharType> labels_;
44 LinkTable link_table_;
45 BaseType unfixed_index_;
46 BaseType num_of_unused_units_;
47
48 // Masks for offsets.
49 static const BaseType UPPER_MASK = ~(DictionaryUnit::OFFSET_MAX - 1);
50 static const BaseType LOWER_MASK = 0xFF;
51
52 // Disallows copies.
53 DictionaryBuilder(const DictionaryBuilder &);
54 DictionaryBuilder &operator=(const DictionaryBuilder &);
55
56 DictionaryBuilder(const Dawg &dawg, Dictionary *dic)
57 : dawg_(dawg), dic_(dic), units_(), extras_(), labels_(),
58 link_table_(), unfixed_index_(), num_of_unused_units_(0) {}
59 ~DictionaryBuilder() {
60 for (SizeType i = 0; i < extras_.size(); ++i) {
61 delete [] extras_[i];
62 }
63 }
64
65 // Accesses units.
66 DictionaryUnit &units(BaseType index) {
67 return units_[index];
68 }
69 const DictionaryUnit &units(BaseType index) const {
70 return units_[index];
71 }
72 DictionaryExtraUnit &extras(BaseType index) {
73 return extras_[index / BLOCK_SIZE][index % BLOCK_SIZE];
74 }
75 const DictionaryExtraUnit &extras(BaseType index) const {
76 return extras_[index / BLOCK_SIZE][index % BLOCK_SIZE];
77 }
78
79 // Number of units.
80 BaseType num_of_units() const {
81 return static_cast<BaseType>(units_.size());
82 }
83 // Number of blocks.
84 BaseType num_of_blocks() const {
85 return static_cast<BaseType>(extras_.size());
86 }
87
88 // Builds a dictionary from a list-form dawg.
89 bool BuildDictionary() {
90 link_table_.Init(dawg_.num_of_merging_states() +
91 (dawg_.num_of_merging_states() >> 1));
92
93 ReserveUnit(0);
94 extras(0).set_is_used();
95 units(0).set_offset(1);
96 units(0).set_label('\0');
97
98 if (dawg_.size() > 1) {
99 if (!BuildDictionary(dawg_.root(), 0)) {
100 return false;
101 }
102 }
103
104 FixAllBlocks();
105
106 dic_->SwapUnitsBuf(&units_);
107 return true;
108 }
109
110 // Builds a dictionary from a dawg.
111 bool BuildDictionary(BaseType dawg_index, BaseType dic_index) {
112 if (dawg_.is_leaf(dawg_index)) {
113 return true;
114 }
115
116 // Uses an existing offset if available.
117 BaseType dawg_child_index = dawg_.child(dawg_index);
118 if (dawg_.is_merging(dawg_child_index)) {
119 BaseType offset = link_table_.Find(dawg_child_index);
120 if (offset != 0) {
121 offset ^= dic_index;
122 if (!(offset & UPPER_MASK) || !(offset & LOWER_MASK)) {
123 if (dawg_.is_leaf(dawg_child_index)) {
124 units(dic_index).set_has_leaf();
125 }
126 units(dic_index).set_offset(offset);
127 return true;
128 }
129 }
130 }
131
132 // Finds a good offset and arranges child nodes.
133 BaseType offset = ArrangeChildNodes(dawg_index, dic_index);
134 if (offset == 0) {
135 return false;
136 }
137
138 if (dawg_.is_merging(dawg_child_index))
139 link_table_.Insert(dawg_child_index, offset); {
140 }
141
142 // Builds a double-array in depth-first order.
143 do {
144 BaseType dic_child_index = offset ^ dawg_.label(dawg_child_index);
145 if (!BuildDictionary(dawg_child_index, dic_child_index)) {
146 return false;
147 }
148 dawg_child_index = dawg_.sibling(dawg_child_index);
149 } while (dawg_child_index != 0);
150
151 return true;
152 }
153
154 // Arranges child nodes.
155 BaseType ArrangeChildNodes(BaseType dawg_index, BaseType dic_index) {
156 labels_.clear();
157
158 BaseType dawg_child_index = dawg_.child(dawg_index);
159 while (dawg_child_index != 0) {
160 labels_.push_back(dawg_.label(dawg_child_index));
161 dawg_child_index = dawg_.sibling(dawg_child_index);
162 }
163
164 // Finds a good offset.
165 BaseType offset = FindGoodOffset(dic_index);
166 if (!units(dic_index).set_offset(dic_index ^ offset)) {
167 return 0;
168 }
169
170 dawg_child_index = dawg_.child(dawg_index);
171 for (SizeType i = 0; i < labels_.size(); ++i) {
172 BaseType dic_child_index = offset ^ labels_[i];
173 ReserveUnit(dic_child_index);
174
175 if (dawg_.is_leaf(dawg_child_index)) {
176 units(dic_index).set_has_leaf();
177 units(dic_child_index).set_value(dawg_.value(dawg_child_index));
178 } else {
179 units(dic_child_index).set_label(labels_[i]);
180 }
181
182 dawg_child_index = dawg_.sibling(dawg_child_index);
183 }
184 extras(offset).set_is_used();
185
186 return offset;
187 }
188
189 // Finds a good offset.
190 BaseType FindGoodOffset(BaseType index) const {
191 if (unfixed_index_ >= num_of_units()) {
192 return num_of_units() | (index & 0xFF);
193 }
194
195 // Scans unused units to find a good offset.
196 BaseType unfixed_index = unfixed_index_;
197 do {
198 BaseType offset = unfixed_index ^ labels_[0];
199 if (IsGoodOffset(index, offset)) {
200 return offset;
201 }
202 unfixed_index = extras(unfixed_index).next();
203 } while (unfixed_index != unfixed_index_);
204
205 return num_of_units() | (index & 0xFF);
206 }
207
208 // Checks if a given offset is valid or not.
209 bool IsGoodOffset(BaseType index, BaseType offset) const {
210 if (extras(offset).is_used()) {
211 return false;
212 }
213
214 BaseType relative_offset = index ^ offset;
215 if ((relative_offset & LOWER_MASK) && (relative_offset & UPPER_MASK)) {
216 return false;
217 }
218
219 // Finds a collision.
220 for (SizeType i = 1; i < labels_.size(); ++i) {
221 if (extras(offset ^ labels_[i]).is_fixed()) {
222 return false;
223 }
224 }
225
226 return true;
227 }
228
229 // Reserves an unused unit.
230 void ReserveUnit(BaseType index) {
231 if (index >= num_of_units()) {
232 ExpandDictionary();
233 }
234
235 // Removes an unused unit from a circular linked list.
236 if (index == unfixed_index_) {
237 unfixed_index_ = extras(index).next();
238 if (unfixed_index_ == index) {
239 unfixed_index_ = num_of_units();
240 }
241 }
242 extras(extras(index).prev()).set_next(extras(index).next());
243 extras(extras(index).next()).set_prev(extras(index).prev());
244 extras(index).set_is_fixed();
245 }
246
247 // Expands a dictionary.
248 void ExpandDictionary() {
249 BaseType src_num_of_units = num_of_units();
250 BaseType src_num_of_blocks = num_of_blocks();
251
252 BaseType dest_num_of_units = src_num_of_units + BLOCK_SIZE;
253 BaseType dest_num_of_blocks = src_num_of_blocks + 1;
254
255 // Fixes an old block.
256 if (dest_num_of_blocks > NUM_OF_UNFIXED_BLOCKS) {
257 FixBlock(src_num_of_blocks - NUM_OF_UNFIXED_BLOCKS);
258 }
259
260 units_.resize(dest_num_of_units);
261 extras_.resize(dest_num_of_blocks, 0);
262
263 // Allocates memory to a new block.
264 if (dest_num_of_blocks > NUM_OF_UNFIXED_BLOCKS) {
265 BaseType block_id = src_num_of_blocks - NUM_OF_UNFIXED_BLOCKS;
266 std::swap(extras_[block_id], extras_.back());
267 for (BaseType i = src_num_of_units; i < dest_num_of_units; ++i) {
268 extras(i).clear();
269 }
270 } else {
271 extras_.back() = new DictionaryExtraUnit[BLOCK_SIZE];
272 }
273
274 // Creates a circular linked list for a new block.
275 for (BaseType i = src_num_of_units + 1; i < dest_num_of_units; ++i) {
276 extras(i - 1).set_next(i);
277 extras(i).set_prev(i - 1);
278 }
279
280 extras(src_num_of_units).set_prev(dest_num_of_units - 1);
281 extras(dest_num_of_units - 1).set_next(src_num_of_units);
282
283 // Merges 2 circular linked lists.
284 extras(src_num_of_units).set_prev(extras(unfixed_index_).prev());
285 extras(dest_num_of_units - 1).set_next(unfixed_index_);
286
287 extras(extras(unfixed_index_).prev()).set_next(src_num_of_units);
288 extras(unfixed_index_).set_prev(dest_num_of_units - 1);
289 }
290
291 // Fixes all blocks to avoid invalid transitions.
292 void FixAllBlocks() {
293 BaseType begin = 0;
294 if (num_of_blocks() > NUM_OF_UNFIXED_BLOCKS) {
295 begin = num_of_blocks() - NUM_OF_UNFIXED_BLOCKS;
296 }
297 BaseType end = num_of_blocks();
298
299 for (BaseType block_id = begin; block_id != end; ++block_id) {
300 FixBlock(block_id);
301 }
302 }
303
304 // Adjusts labels of unused units in a given block.
305 void FixBlock(BaseType block_id) {
306 BaseType begin = block_id * BLOCK_SIZE;
307 BaseType end = begin + BLOCK_SIZE;
308
309 // Finds an unused offset.
310 BaseType unused_offset_for_label = 0;
311 for (BaseType offset = begin; offset != end; ++offset) {
312 if (!extras(offset).is_used()) {
313 unused_offset_for_label = offset;
314 break;
315 }
316 }
317
318 // Labels of unused units are modified.
319 for (BaseType index = begin; index != end; ++index) {
320 if (!extras(index).is_fixed()) {
321 ReserveUnit(index);
322 units(index).set_label(
323 static_cast<UCharType>(index ^ unused_offset_for_label));
324 ++num_of_unused_units_;
325 }
326 }
327 }
328};
329
330} // namespace dawgdic
331
332#endif // DAWGDIC_DICTIONARY_BUILDER_H
333