1 | #include <dawgdic/dawg-builder.h> |
2 | #include <dawgdic/dictionary-builder.h> |
3 | #include <dawgdic/guide-builder.h> |
4 | #include <dawgdic/ranked-guide-builder.h> |
5 | |
6 | #include <cstdlib> |
7 | #include <fstream> |
8 | #include <iostream> |
9 | #include <limits> |
10 | #include <string> |
11 | |
12 | namespace { |
13 | |
// Stores the settings given on the command line: four option flags and the
// names of the lexicon file and the dictionary file.
class CommandOptions {
 public:
  CommandOptions()
      : help_(false), tab_(false), guide_(false), ranked_(false),
        lexicon_file_name_(), dic_file_name_() {}

  // Accessors for the parsed settings.
  bool help() const { return help_; }
  bool tab() const { return tab_; }
  bool guide() const { return guide_; }
  bool ranked() const { return ranked_; }
  const std::string &lexicon_file_name() const { return lexicon_file_name_; }
  const std::string &dic_file_name() const { return dic_file_name_; }

  // Parses the arguments that follow argv[0].
  // An argument that starts with '-' and has at least one more character is
  // a bundle of option letters (h, t, g, r). Any other argument is taken as
  // the lexicon file name first and as the dictionary file name second.
  // File names that are still empty afterwards are set to "-".
  // Returns false for an unknown option letter or a third file name.
  bool Parse(int argc, char *argv[]) {
    for (int arg_id = 1; arg_id < argc; ++arg_id) {
      const char *arg = argv[arg_id];

      // A lone "-" is not an option; it is handled as a file name below.
      const bool is_option = (arg[0] == '-') && (arg[1] != '\0');
      if (is_option) {
        if (!ApplyFlags(arg + 1)) {
          return false;
        }
        continue;
      }

      // Positional arguments fill the file names in order.
      if (lexicon_file_name_.empty()) {
        lexicon_file_name_ = arg;
        continue;
      }
      if (dic_file_name_.empty()) {
        dic_file_name_ = arg;
        continue;
      }

      // Both file names are already given.
      return false;
    }

    // Falls back to "-" for file names that were not given.
    if (lexicon_file_name_.empty()) {
      lexicon_file_name_ = "-";
    }
    if (dic_file_name_.empty()) {
      dic_file_name_ = "-";
    }
    return true;
  }

  // Writes the usage message, followed by an empty line, to the given stream.
  static void ShowUsage(std::ostream *output) {
    static const char kUsage[] =
        "Usage: - [Options] [LexiconFile] [DicFile]\n"
        "\n"
        "Options:\n"
        " -h display this help and exit\n"
        " -t handle tab as separator\n"
        " -g build dictionary with guide\n"
        " -r build dictionary with ranked guide\n";
    *output << kUsage << std::endl;
  }

 private:
  bool help_;
  bool tab_;
  bool guide_;
  bool ranked_;
  std::string lexicon_file_name_;
  std::string dic_file_name_;

  // Turns on the flag for each letter in a null-terminated bundle of option
  // letters. Stops and returns false at the first unknown letter; flags set
  // by earlier letters stay set.
  bool ApplyFlags(const char *flags) {
    for (const char *flag = flags; *flag != '\0'; ++flag) {
      if (*flag == 'h') {
        help_ = true;
      } else if (*flag == 't') {
        tab_ = true;
      } else if (*flag == 'g') {
        guide_ = true;
      } else if (*flag == 'r') {
        ranked_ = true;
      } else {
        // Invalid option.
        return false;
      }
    }
    return true;
  }

  // Disallows copies.
  CommandOptions(const CommandOptions &);
  CommandOptions &operator=(const CommandOptions &);
};
111 | |
112 | // Builds a dawg from a sorted lexicon. |
113 | bool BuildDawg(std::istream *lexicon_stream, |
114 | dawgdic::Dawg *dawg, bool tab_on) { |
115 | dawgdic::DawgBuilder dawg_builder; |
116 | |
117 | // Reads keys from an input stream and inserts them into a dawg. |
118 | std::string key; |
119 | std::size_t key_count = 0; |
120 | while (std::getline(*lexicon_stream, key)) { |
121 | std::string::size_type delim_pos = std::string::npos; |
122 | if (tab_on) { |
123 | delim_pos = key.find_first_of('\t'); |
124 | } |
125 | |
126 | if (delim_pos == std::string::npos) { |
127 | if (!dawg_builder.Insert(key.c_str())) { |
128 | std::cerr << "error: failed to insert key: " |
129 | << key << std::endl; |
130 | return false; |
131 | } |
132 | } else { |
133 | static const dawgdic::ValueType MAX_VALUE = |
134 | std::numeric_limits<dawgdic::ValueType>::max(); |
135 | |
136 | // Fixes an invalid record value. |
137 | long long record = std::strtoll(key.c_str() + delim_pos + 1, NULL, 10); |
138 | dawgdic::ValueType value = |
139 | static_cast<dawgdic::ValueType>(record); |
140 | if (record < 0) { |
141 | std::cerr << "warning: negative value is replaced by 0: " |
142 | << record << std::endl; |
143 | value = 0; |
144 | } else if (record > MAX_VALUE) { |
145 | std::cerr << "warning: too large value is replaced by " |
146 | << MAX_VALUE << ": " << record << std::endl; |
147 | value = MAX_VALUE; |
148 | } |
149 | |
150 | if (!dawg_builder.Insert(key.c_str(), delim_pos, value)) { |
151 | std::cerr << "error: failed to insert key: " |
152 | << key << std::endl; |
153 | return false; |
154 | } |
155 | } |
156 | |
157 | if (++key_count % 10000 == 0) { |
158 | std::cerr << "no. keys: " << key_count << '\r'; |
159 | } |
160 | } |
161 | |
162 | dawg_builder.Finish(dawg); |
163 | |
164 | std::cerr << "no. keys: " << key_count << std::endl; |
165 | std::cerr << "no. states: " |
166 | << dawg->num_of_states() << std::endl; |
167 | std::cerr << "no. transitions: " |
168 | << dawg->num_of_transitions() << std::endl; |
169 | std::cerr << "no. merged states: " |
170 | << dawg->num_of_merged_states() << std::endl; |
171 | std::cerr << "no. merging states: " |
172 | << dawg->num_of_merging_states() << std::endl; |
173 | std::cerr << "no. merged transitions: " |
174 | << dawg->num_of_merged_transitions() << std::endl; |
175 | |
176 | return true; |
177 | } |
178 | |
179 | // Builds a dictionary from a dawg. |
180 | bool BuildDictionary(const dawgdic::Dawg &dawg, dawgdic::Dictionary *dic) { |
181 | dawgdic::BaseType num_of_unused_units = 0; |
182 | if (!dawgdic::DictionaryBuilder::Build(dawg, dic, &num_of_unused_units)) { |
183 | std::cerr << "error: failed to build Dictionary" << std::endl; |
184 | return false; |
185 | } |
186 | double unused_ratio = 100.0 * num_of_unused_units / dic->size(); |
187 | |
188 | std::cerr << "no. elements: " << dic->size() << std::endl; |
189 | std::cerr << "no. unused elements: " << num_of_unused_units |
190 | << " (" << unused_ratio << "%)" << std::endl; |
191 | std::cerr << "dictionary size: " << dic->total_size() << std::endl; |
192 | |
193 | return true; |
194 | } |
195 | |
196 | // Builds a ranked guide from a dawg and its dictionary. |
197 | bool BuildRankedGuide(const dawgdic::Dawg &dawg, |
198 | const dawgdic::Dictionary &dic, |
199 | dawgdic::RankedGuide *guide) { |
200 | if (!dawgdic::RankedGuideBuilder::Build(dawg, dic, guide)) { |
201 | std::cerr << "failed to build RankedGuide" << std::endl; |
202 | return false; |
203 | } |
204 | |
205 | std::cerr << "no. units: " << guide->size() << std::endl; |
206 | std::cerr << "guide size: " << guide->total_size() << std::endl; |
207 | |
208 | return true; |
209 | } |
210 | |
211 | // Builds a guide from a dawg and its dictionary. |
212 | bool BuildGuide(const dawgdic::Dawg &dawg, |
213 | const dawgdic::Dictionary &dic, dawgdic::Guide *guide) { |
214 | if (!dawgdic::GuideBuilder::Build(dawg, dic, guide)) { |
215 | std::cerr << "failed to build Guide" << std::endl; |
216 | return false; |
217 | } |
218 | |
219 | std::cerr << "no. units: " << guide->size() << std::endl; |
220 | std::cerr << "guide size: " << guide->total_size() << std::endl; |
221 | |
222 | return true; |
223 | } |
224 | |
225 | } // namespace |
226 | |
227 | int main(int argc, char *argv[]) { |
228 | CommandOptions options; |
229 | if (!options.Parse(argc, argv)) { |
230 | CommandOptions::ShowUsage(&std::cerr); |
231 | return 1; |
232 | } else if (options.help()) { |
233 | CommandOptions::ShowUsage(&std::cerr); |
234 | return 0; |
235 | } |
236 | |
237 | const std::string &lexicon_file_name = options.lexicon_file_name(); |
238 | const std::string &dic_file_name = options.dic_file_name(); |
239 | |
240 | std::istream *lexicon_stream = &std::cin; |
241 | std::ostream *dic_stream = &std::cout; |
242 | |
243 | // Opens a lexicon file. |
244 | std::ifstream lexicon_file; |
245 | if (lexicon_file_name != "-" ) { |
246 | lexicon_file.open(lexicon_file_name.c_str(), std::ios::binary); |
247 | if (!lexicon_file) { |
248 | std::cerr << "error: failed to open LexiconFile: " |
249 | << lexicon_file_name << std::endl; |
250 | return 1; |
251 | } |
252 | lexicon_stream = &lexicon_file; |
253 | } |
254 | |
255 | // Opens a dictionary file. |
256 | std::ofstream dic_file; |
257 | if (dic_file_name != "-" ) { |
258 | dic_file.open(dic_file_name.c_str(), std::ios::binary); |
259 | if (!dic_file) { |
260 | std::cerr << "error: failed to open DicFile: " |
261 | << dic_file_name << std::endl; |
262 | return 1; |
263 | } |
264 | dic_stream = &dic_file; |
265 | } |
266 | |
267 | dawgdic::Dawg dawg; |
268 | if (!BuildDawg(lexicon_stream, &dawg, options.tab())) { |
269 | return 1; |
270 | } |
271 | |
272 | dawgdic::Dictionary dic; |
273 | if (!BuildDictionary(dawg, &dic)) { |
274 | return 1; |
275 | } |
276 | |
277 | if (!dic.Write(dic_stream)) { |
278 | std::cerr << "error: failed to write Dictionary" << std::endl; |
279 | return 1; |
280 | } |
281 | |
282 | // Builds a guide. |
283 | if (options.ranked()) { |
284 | dawgdic::RankedGuide guide; |
285 | if (!BuildRankedGuide(dawg, dic, &guide)) { |
286 | return 1; |
287 | } |
288 | if (!guide.Write(dic_stream)) { |
289 | std::cerr << "error: failed to write RankedGuide" << std::endl; |
290 | return 1; |
291 | } |
292 | } else if (options.guide()) { |
293 | dawgdic::Guide guide; |
294 | if (!BuildGuide(dawg, dic, &guide)) { |
295 | return 1; |
296 | } |
297 | if (!guide.Write(dic_stream)) { |
298 | std::cerr << "error: failed to write Guide" << std::endl; |
299 | return 1; |
300 | } |
301 | } |
302 | |
303 | return 0; |
304 | } |
305 | |