1#include <dawgdic/dawg-builder.h>
2#include <dawgdic/dictionary-builder.h>
3#include <dawgdic/guide-builder.h>
4#include <dawgdic/ranked-guide-builder.h>
5
6#include <cstdlib>
7#include <fstream>
8#include <iostream>
9#include <limits>
10#include <string>
11
12namespace {
13
// Holds the settings given on the command line: four boolean flags and the
// names of the lexicon (input) and dictionary (output) files.
class CommandOptions {
 public:
  // Starts with every flag cleared and both file names empty.
  CommandOptions()
      : help_(false), tab_(false), guide_(false), ranked_(false),
        lexicon_file_name_(), dic_file_name_() {}

  // Flag accessors.
  bool help() const { return help_; }
  bool tab() const { return tab_; }
  bool guide() const { return guide_; }
  bool ranked() const { return ranked_; }

  // File name accessors.  After a successful Parse() neither is empty.
  const std::string &lexicon_file_name() const { return lexicon_file_name_; }
  const std::string &dic_file_name() const { return dic_file_name_; }

  // Parses argv[1] .. argv[argc - 1].
  //
  // An argument that starts with '-' and has at least one more character is
  // a bundle of single-letter flags (h, t, g, r).  Any other argument,
  // including a lone "-", is positional: the first one names the lexicon
  // file and the second one names the dictionary file.  File names that are
  // still empty at the end default to "-".
  //
  // Returns false on an unknown flag or on a third positional argument.
  bool Parse(int argc, char *argv[]) {
    for (int arg_id = 1; arg_id < argc; ++arg_id) {
      const char *const arg = argv[arg_id];

      const bool is_flag_bundle = (arg[0] == '-') && (arg[1] != '\0');
      if (is_flag_bundle) {
        for (const char *flag = arg + 1; *flag != '\0'; ++flag) {
          if (!SetFlag(*flag)) {
            // Invalid option.
            return false;
          }
        }
        continue;
      }

      // Positional arguments fill the file names in order.
      if (lexicon_file_name_.empty()) {
        lexicon_file_name_ = arg;
      } else if (dic_file_name_.empty()) {
        dic_file_name_ = arg;
      } else {
        // Too many arguments.
        return false;
      }
    }

    // Falls back to "-" for file names that were not given.
    if (lexicon_file_name_.empty()) {
      lexicon_file_name_ = "-";
    }
    if (dic_file_name_.empty()) {
      dic_file_name_ = "-";
    }
    return true;
  }

  // Writes the usage message, followed by an empty line, to *output and
  // flushes it.
  static void ShowUsage(std::ostream *output) {
    std::ostream &out = *output;
    out << "Usage: - [Options] [LexiconFile] [DicFile]\n";
    out << "\n";
    out << "Options:\n";
    out << " -h display this help and exit\n";
    out << " -t handle tab as separator\n";
    out << " -g build dictionary with guide\n";
    out << " -r build dictionary with ranked guide\n";
    out << std::endl;
  }

 private:
  bool help_;
  bool tab_;
  bool guide_;
  bool ranked_;
  std::string lexicon_file_name_;
  std::string dic_file_name_;

  // Turns on the flag selected by a single option letter.
  // Returns false if the letter is not a known option.
  bool SetFlag(char letter) {
    if (letter == 'h') {
      help_ = true;
    } else if (letter == 't') {
      tab_ = true;
    } else if (letter == 'g') {
      guide_ = true;
    } else if (letter == 'r') {
      ranked_ = true;
    } else {
      return false;
    }
    return true;
  }

  // Disallows copies.
  CommandOptions(const CommandOptions &);
  CommandOptions &operator=(const CommandOptions &);
};
111
112// Builds a dawg from a sorted lexicon.
113bool BuildDawg(std::istream *lexicon_stream,
114 dawgdic::Dawg *dawg, bool tab_on) {
115 dawgdic::DawgBuilder dawg_builder;
116
117 // Reads keys from an input stream and inserts them into a dawg.
118 std::string key;
119 std::size_t key_count = 0;
120 while (std::getline(*lexicon_stream, key)) {
121 std::string::size_type delim_pos = std::string::npos;
122 if (tab_on) {
123 delim_pos = key.find_first_of('\t');
124 }
125
126 if (delim_pos == std::string::npos) {
127 if (!dawg_builder.Insert(key.c_str())) {
128 std::cerr << "error: failed to insert key: "
129 << key << std::endl;
130 return false;
131 }
132 } else {
133 static const dawgdic::ValueType MAX_VALUE =
134 std::numeric_limits<dawgdic::ValueType>::max();
135
136 // Fixes an invalid record value.
137 long long record = std::strtoll(key.c_str() + delim_pos + 1, NULL, 10);
138 dawgdic::ValueType value =
139 static_cast<dawgdic::ValueType>(record);
140 if (record < 0) {
141 std::cerr << "warning: negative value is replaced by 0: "
142 << record << std::endl;
143 value = 0;
144 } else if (record > MAX_VALUE) {
145 std::cerr << "warning: too large value is replaced by "
146 << MAX_VALUE << ": " << record << std::endl;
147 value = MAX_VALUE;
148 }
149
150 if (!dawg_builder.Insert(key.c_str(), delim_pos, value)) {
151 std::cerr << "error: failed to insert key: "
152 << key << std::endl;
153 return false;
154 }
155 }
156
157 if (++key_count % 10000 == 0) {
158 std::cerr << "no. keys: " << key_count << '\r';
159 }
160 }
161
162 dawg_builder.Finish(dawg);
163
164 std::cerr << "no. keys: " << key_count << std::endl;
165 std::cerr << "no. states: "
166 << dawg->num_of_states() << std::endl;
167 std::cerr << "no. transitions: "
168 << dawg->num_of_transitions() << std::endl;
169 std::cerr << "no. merged states: "
170 << dawg->num_of_merged_states() << std::endl;
171 std::cerr << "no. merging states: "
172 << dawg->num_of_merging_states() << std::endl;
173 std::cerr << "no. merged transitions: "
174 << dawg->num_of_merged_transitions() << std::endl;
175
176 return true;
177}
178
179// Builds a dictionary from a dawg.
180bool BuildDictionary(const dawgdic::Dawg &dawg, dawgdic::Dictionary *dic) {
181 dawgdic::BaseType num_of_unused_units = 0;
182 if (!dawgdic::DictionaryBuilder::Build(dawg, dic, &num_of_unused_units)) {
183 std::cerr << "error: failed to build Dictionary" << std::endl;
184 return false;
185 }
186 double unused_ratio = 100.0 * num_of_unused_units / dic->size();
187
188 std::cerr << "no. elements: " << dic->size() << std::endl;
189 std::cerr << "no. unused elements: " << num_of_unused_units
190 << " (" << unused_ratio << "%)" << std::endl;
191 std::cerr << "dictionary size: " << dic->total_size() << std::endl;
192
193 return true;
194}
195
196// Builds a ranked guide from a dawg and its dictionary.
197bool BuildRankedGuide(const dawgdic::Dawg &dawg,
198 const dawgdic::Dictionary &dic,
199 dawgdic::RankedGuide *guide) {
200 if (!dawgdic::RankedGuideBuilder::Build(dawg, dic, guide)) {
201 std::cerr << "failed to build RankedGuide" << std::endl;
202 return false;
203 }
204
205 std::cerr << "no. units: " << guide->size() << std::endl;
206 std::cerr << "guide size: " << guide->total_size() << std::endl;
207
208 return true;
209}
210
211// Builds a guide from a dawg and its dictionary.
212bool BuildGuide(const dawgdic::Dawg &dawg,
213 const dawgdic::Dictionary &dic, dawgdic::Guide *guide) {
214 if (!dawgdic::GuideBuilder::Build(dawg, dic, guide)) {
215 std::cerr << "failed to build Guide" << std::endl;
216 return false;
217 }
218
219 std::cerr << "no. units: " << guide->size() << std::endl;
220 std::cerr << "guide size: " << guide->total_size() << std::endl;
221
222 return true;
223}
224
225} // namespace
226
227int main(int argc, char *argv[]) {
228 CommandOptions options;
229 if (!options.Parse(argc, argv)) {
230 CommandOptions::ShowUsage(&std::cerr);
231 return 1;
232 } else if (options.help()) {
233 CommandOptions::ShowUsage(&std::cerr);
234 return 0;
235 }
236
237 const std::string &lexicon_file_name = options.lexicon_file_name();
238 const std::string &dic_file_name = options.dic_file_name();
239
240 std::istream *lexicon_stream = &std::cin;
241 std::ostream *dic_stream = &std::cout;
242
243 // Opens a lexicon file.
244 std::ifstream lexicon_file;
245 if (lexicon_file_name != "-") {
246 lexicon_file.open(lexicon_file_name.c_str(), std::ios::binary);
247 if (!lexicon_file) {
248 std::cerr << "error: failed to open LexiconFile: "
249 << lexicon_file_name << std::endl;
250 return 1;
251 }
252 lexicon_stream = &lexicon_file;
253 }
254
255 // Opens a dictionary file.
256 std::ofstream dic_file;
257 if (dic_file_name != "-") {
258 dic_file.open(dic_file_name.c_str(), std::ios::binary);
259 if (!dic_file) {
260 std::cerr << "error: failed to open DicFile: "
261 << dic_file_name << std::endl;
262 return 1;
263 }
264 dic_stream = &dic_file;
265 }
266
267 dawgdic::Dawg dawg;
268 if (!BuildDawg(lexicon_stream, &dawg, options.tab())) {
269 return 1;
270 }
271
272 dawgdic::Dictionary dic;
273 if (!BuildDictionary(dawg, &dic)) {
274 return 1;
275 }
276
277 if (!dic.Write(dic_stream)) {
278 std::cerr << "error: failed to write Dictionary" << std::endl;
279 return 1;
280 }
281
282 // Builds a guide.
283 if (options.ranked()) {
284 dawgdic::RankedGuide guide;
285 if (!BuildRankedGuide(dawg, dic, &guide)) {
286 return 1;
287 }
288 if (!guide.Write(dic_stream)) {
289 std::cerr << "error: failed to write RankedGuide" << std::endl;
290 return 1;
291 }
292 } else if (options.guide()) {
293 dawgdic::Guide guide;
294 if (!BuildGuide(dawg, dic, &guide)) {
295 return 1;
296 }
297 if (!guide.Write(dic_stream)) {
298 std::cerr << "error: failed to write Guide" << std::endl;
299 return 1;
300 }
301 }
302
303 return 0;
304}
305