1 | /* -*- c-basic-offset: 2 -*- */ |
2 | /* Copyright(C) 2012-2015 Brazil |
3 | |
4 | This library is free software; you can redistribute it and/or |
5 | modify it under the terms of the GNU Lesser General Public |
6 | License version 2.1 as published by the Free Software Foundation. |
7 | |
8 | This library is distributed in the hope that it will be useful, |
9 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
11 | Lesser General Public License for more details. |
12 | |
13 | You should have received a copy of the GNU Lesser General Public |
14 | License along with this library; if not, write to the Free Software |
15 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
16 | */ |
17 | |
18 | #ifdef GRN_EMBEDDED |
19 | # define GRN_PLUGIN_FUNCTION_TAG query_expanders_tsv |
20 | #endif |
21 | |
22 | #ifdef HAVE_CONFIG_H |
23 | # include <config.h> |
24 | #endif /* HAVE_CONFIG_H */ |
25 | |
26 | #include <groonga/plugin.h> |
27 | |
28 | #include <stdlib.h> |
29 | #include <string.h> |
30 | |
31 | #ifdef WIN32 |
32 | # include <windows.h> |
33 | # include <share.h> |
34 | #endif /* WIN32 */ |
35 | |
/* Size in bytes of each value slot in the synonyms hash table. An expanded
   query longer than this is truncated so that it fits together with its
   terminating NUL byte. */
#define MAX_SYNONYM_BYTES 4096

/* Hash table that maps a term to its expanded query string. It is created
   in GRN_PLUGIN_INIT and closed in GRN_PLUGIN_FIN. */
static grn_hash *synonyms = NULL;
39 | |
40 | #ifdef WIN32 |
41 | static char win32_synonyms_file[MAX_PATH] = "" ; |
42 | const char * |
43 | get_system_synonyms_file(void) |
44 | { |
45 | if (win32_synonyms_file[0] == '\0') { |
46 | const char *base_dir; |
47 | const char *relative_path = GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE; |
48 | size_t base_dir_length; |
49 | |
50 | base_dir = grn_plugin_windows_base_dir(); |
51 | base_dir_length = strlen(base_dir); |
52 | grn_strcpy(win32_synonyms_file, MAX_PATH, base_dir); |
53 | grn_strcat(win32_synonyms_file, MAX_PATH, "/" ); |
54 | grn_strcat(win32_synonyms_file, MAX_PATH, relative_path); |
55 | } |
56 | return win32_synonyms_file; |
57 | } |
58 | |
59 | #else /* WIN32 */ |
60 | const char * |
61 | get_system_synonyms_file(void) |
62 | { |
63 | return GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE; |
64 | } |
65 | #endif /* WIN32 */ |
66 | |
67 | static grn_bool |
68 | (char character) |
69 | { |
70 | return character == '#'; |
71 | } |
72 | |
73 | static grn_encoding |
74 | detect_coding_part(grn_ctx *ctx, const char *line, size_t line_length) |
75 | { |
76 | grn_encoding encoding = GRN_ENC_NONE; |
77 | grn_obj null_terminated_line_buffer; |
78 | const char *c_line; |
79 | const char *coding_part_keyword = "coding: " ; |
80 | const char *coding_part; |
81 | const char *encoding_name; |
82 | |
83 | GRN_TEXT_INIT(&null_terminated_line_buffer, 0); |
84 | GRN_TEXT_PUT(ctx, &null_terminated_line_buffer, line, line_length); |
85 | GRN_TEXT_PUTC(ctx, &null_terminated_line_buffer, '\0'); |
86 | |
87 | c_line = GRN_TEXT_VALUE(&null_terminated_line_buffer); |
88 | coding_part = strstr(c_line, coding_part_keyword); |
89 | if (coding_part) { |
90 | encoding_name = coding_part + strlen(coding_part_keyword); |
91 | if (grn_strncasecmp(encoding_name, "utf-8" , strlen("utf-8" )) == 0 || |
92 | grn_strncasecmp(encoding_name, "utf8" , strlen("utf8" )) == 0) { |
93 | encoding = GRN_ENC_UTF8; |
94 | } else if (grn_strncasecmp(encoding_name, "sjis" , strlen("sjis" )) == 0 || |
95 | grn_strncasecmp(encoding_name, "Shift_JIS" , strlen("Shift_JIS" )) == 0) { |
96 | encoding = GRN_ENC_SJIS; |
97 | } else if (grn_strncasecmp(encoding_name, "EUC-JP" , strlen("EUC-JP" )) == 0 || |
98 | grn_strncasecmp(encoding_name, "euc_jp" , strlen("euc_jp" )) == 0) { |
99 | encoding = GRN_ENC_EUC_JP; |
100 | } else if (grn_strncasecmp(encoding_name, "latin1" , strlen("latin1" )) == 0) { |
101 | encoding = GRN_ENC_LATIN1; |
102 | } else if (grn_strncasecmp(encoding_name, "KOI8-R" , strlen("KOI8-R" )) == 0 || |
103 | grn_strncasecmp(encoding_name, "koi8r" , strlen("koi8r" )) == 0) { |
104 | encoding = GRN_ENC_KOI8R; |
105 | } |
106 | } else { |
107 | encoding = ctx->encoding; |
108 | } |
109 | GRN_OBJ_FIN(ctx, &null_terminated_line_buffer); |
110 | |
111 | return encoding; |
112 | } |
113 | |
114 | static grn_encoding |
115 | guess_encoding(grn_ctx *ctx, const char **line, size_t *line_length) |
116 | { |
117 | const char bom[] = {0xef, 0xbb, 0xbf}; |
118 | size_t bom_length = sizeof(bom); |
119 | |
120 | if (*line_length >= bom_length && memcmp(*line, bom, bom_length) == 0) { |
121 | *line += bom_length; |
122 | *line_length -= bom_length; |
123 | return GRN_ENC_UTF8; |
124 | } |
125 | |
126 | if (!is_comment_mark((*line)[0])) { |
127 | return ctx->encoding; |
128 | } |
129 | |
130 | return detect_coding_part(ctx, (*line) + 1, (*line_length) - 1); |
131 | } |
132 | |
133 | static void |
134 | parse_synonyms_file_line(grn_ctx *ctx, const char *line, size_t line_length, |
135 | grn_obj *key, grn_obj *value) |
136 | { |
137 | size_t i = 0; |
138 | |
139 | if (is_comment_mark(line[i])) { |
140 | return; |
141 | } |
142 | |
143 | while (i < line_length) { |
144 | char character = line[i]; |
145 | i++; |
146 | if (character == '\t') { |
147 | break; |
148 | } |
149 | GRN_TEXT_PUTC(ctx, key, character); |
150 | } |
151 | |
152 | if (i == line_length) { |
153 | return; |
154 | } |
155 | |
156 | GRN_TEXT_PUTS(ctx, value, "((" ); |
157 | while (i < line_length) { |
158 | char character = line[i]; |
159 | i++; |
160 | if (character == '\t') { |
161 | GRN_TEXT_PUTS(ctx, value, ") OR (" ); |
162 | } else { |
163 | GRN_TEXT_PUTC(ctx, value, character); |
164 | } |
165 | } |
166 | GRN_TEXT_PUTS(ctx, value, "))" ); |
167 | |
168 | { |
169 | grn_id id; |
170 | void *value_location = NULL; |
171 | |
172 | id = grn_hash_add(ctx, synonyms, GRN_TEXT_VALUE(key), GRN_TEXT_LEN(key), |
173 | &value_location, NULL); |
174 | if (id == GRN_ID_NIL) { |
175 | GRN_PLUGIN_LOG(ctx, GRN_LOG_WARNING, |
176 | "[plugin][query-expander][tsv] " |
177 | "failed to register key: <%.*s>" , |
178 | (int)GRN_TEXT_LEN(key), GRN_TEXT_VALUE(key)); |
179 | return; |
180 | } |
181 | |
182 | if (GRN_TEXT_LEN(value) <= MAX_SYNONYM_BYTES - 1) { |
183 | GRN_TEXT_PUTC(ctx, value, '\0'); |
184 | } else { |
185 | grn_bulk_truncate(ctx, value, MAX_SYNONYM_BYTES - 1); |
186 | GRN_TEXT_PUTC(ctx, value, '\0'); |
187 | } |
188 | grn_memcpy(value_location, GRN_TEXT_VALUE(value), GRN_TEXT_LEN(value)); |
189 | } |
190 | } |
191 | |
192 | static void |
193 | load_synonyms(grn_ctx *ctx) |
194 | { |
195 | static char path_env[GRN_ENV_BUFFER_SIZE]; |
196 | const char *path; |
197 | grn_file_reader *file_reader; |
198 | int number_of_lines; |
199 | grn_encoding encoding; |
200 | grn_obj line, key, value; |
201 | |
202 | grn_getenv("GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE" , |
203 | path_env, |
204 | GRN_ENV_BUFFER_SIZE); |
205 | if (path_env[0]) { |
206 | path = path_env; |
207 | } else { |
208 | path = get_system_synonyms_file(); |
209 | } |
210 | file_reader = grn_file_reader_open(ctx, path); |
211 | if (!file_reader) { |
212 | GRN_LOG(ctx, GRN_LOG_WARNING, |
213 | "[plugin][query-expander][tsv] " |
214 | "synonyms file doesn't exist: <%s>" , |
215 | path); |
216 | return; |
217 | } |
218 | |
219 | GRN_TEXT_INIT(&line, 0); |
220 | GRN_TEXT_INIT(&key, 0); |
221 | GRN_TEXT_INIT(&value, 0); |
222 | grn_bulk_reserve(ctx, &value, MAX_SYNONYM_BYTES); |
223 | number_of_lines = 0; |
224 | while (grn_file_reader_read_line(ctx, file_reader, &line) == GRN_SUCCESS) { |
225 | const char *line_value = GRN_TEXT_VALUE(&line); |
226 | size_t line_length = GRN_TEXT_LEN(&line); |
227 | |
228 | if (line_length > 0 && line_value[line_length - 1] == '\n') { |
229 | if (line_length > 1 && line_value[line_length - 2] == '\r') { |
230 | line_length -= 2; |
231 | } else { |
232 | line_length -= 1; |
233 | } |
234 | } |
235 | number_of_lines++; |
236 | if (number_of_lines == 1) { |
237 | encoding = guess_encoding(ctx, &line_value, &line_length); |
238 | } |
239 | GRN_BULK_REWIND(&key); |
240 | GRN_BULK_REWIND(&value); |
241 | parse_synonyms_file_line(ctx, line_value, line_length, &key, &value); |
242 | GRN_BULK_REWIND(&line); |
243 | } |
244 | GRN_OBJ_FIN(ctx, &line); |
245 | GRN_OBJ_FIN(ctx, &key); |
246 | GRN_OBJ_FIN(ctx, &value); |
247 | |
248 | grn_file_reader_close(ctx, file_reader); |
249 | } |
250 | |
251 | static grn_obj * |
252 | func_query_expander_tsv(grn_ctx *ctx, int nargs, grn_obj **args, |
253 | grn_user_data *user_data) |
254 | { |
255 | grn_rc rc = GRN_END_OF_DATA; |
256 | grn_id id; |
257 | grn_obj *term, *expanded_term; |
258 | void *value; |
259 | grn_obj *rc_object; |
260 | |
261 | term = args[0]; |
262 | expanded_term = args[1]; |
263 | id = grn_hash_get(ctx, synonyms, |
264 | GRN_TEXT_VALUE(term), GRN_TEXT_LEN(term), |
265 | &value); |
266 | if (id != GRN_ID_NIL) { |
267 | const char *query = value; |
268 | GRN_TEXT_PUTS(ctx, expanded_term, query); |
269 | rc = GRN_SUCCESS; |
270 | } |
271 | |
272 | rc_object = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_INT32, 0); |
273 | if (rc_object) { |
274 | GRN_INT32_SET(ctx, rc_object, rc); |
275 | } |
276 | |
277 | return rc_object; |
278 | } |
279 | |
280 | grn_rc |
281 | GRN_PLUGIN_INIT(grn_ctx *ctx) |
282 | { |
283 | if (!synonyms) { |
284 | synonyms = grn_hash_create(ctx, NULL, |
285 | GRN_TABLE_MAX_KEY_SIZE, |
286 | MAX_SYNONYM_BYTES, |
287 | GRN_OBJ_TABLE_HASH_KEY | GRN_OBJ_KEY_VAR_SIZE); |
288 | if (!synonyms) { |
289 | return ctx->rc; |
290 | } |
291 | load_synonyms(ctx); |
292 | } |
293 | return ctx->rc; |
294 | } |
295 | |
296 | grn_rc |
297 | GRN_PLUGIN_REGISTER(grn_ctx *ctx) |
298 | { |
299 | grn_proc_create(ctx, "QueryExpanderTSV" , strlen("QueryExpanderTSV" ), |
300 | GRN_PROC_FUNCTION, |
301 | func_query_expander_tsv, NULL, NULL, |
302 | 0, NULL); |
303 | return GRN_SUCCESS; |
304 | } |
305 | |
306 | grn_rc |
307 | GRN_PLUGIN_FIN(grn_ctx *ctx) |
308 | { |
309 | if (synonyms) { |
310 | grn_hash_close(ctx, synonyms); |
311 | synonyms = NULL; |
312 | } |
313 | return GRN_SUCCESS; |
314 | } |
315 | |