1/* -*- c-basic-offset: 2 -*- */
2/* Copyright(C) 2012-2015 Brazil
3
4 This library is free software; you can redistribute it and/or
5 modify it under the terms of the GNU Lesser General Public
6 License version 2.1 as published by the Free Software Foundation.
7
8 This library is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 Lesser General Public License for more details.
12
13 You should have received a copy of the GNU Lesser General Public
14 License along with this library; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16*/
17
18#ifdef GRN_EMBEDDED
19# define GRN_PLUGIN_FUNCTION_TAG query_expanders_tsv
20#endif
21
22#ifdef HAVE_CONFIG_H
23# include <config.h>
24#endif /* HAVE_CONFIG_H */
25
26#include <groonga/plugin.h>
27
28#include <stdlib.h>
29#include <string.h>
30
31#ifdef WIN32
32# include <windows.h>
33# include <share.h>
34#endif /* WIN32 */
35
/* Size in bytes of one stored expansion, including its terminating NUL.
   Also used as the value size of the synonyms hash table. */
#define MAX_SYNONYM_BYTES (4 * 1024)
37
38static grn_hash *synonyms = NULL;
39
40#ifdef WIN32
41static char win32_synonyms_file[MAX_PATH] = "";
42const char *
43get_system_synonyms_file(void)
44{
45 if (win32_synonyms_file[0] == '\0') {
46 const char *base_dir;
47 const char *relative_path = GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE;
48 size_t base_dir_length;
49
50 base_dir = grn_plugin_windows_base_dir();
51 base_dir_length = strlen(base_dir);
52 grn_strcpy(win32_synonyms_file, MAX_PATH, base_dir);
53 grn_strcat(win32_synonyms_file, MAX_PATH, "/");
54 grn_strcat(win32_synonyms_file, MAX_PATH, relative_path);
55 }
56 return win32_synonyms_file;
57}
58
59#else /* WIN32 */
60const char *
61get_system_synonyms_file(void)
62{
63 return GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE;
64}
65#endif /* WIN32 */
66
67static grn_bool
68is_comment_mark(char character)
69{
70 return character == '#';
71}
72
73static grn_encoding
74detect_coding_part(grn_ctx *ctx, const char *line, size_t line_length)
75{
76 grn_encoding encoding = GRN_ENC_NONE;
77 grn_obj null_terminated_line_buffer;
78 const char *c_line;
79 const char *coding_part_keyword = "coding: ";
80 const char *coding_part;
81 const char *encoding_name;
82
83 GRN_TEXT_INIT(&null_terminated_line_buffer, 0);
84 GRN_TEXT_PUT(ctx, &null_terminated_line_buffer, line, line_length);
85 GRN_TEXT_PUTC(ctx, &null_terminated_line_buffer, '\0');
86
87 c_line = GRN_TEXT_VALUE(&null_terminated_line_buffer);
88 coding_part = strstr(c_line, coding_part_keyword);
89 if (coding_part) {
90 encoding_name = coding_part + strlen(coding_part_keyword);
91 if (grn_strncasecmp(encoding_name, "utf-8", strlen("utf-8")) == 0 ||
92 grn_strncasecmp(encoding_name, "utf8", strlen("utf8")) == 0) {
93 encoding = GRN_ENC_UTF8;
94 } else if (grn_strncasecmp(encoding_name, "sjis", strlen("sjis")) == 0 ||
95 grn_strncasecmp(encoding_name, "Shift_JIS", strlen("Shift_JIS")) == 0) {
96 encoding = GRN_ENC_SJIS;
97 } else if (grn_strncasecmp(encoding_name, "EUC-JP", strlen("EUC-JP")) == 0 ||
98 grn_strncasecmp(encoding_name, "euc_jp", strlen("euc_jp")) == 0) {
99 encoding = GRN_ENC_EUC_JP;
100 } else if (grn_strncasecmp(encoding_name, "latin1", strlen("latin1")) == 0) {
101 encoding = GRN_ENC_LATIN1;
102 } else if (grn_strncasecmp(encoding_name, "KOI8-R", strlen("KOI8-R")) == 0 ||
103 grn_strncasecmp(encoding_name, "koi8r", strlen("koi8r")) == 0) {
104 encoding = GRN_ENC_KOI8R;
105 }
106 } else {
107 encoding = ctx->encoding;
108 }
109 GRN_OBJ_FIN(ctx, &null_terminated_line_buffer);
110
111 return encoding;
112}
113
114static grn_encoding
115guess_encoding(grn_ctx *ctx, const char **line, size_t *line_length)
116{
117 const char bom[] = {0xef, 0xbb, 0xbf};
118 size_t bom_length = sizeof(bom);
119
120 if (*line_length >= bom_length && memcmp(*line, bom, bom_length) == 0) {
121 *line += bom_length;
122 *line_length -= bom_length;
123 return GRN_ENC_UTF8;
124 }
125
126 if (!is_comment_mark((*line)[0])) {
127 return ctx->encoding;
128 }
129
130 return detect_coding_part(ctx, (*line) + 1, (*line_length) - 1);
131}
132
133static void
134parse_synonyms_file_line(grn_ctx *ctx, const char *line, size_t line_length,
135 grn_obj *key, grn_obj *value)
136{
137 size_t i = 0;
138
139 if (is_comment_mark(line[i])) {
140 return;
141 }
142
143 while (i < line_length) {
144 char character = line[i];
145 i++;
146 if (character == '\t') {
147 break;
148 }
149 GRN_TEXT_PUTC(ctx, key, character);
150 }
151
152 if (i == line_length) {
153 return;
154 }
155
156 GRN_TEXT_PUTS(ctx, value, "((");
157 while (i < line_length) {
158 char character = line[i];
159 i++;
160 if (character == '\t') {
161 GRN_TEXT_PUTS(ctx, value, ") OR (");
162 } else {
163 GRN_TEXT_PUTC(ctx, value, character);
164 }
165 }
166 GRN_TEXT_PUTS(ctx, value, "))");
167
168 {
169 grn_id id;
170 void *value_location = NULL;
171
172 id = grn_hash_add(ctx, synonyms, GRN_TEXT_VALUE(key), GRN_TEXT_LEN(key),
173 &value_location, NULL);
174 if (id == GRN_ID_NIL) {
175 GRN_PLUGIN_LOG(ctx, GRN_LOG_WARNING,
176 "[plugin][query-expander][tsv] "
177 "failed to register key: <%.*s>",
178 (int)GRN_TEXT_LEN(key), GRN_TEXT_VALUE(key));
179 return;
180 }
181
182 if (GRN_TEXT_LEN(value) <= MAX_SYNONYM_BYTES - 1) {
183 GRN_TEXT_PUTC(ctx, value, '\0');
184 } else {
185 grn_bulk_truncate(ctx, value, MAX_SYNONYM_BYTES - 1);
186 GRN_TEXT_PUTC(ctx, value, '\0');
187 }
188 grn_memcpy(value_location, GRN_TEXT_VALUE(value), GRN_TEXT_LEN(value));
189 }
190}
191
192static void
193load_synonyms(grn_ctx *ctx)
194{
195 static char path_env[GRN_ENV_BUFFER_SIZE];
196 const char *path;
197 grn_file_reader *file_reader;
198 int number_of_lines;
199 grn_encoding encoding;
200 grn_obj line, key, value;
201
202 grn_getenv("GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE",
203 path_env,
204 GRN_ENV_BUFFER_SIZE);
205 if (path_env[0]) {
206 path = path_env;
207 } else {
208 path = get_system_synonyms_file();
209 }
210 file_reader = grn_file_reader_open(ctx, path);
211 if (!file_reader) {
212 GRN_LOG(ctx, GRN_LOG_WARNING,
213 "[plugin][query-expander][tsv] "
214 "synonyms file doesn't exist: <%s>",
215 path);
216 return;
217 }
218
219 GRN_TEXT_INIT(&line, 0);
220 GRN_TEXT_INIT(&key, 0);
221 GRN_TEXT_INIT(&value, 0);
222 grn_bulk_reserve(ctx, &value, MAX_SYNONYM_BYTES);
223 number_of_lines = 0;
224 while (grn_file_reader_read_line(ctx, file_reader, &line) == GRN_SUCCESS) {
225 const char *line_value = GRN_TEXT_VALUE(&line);
226 size_t line_length = GRN_TEXT_LEN(&line);
227
228 if (line_length > 0 && line_value[line_length - 1] == '\n') {
229 if (line_length > 1 && line_value[line_length - 2] == '\r') {
230 line_length -= 2;
231 } else {
232 line_length -= 1;
233 }
234 }
235 number_of_lines++;
236 if (number_of_lines == 1) {
237 encoding = guess_encoding(ctx, &line_value, &line_length);
238 }
239 GRN_BULK_REWIND(&key);
240 GRN_BULK_REWIND(&value);
241 parse_synonyms_file_line(ctx, line_value, line_length, &key, &value);
242 GRN_BULK_REWIND(&line);
243 }
244 GRN_OBJ_FIN(ctx, &line);
245 GRN_OBJ_FIN(ctx, &key);
246 GRN_OBJ_FIN(ctx, &value);
247
248 grn_file_reader_close(ctx, file_reader);
249}
250
251static grn_obj *
252func_query_expander_tsv(grn_ctx *ctx, int nargs, grn_obj **args,
253 grn_user_data *user_data)
254{
255 grn_rc rc = GRN_END_OF_DATA;
256 grn_id id;
257 grn_obj *term, *expanded_term;
258 void *value;
259 grn_obj *rc_object;
260
261 term = args[0];
262 expanded_term = args[1];
263 id = grn_hash_get(ctx, synonyms,
264 GRN_TEXT_VALUE(term), GRN_TEXT_LEN(term),
265 &value);
266 if (id != GRN_ID_NIL) {
267 const char *query = value;
268 GRN_TEXT_PUTS(ctx, expanded_term, query);
269 rc = GRN_SUCCESS;
270 }
271
272 rc_object = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_INT32, 0);
273 if (rc_object) {
274 GRN_INT32_SET(ctx, rc_object, rc);
275 }
276
277 return rc_object;
278}
279
280grn_rc
281GRN_PLUGIN_INIT(grn_ctx *ctx)
282{
283 if (!synonyms) {
284 synonyms = grn_hash_create(ctx, NULL,
285 GRN_TABLE_MAX_KEY_SIZE,
286 MAX_SYNONYM_BYTES,
287 GRN_OBJ_TABLE_HASH_KEY | GRN_OBJ_KEY_VAR_SIZE);
288 if (!synonyms) {
289 return ctx->rc;
290 }
291 load_synonyms(ctx);
292 }
293 return ctx->rc;
294}
295
296grn_rc
297GRN_PLUGIN_REGISTER(grn_ctx *ctx)
298{
299 grn_proc_create(ctx, "QueryExpanderTSV", strlen("QueryExpanderTSV"),
300 GRN_PROC_FUNCTION,
301 func_query_expander_tsv, NULL, NULL,
302 0, NULL);
303 return GRN_SUCCESS;
304}
305
306grn_rc
307GRN_PLUGIN_FIN(grn_ctx *ctx)
308{
309 if (synonyms) {
310 grn_hash_close(ctx, synonyms);
311 synonyms = NULL;
312 }
313 return GRN_SUCCESS;
314}
315