| 1 | /* -*- c-basic-offset: 2 -*- */ |
| 2 | /* Copyright(C) 2012-2015 Brazil |
| 3 | |
| 4 | This library is free software; you can redistribute it and/or |
| 5 | modify it under the terms of the GNU Lesser General Public |
| 6 | License version 2.1 as published by the Free Software Foundation. |
| 7 | |
| 8 | This library is distributed in the hope that it will be useful, |
| 9 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 11 | Lesser General Public License for more details. |
| 12 | |
| 13 | You should have received a copy of the GNU Lesser General Public |
| 14 | License along with this library; if not, write to the Free Software |
| 15 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 16 | */ |
| 17 | |
| 18 | #ifdef GRN_EMBEDDED |
| 19 | # define GRN_PLUGIN_FUNCTION_TAG query_expanders_tsv |
| 20 | #endif |
| 21 | |
| 22 | #ifdef HAVE_CONFIG_H |
| 23 | # include <config.h> |
| 24 | #endif /* HAVE_CONFIG_H */ |
| 25 | |
| 26 | #include <groonga/plugin.h> |
| 27 | |
| 28 | #include <stdlib.h> |
| 29 | #include <string.h> |
| 30 | |
| 31 | #ifdef WIN32 |
| 32 | # include <windows.h> |
| 33 | # include <share.h> |
| 34 | #endif /* WIN32 */ |
| 35 | |
/* Size in bytes of one stored expansion, including its terminating NUL.
   It is used both as the hash value size and as the truncation limit. */
#define MAX_SYNONYM_BYTES 4096

/* Hash that maps a term to its expanded query string. It is created in
   GRN_PLUGIN_INIT() and closed in GRN_PLUGIN_FIN(). */
static grn_hash *synonyms = NULL;
| 39 | |
| 40 | #ifdef WIN32 |
| 41 | static char win32_synonyms_file[MAX_PATH] = "" ; |
| 42 | const char * |
| 43 | get_system_synonyms_file(void) |
| 44 | { |
| 45 | if (win32_synonyms_file[0] == '\0') { |
| 46 | const char *base_dir; |
| 47 | const char *relative_path = GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE; |
| 48 | size_t base_dir_length; |
| 49 | |
| 50 | base_dir = grn_plugin_windows_base_dir(); |
| 51 | base_dir_length = strlen(base_dir); |
| 52 | grn_strcpy(win32_synonyms_file, MAX_PATH, base_dir); |
| 53 | grn_strcat(win32_synonyms_file, MAX_PATH, "/" ); |
| 54 | grn_strcat(win32_synonyms_file, MAX_PATH, relative_path); |
| 55 | } |
| 56 | return win32_synonyms_file; |
| 57 | } |
| 58 | |
| 59 | #else /* WIN32 */ |
| 60 | const char * |
| 61 | get_system_synonyms_file(void) |
| 62 | { |
| 63 | return GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE; |
| 64 | } |
| 65 | #endif /* WIN32 */ |
| 66 | |
| 67 | static grn_bool |
| 68 | (char character) |
| 69 | { |
| 70 | return character == '#'; |
| 71 | } |
| 72 | |
| 73 | static grn_encoding |
| 74 | detect_coding_part(grn_ctx *ctx, const char *line, size_t line_length) |
| 75 | { |
| 76 | grn_encoding encoding = GRN_ENC_NONE; |
| 77 | grn_obj null_terminated_line_buffer; |
| 78 | const char *c_line; |
| 79 | const char *coding_part_keyword = "coding: " ; |
| 80 | const char *coding_part; |
| 81 | const char *encoding_name; |
| 82 | |
| 83 | GRN_TEXT_INIT(&null_terminated_line_buffer, 0); |
| 84 | GRN_TEXT_PUT(ctx, &null_terminated_line_buffer, line, line_length); |
| 85 | GRN_TEXT_PUTC(ctx, &null_terminated_line_buffer, '\0'); |
| 86 | |
| 87 | c_line = GRN_TEXT_VALUE(&null_terminated_line_buffer); |
| 88 | coding_part = strstr(c_line, coding_part_keyword); |
| 89 | if (coding_part) { |
| 90 | encoding_name = coding_part + strlen(coding_part_keyword); |
| 91 | if (grn_strncasecmp(encoding_name, "utf-8" , strlen("utf-8" )) == 0 || |
| 92 | grn_strncasecmp(encoding_name, "utf8" , strlen("utf8" )) == 0) { |
| 93 | encoding = GRN_ENC_UTF8; |
| 94 | } else if (grn_strncasecmp(encoding_name, "sjis" , strlen("sjis" )) == 0 || |
| 95 | grn_strncasecmp(encoding_name, "Shift_JIS" , strlen("Shift_JIS" )) == 0) { |
| 96 | encoding = GRN_ENC_SJIS; |
| 97 | } else if (grn_strncasecmp(encoding_name, "EUC-JP" , strlen("EUC-JP" )) == 0 || |
| 98 | grn_strncasecmp(encoding_name, "euc_jp" , strlen("euc_jp" )) == 0) { |
| 99 | encoding = GRN_ENC_EUC_JP; |
| 100 | } else if (grn_strncasecmp(encoding_name, "latin1" , strlen("latin1" )) == 0) { |
| 101 | encoding = GRN_ENC_LATIN1; |
| 102 | } else if (grn_strncasecmp(encoding_name, "KOI8-R" , strlen("KOI8-R" )) == 0 || |
| 103 | grn_strncasecmp(encoding_name, "koi8r" , strlen("koi8r" )) == 0) { |
| 104 | encoding = GRN_ENC_KOI8R; |
| 105 | } |
| 106 | } else { |
| 107 | encoding = ctx->encoding; |
| 108 | } |
| 109 | GRN_OBJ_FIN(ctx, &null_terminated_line_buffer); |
| 110 | |
| 111 | return encoding; |
| 112 | } |
| 113 | |
| 114 | static grn_encoding |
| 115 | guess_encoding(grn_ctx *ctx, const char **line, size_t *line_length) |
| 116 | { |
| 117 | const char bom[] = {0xef, 0xbb, 0xbf}; |
| 118 | size_t bom_length = sizeof(bom); |
| 119 | |
| 120 | if (*line_length >= bom_length && memcmp(*line, bom, bom_length) == 0) { |
| 121 | *line += bom_length; |
| 122 | *line_length -= bom_length; |
| 123 | return GRN_ENC_UTF8; |
| 124 | } |
| 125 | |
| 126 | if (!is_comment_mark((*line)[0])) { |
| 127 | return ctx->encoding; |
| 128 | } |
| 129 | |
| 130 | return detect_coding_part(ctx, (*line) + 1, (*line_length) - 1); |
| 131 | } |
| 132 | |
| 133 | static void |
| 134 | parse_synonyms_file_line(grn_ctx *ctx, const char *line, size_t line_length, |
| 135 | grn_obj *key, grn_obj *value) |
| 136 | { |
| 137 | size_t i = 0; |
| 138 | |
| 139 | if (is_comment_mark(line[i])) { |
| 140 | return; |
| 141 | } |
| 142 | |
| 143 | while (i < line_length) { |
| 144 | char character = line[i]; |
| 145 | i++; |
| 146 | if (character == '\t') { |
| 147 | break; |
| 148 | } |
| 149 | GRN_TEXT_PUTC(ctx, key, character); |
| 150 | } |
| 151 | |
| 152 | if (i == line_length) { |
| 153 | return; |
| 154 | } |
| 155 | |
| 156 | GRN_TEXT_PUTS(ctx, value, "((" ); |
| 157 | while (i < line_length) { |
| 158 | char character = line[i]; |
| 159 | i++; |
| 160 | if (character == '\t') { |
| 161 | GRN_TEXT_PUTS(ctx, value, ") OR (" ); |
| 162 | } else { |
| 163 | GRN_TEXT_PUTC(ctx, value, character); |
| 164 | } |
| 165 | } |
| 166 | GRN_TEXT_PUTS(ctx, value, "))" ); |
| 167 | |
| 168 | { |
| 169 | grn_id id; |
| 170 | void *value_location = NULL; |
| 171 | |
| 172 | id = grn_hash_add(ctx, synonyms, GRN_TEXT_VALUE(key), GRN_TEXT_LEN(key), |
| 173 | &value_location, NULL); |
| 174 | if (id == GRN_ID_NIL) { |
| 175 | GRN_PLUGIN_LOG(ctx, GRN_LOG_WARNING, |
| 176 | "[plugin][query-expander][tsv] " |
| 177 | "failed to register key: <%.*s>" , |
| 178 | (int)GRN_TEXT_LEN(key), GRN_TEXT_VALUE(key)); |
| 179 | return; |
| 180 | } |
| 181 | |
| 182 | if (GRN_TEXT_LEN(value) <= MAX_SYNONYM_BYTES - 1) { |
| 183 | GRN_TEXT_PUTC(ctx, value, '\0'); |
| 184 | } else { |
| 185 | grn_bulk_truncate(ctx, value, MAX_SYNONYM_BYTES - 1); |
| 186 | GRN_TEXT_PUTC(ctx, value, '\0'); |
| 187 | } |
| 188 | grn_memcpy(value_location, GRN_TEXT_VALUE(value), GRN_TEXT_LEN(value)); |
| 189 | } |
| 190 | } |
| 191 | |
| 192 | static void |
| 193 | load_synonyms(grn_ctx *ctx) |
| 194 | { |
| 195 | static char path_env[GRN_ENV_BUFFER_SIZE]; |
| 196 | const char *path; |
| 197 | grn_file_reader *file_reader; |
| 198 | int number_of_lines; |
| 199 | grn_encoding encoding; |
| 200 | grn_obj line, key, value; |
| 201 | |
| 202 | grn_getenv("GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE" , |
| 203 | path_env, |
| 204 | GRN_ENV_BUFFER_SIZE); |
| 205 | if (path_env[0]) { |
| 206 | path = path_env; |
| 207 | } else { |
| 208 | path = get_system_synonyms_file(); |
| 209 | } |
| 210 | file_reader = grn_file_reader_open(ctx, path); |
| 211 | if (!file_reader) { |
| 212 | GRN_LOG(ctx, GRN_LOG_WARNING, |
| 213 | "[plugin][query-expander][tsv] " |
| 214 | "synonyms file doesn't exist: <%s>" , |
| 215 | path); |
| 216 | return; |
| 217 | } |
| 218 | |
| 219 | GRN_TEXT_INIT(&line, 0); |
| 220 | GRN_TEXT_INIT(&key, 0); |
| 221 | GRN_TEXT_INIT(&value, 0); |
| 222 | grn_bulk_reserve(ctx, &value, MAX_SYNONYM_BYTES); |
| 223 | number_of_lines = 0; |
| 224 | while (grn_file_reader_read_line(ctx, file_reader, &line) == GRN_SUCCESS) { |
| 225 | const char *line_value = GRN_TEXT_VALUE(&line); |
| 226 | size_t line_length = GRN_TEXT_LEN(&line); |
| 227 | |
| 228 | if (line_length > 0 && line_value[line_length - 1] == '\n') { |
| 229 | if (line_length > 1 && line_value[line_length - 2] == '\r') { |
| 230 | line_length -= 2; |
| 231 | } else { |
| 232 | line_length -= 1; |
| 233 | } |
| 234 | } |
| 235 | number_of_lines++; |
| 236 | if (number_of_lines == 1) { |
| 237 | encoding = guess_encoding(ctx, &line_value, &line_length); |
| 238 | } |
| 239 | GRN_BULK_REWIND(&key); |
| 240 | GRN_BULK_REWIND(&value); |
| 241 | parse_synonyms_file_line(ctx, line_value, line_length, &key, &value); |
| 242 | GRN_BULK_REWIND(&line); |
| 243 | } |
| 244 | GRN_OBJ_FIN(ctx, &line); |
| 245 | GRN_OBJ_FIN(ctx, &key); |
| 246 | GRN_OBJ_FIN(ctx, &value); |
| 247 | |
| 248 | grn_file_reader_close(ctx, file_reader); |
| 249 | } |
| 250 | |
| 251 | static grn_obj * |
| 252 | func_query_expander_tsv(grn_ctx *ctx, int nargs, grn_obj **args, |
| 253 | grn_user_data *user_data) |
| 254 | { |
| 255 | grn_rc rc = GRN_END_OF_DATA; |
| 256 | grn_id id; |
| 257 | grn_obj *term, *expanded_term; |
| 258 | void *value; |
| 259 | grn_obj *rc_object; |
| 260 | |
| 261 | term = args[0]; |
| 262 | expanded_term = args[1]; |
| 263 | id = grn_hash_get(ctx, synonyms, |
| 264 | GRN_TEXT_VALUE(term), GRN_TEXT_LEN(term), |
| 265 | &value); |
| 266 | if (id != GRN_ID_NIL) { |
| 267 | const char *query = value; |
| 268 | GRN_TEXT_PUTS(ctx, expanded_term, query); |
| 269 | rc = GRN_SUCCESS; |
| 270 | } |
| 271 | |
| 272 | rc_object = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_INT32, 0); |
| 273 | if (rc_object) { |
| 274 | GRN_INT32_SET(ctx, rc_object, rc); |
| 275 | } |
| 276 | |
| 277 | return rc_object; |
| 278 | } |
| 279 | |
| 280 | grn_rc |
| 281 | GRN_PLUGIN_INIT(grn_ctx *ctx) |
| 282 | { |
| 283 | if (!synonyms) { |
| 284 | synonyms = grn_hash_create(ctx, NULL, |
| 285 | GRN_TABLE_MAX_KEY_SIZE, |
| 286 | MAX_SYNONYM_BYTES, |
| 287 | GRN_OBJ_TABLE_HASH_KEY | GRN_OBJ_KEY_VAR_SIZE); |
| 288 | if (!synonyms) { |
| 289 | return ctx->rc; |
| 290 | } |
| 291 | load_synonyms(ctx); |
| 292 | } |
| 293 | return ctx->rc; |
| 294 | } |
| 295 | |
| 296 | grn_rc |
| 297 | GRN_PLUGIN_REGISTER(grn_ctx *ctx) |
| 298 | { |
| 299 | grn_proc_create(ctx, "QueryExpanderTSV" , strlen("QueryExpanderTSV" ), |
| 300 | GRN_PROC_FUNCTION, |
| 301 | func_query_expander_tsv, NULL, NULL, |
| 302 | 0, NULL); |
| 303 | return GRN_SUCCESS; |
| 304 | } |
| 305 | |
| 306 | grn_rc |
| 307 | GRN_PLUGIN_FIN(grn_ctx *ctx) |
| 308 | { |
| 309 | if (synonyms) { |
| 310 | grn_hash_close(ctx, synonyms); |
| 311 | synonyms = NULL; |
| 312 | } |
| 313 | return GRN_SUCCESS; |
| 314 | } |
| 315 | |