/* -*- c-basic-offset: 2 -*- */
/*
  Copyright(C) 2009-2016 Brazil

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License version 2.1 as published by the Free Software Foundation.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

#include "../grn_proc.h"
#include "../grn_ctx.h"
#include "../grn_token_cursor.h"

#include <groonga/plugin.h>

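/*
 * Parses a "flags" argument such as "NONE" or "ENABLE_TOKENIZED_DELIMITER"
 * into GRN_TOKEN_CURSOR_* bits. Flag names may be separated by '|' or
 * spaces. On an unknown name it reports GRN_INVALID_ARGUMENT and returns 0.
 */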
static unsigned int
parse_tokenize_flags(grn_ctx *ctx, grn_obj *flag_names)
{
  unsigned int flags = 0;
  const char *names, *names_end;
  int length;

  names = GRN_TEXT_VALUE(flag_names);
  length = GRN_TEXT_LEN(flag_names);
  names_end = names + length;
  while (names < names_end) {
    if (*names == '|' || *names == ' ') {
      names += 1;
      continue;
    }

#define CHECK_FLAG(name)\
    if (((names_end - names) >= (sizeof(#name) - 1)) &&\
        (!memcmp(names, #name, sizeof(#name) - 1))) {\
      flags |= GRN_TOKEN_CURSOR_ ## name;\
      names += sizeof(#name) - 1;\
      continue;\
    }

    CHECK_FLAG(ENABLE_TOKENIZED_DELIMITER);

#define GRN_TOKEN_CURSOR_NONE 0
    CHECK_FLAG(NONE);
#undef GRN_TOKEN_CURSOR_NONE

    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[tokenize] invalid flag: <%.*s>",
                     (int)(names_end - names), names);
    return 0;
#undef CHECK_FLAG
  }

  return flags;
}

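/* One tokenized term: the token's ID in the lexicon, its position in the
   input string and the force_prefix flag reported by the token cursor. */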
typedef struct {
  grn_id id;
  int32_t position;
  grn_bool force_prefix;
} tokenize_token;

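/*
 * Outputs the collected tokens as an array of maps with "value", "position"
 * and "force_prefix" entries. When an index column is given, each map also
 * gets "estimated_size", the value read from that column for the token.
 */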
static void
output_tokens(grn_ctx *ctx, grn_obj *tokens, grn_obj *lexicon, grn_obj *index_column)
{
  int i, n_tokens, n_elements;
  grn_obj estimated_size;

  n_tokens = GRN_BULK_VSIZE(tokens) / sizeof(tokenize_token);
  n_elements = 3;
  if (index_column) {
    n_elements++;
    GRN_UINT32_INIT(&estimated_size, 0);
  }

  grn_ctx_output_array_open(ctx, "TOKENS", n_tokens);
  for (i = 0; i < n_tokens; i++) {
    tokenize_token *token;
    char value[GRN_TABLE_MAX_KEY_SIZE];
    unsigned int value_size;

    token = ((tokenize_token *)(GRN_BULK_HEAD(tokens))) + i;

    grn_ctx_output_map_open(ctx, "TOKEN", n_elements);

    grn_ctx_output_cstr(ctx, "value");
    value_size = grn_table_get_key(ctx, lexicon, token->id,
                                   value, GRN_TABLE_MAX_KEY_SIZE);
    grn_ctx_output_str(ctx, value, value_size);

    grn_ctx_output_cstr(ctx, "position");
    grn_ctx_output_int32(ctx, token->position);

    grn_ctx_output_cstr(ctx, "force_prefix");
    grn_ctx_output_bool(ctx, token->force_prefix);

    if (index_column) {
      GRN_BULK_REWIND(&estimated_size);
      grn_obj_get_value(ctx, index_column, token->id, &estimated_size);
      grn_ctx_output_cstr(ctx, "estimated_size");
      grn_ctx_output_int64(ctx, GRN_UINT32_VALUE(&estimated_size));
    }

    grn_ctx_output_map_close(ctx);
  }

  if (index_column) {
    GRN_OBJ_FIN(ctx, &estimated_size);
  }

  grn_ctx_output_array_close(ctx);
}

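/*
 * Creates a temporary ShortText hash table to use as a lexicon for the
 * tokenize command and configures it with the named tokenizer and, when
 * given, the named normalizer and token filters. Returns NULL on error.
 */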
static grn_obj *
create_lexicon_for_tokenize(grn_ctx *ctx,
                            grn_obj *tokenizer_name,
                            grn_obj *normalizer_name,
                            grn_obj *token_filter_names)
{
  grn_obj *lexicon;
  grn_obj *tokenizer;
  grn_obj *normalizer = NULL;

  tokenizer = grn_ctx_get(ctx,
                          GRN_TEXT_VALUE(tokenizer_name),
                          GRN_TEXT_LEN(tokenizer_name));
  if (!tokenizer) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[tokenize] nonexistent tokenizer: <%.*s>",
                     (int)GRN_TEXT_LEN(tokenizer_name),
                     GRN_TEXT_VALUE(tokenizer_name));
    return NULL;
  }

  if (!grn_obj_is_tokenizer_proc(ctx, tokenizer)) {
    grn_obj inspected;
    GRN_TEXT_INIT(&inspected, 0);
    grn_inspect(ctx, &inspected, tokenizer);
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[tokenize] not tokenizer: %.*s",
                     (int)GRN_TEXT_LEN(&inspected),
                     GRN_TEXT_VALUE(&inspected));
    GRN_OBJ_FIN(ctx, &inspected);
    grn_obj_unlink(ctx, tokenizer);
    return NULL;
  }

  if (GRN_TEXT_LEN(normalizer_name) > 0) {
    normalizer = grn_ctx_get(ctx,
                             GRN_TEXT_VALUE(normalizer_name),
                             GRN_TEXT_LEN(normalizer_name));
    if (!normalizer) {
      grn_obj_unlink(ctx, tokenizer);
      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                       "[tokenize] nonexistent normalizer: <%.*s>",
                       (int)GRN_TEXT_LEN(normalizer_name),
                       GRN_TEXT_VALUE(normalizer_name));
      return NULL;
    }

    if (!grn_obj_is_normalizer_proc(ctx, normalizer)) {
      grn_obj inspected;
      grn_obj_unlink(ctx, tokenizer);
      GRN_TEXT_INIT(&inspected, 0);
      grn_inspect(ctx, &inspected, normalizer);
      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                       "[tokenize] not normalizer: %.*s",
                       (int)GRN_TEXT_LEN(&inspected),
                       GRN_TEXT_VALUE(&inspected));
      GRN_OBJ_FIN(ctx, &inspected);
      grn_obj_unlink(ctx, normalizer);
      return NULL;
    }
  }

  lexicon = grn_table_create(ctx, NULL, 0,
                             NULL,
                             GRN_OBJ_TABLE_HASH_KEY,
                             grn_ctx_at(ctx, GRN_DB_SHORT_TEXT),
                             NULL);
  grn_obj_set_info(ctx, lexicon,
                   GRN_INFO_DEFAULT_TOKENIZER, tokenizer);
  grn_obj_unlink(ctx, tokenizer);
  if (normalizer) {
    grn_obj_set_info(ctx, lexicon,
                     GRN_INFO_NORMALIZER, normalizer);
    grn_obj_unlink(ctx, normalizer);
  }
  grn_proc_table_set_token_filters(ctx, lexicon, token_filter_names);

  return lexicon;
}

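/*
 * Runs a token cursor over string against lexicon and appends one
 * tokenize_token per produced token to the tokens bulk. Appends nothing if
 * the cursor cannot be opened.
 */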
static void
tokenize(grn_ctx *ctx, grn_obj *lexicon, grn_obj *string, grn_tokenize_mode mode,
         unsigned int flags, grn_obj *tokens)
{
  grn_token_cursor *token_cursor;

  token_cursor =
    grn_token_cursor_open(ctx, lexicon,
                          GRN_TEXT_VALUE(string), GRN_TEXT_LEN(string),
                          mode, flags);
  if (!token_cursor) {
    return;
  }

  while (token_cursor->status == GRN_TOKEN_CURSOR_DOING) {
    grn_id token_id = grn_token_cursor_next(ctx, token_cursor);
    tokenize_token *current_token;
    if (token_id == GRN_ID_NIL) {
      continue;
    }
    grn_bulk_space(ctx, tokens, sizeof(tokenize_token));
    current_token = ((tokenize_token *)(GRN_BULK_CURR(tokens))) - 1;
    current_token->id = token_id;
    current_token->position = token_cursor->pos;
    current_token->force_prefix = token_cursor->force_prefix;
  }
  grn_token_cursor_close(ctx, token_cursor);
}

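/*
 * table_tokenize command: tokenizes string with the tokenizer, normalizer
 * and token filters of an existing table, e.g.
 *   table_tokenize Terms "Full text search" --mode ADD
 * (Terms here is an illustrative lexicon table name.) The default mode is
 * GET; ADD also registers unseen tokens into the table.
 */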
static grn_obj *
command_table_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_obj *table_name;
  grn_obj *string;
  grn_obj *flag_names;
  grn_obj *mode_name;
  grn_obj *index_column_name;

  table_name = grn_plugin_proc_get_var(ctx, user_data, "table", -1);
  string = grn_plugin_proc_get_var(ctx, user_data, "string", -1);
  flag_names = grn_plugin_proc_get_var(ctx, user_data, "flags", -1);
  mode_name = grn_plugin_proc_get_var(ctx, user_data, "mode", -1);
  index_column_name = grn_plugin_proc_get_var(ctx, user_data, "index_column", -1);

  if (GRN_TEXT_LEN(table_name) == 0) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[table_tokenize] table name is missing");
    return NULL;
  }

  if (GRN_TEXT_LEN(string) == 0) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[table_tokenize] string is missing");
    return NULL;
  }

  {
    unsigned int flags;
    grn_obj *lexicon;
    grn_obj *index_column = NULL;

    flags = parse_tokenize_flags(ctx, flag_names);
    if (ctx->rc != GRN_SUCCESS) {
      return NULL;
    }

    lexicon = grn_ctx_get(ctx, GRN_TEXT_VALUE(table_name), GRN_TEXT_LEN(table_name));
    if (!lexicon) {
      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                       "[table_tokenize] nonexistent lexicon: <%.*s>",
                       (int)GRN_TEXT_LEN(table_name),
                       GRN_TEXT_VALUE(table_name));
      return NULL;
    }

#define MODE_NAME_EQUAL(name)\
    (GRN_TEXT_LEN(mode_name) == strlen(name) &&\
     memcmp(GRN_TEXT_VALUE(mode_name), name, strlen(name)) == 0)

    if (GRN_TEXT_LEN(index_column_name) > 0) {
      index_column = grn_obj_column(ctx, lexicon,
                                    GRN_TEXT_VALUE(index_column_name),
                                    GRN_TEXT_LEN(index_column_name));
      if (!index_column) {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "[table_tokenize] nonexistent index column: <%.*s>",
                         (int)GRN_TEXT_LEN(index_column_name),
                         GRN_TEXT_VALUE(index_column_name));
        goto exit;
      }
      if (index_column->header.type != GRN_COLUMN_INDEX) {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "[table_tokenize] index column must be COLUMN_INDEX: <%.*s>",
                         (int)GRN_TEXT_LEN(index_column_name),
                         GRN_TEXT_VALUE(index_column_name));
        goto exit;
      }
    }

    {
      grn_obj tokens;
      GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
      if (GRN_TEXT_LEN(mode_name) == 0 || MODE_NAME_EQUAL("GET")) {
        tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
        output_tokens(ctx, &tokens, lexicon, index_column);
      } else if (MODE_NAME_EQUAL("ADD")) {
        tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
        output_tokens(ctx, &tokens, lexicon, index_column);
      } else {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "[table_tokenize] invalid mode: <%.*s>",
                         (int)GRN_TEXT_LEN(mode_name), GRN_TEXT_VALUE(mode_name));
      }
      GRN_OBJ_FIN(ctx, &tokens);
    }
#undef MODE_NAME_EQUAL

exit:
    grn_obj_unlink(ctx, lexicon);
    if (index_column) {
      grn_obj_unlink(ctx, index_column);
    }
  }

  return NULL;
}

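/* Registers the table_tokenize command and its five arguments. */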
void
grn_proc_init_table_tokenize(grn_ctx *ctx)
{
  grn_expr_var vars[5];

  grn_plugin_expr_var_init(ctx, &(vars[0]), "table", -1);
  grn_plugin_expr_var_init(ctx, &(vars[1]), "string", -1);
  grn_plugin_expr_var_init(ctx, &(vars[2]), "flags", -1);
  grn_plugin_expr_var_init(ctx, &(vars[3]), "mode", -1);
  grn_plugin_expr_var_init(ctx, &(vars[4]), "index_column", -1);
  grn_plugin_command_create(ctx,
                            "table_tokenize", -1,
                            command_table_tokenize,
                            5,
                            vars);
}

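/*
 * tokenize command: tokenizes string with a temporary lexicon built from the
 * given tokenizer, normalizer and token filters, e.g.
 *   tokenize TokenBigram "Full text search" NormalizerAuto
 * The default mode is ADD.
 */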
static grn_obj *
command_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_obj *tokenizer_name;
  grn_obj *string;
  grn_obj *normalizer_name;
  grn_obj *flag_names;
  grn_obj *mode_name;
  grn_obj *token_filter_names;

  tokenizer_name = grn_plugin_proc_get_var(ctx, user_data, "tokenizer", -1);
  string = grn_plugin_proc_get_var(ctx, user_data, "string", -1);
  normalizer_name = grn_plugin_proc_get_var(ctx, user_data, "normalizer", -1);
  flag_names = grn_plugin_proc_get_var(ctx, user_data, "flags", -1);
  mode_name = grn_plugin_proc_get_var(ctx, user_data, "mode", -1);
  token_filter_names = grn_plugin_proc_get_var(ctx, user_data, "token_filters", -1);

  if (GRN_TEXT_LEN(tokenizer_name) == 0) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[tokenize] tokenizer name is missing");
    return NULL;
  }

  if (GRN_TEXT_LEN(string) == 0) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[tokenize] string is missing");
    return NULL;
  }

  {
    unsigned int flags;
    grn_obj *lexicon;

    flags = parse_tokenize_flags(ctx, flag_names);
    if (ctx->rc != GRN_SUCCESS) {
      return NULL;
    }

    lexicon = create_lexicon_for_tokenize(ctx,
                                          tokenizer_name,
                                          normalizer_name,
                                          token_filter_names);
    if (!lexicon) {
      return NULL;
    }
#define MODE_NAME_EQUAL(name)\
    (GRN_TEXT_LEN(mode_name) == strlen(name) &&\
     memcmp(GRN_TEXT_VALUE(mode_name), name, strlen(name)) == 0)

    {
      grn_obj tokens;
      GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
      if (GRN_TEXT_LEN(mode_name) == 0 || MODE_NAME_EQUAL("ADD")) {
        tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
        output_tokens(ctx, &tokens, lexicon, NULL);
      } else if (MODE_NAME_EQUAL("GET")) {
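        /* The lexicon was just created, so a plain GET pass would find no
           registered tokens: run an ADD pass first, discard its result, then
           tokenize again in GET mode for the reported output. */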
        tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
        GRN_BULK_REWIND(&tokens);
        tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
        output_tokens(ctx, &tokens, lexicon, NULL);
      } else {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "[tokenize] invalid mode: <%.*s>",
                         (int)GRN_TEXT_LEN(mode_name), GRN_TEXT_VALUE(mode_name));
      }
      GRN_OBJ_FIN(ctx, &tokens);
    }
#undef MODE_NAME_EQUAL

    grn_obj_unlink(ctx, lexicon);
  }

  return NULL;
}

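/* Registers the tokenize command and its six arguments. */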
void
grn_proc_init_tokenize(grn_ctx *ctx)
{
  grn_expr_var vars[6];

  grn_plugin_expr_var_init(ctx, &(vars[0]), "tokenizer", -1);
  grn_plugin_expr_var_init(ctx, &(vars[1]), "string", -1);
  grn_plugin_expr_var_init(ctx, &(vars[2]), "normalizer", -1);
  grn_plugin_expr_var_init(ctx, &(vars[3]), "flags", -1);
  grn_plugin_expr_var_init(ctx, &(vars[4]), "mode", -1);
  grn_plugin_expr_var_init(ctx, &(vars[5]), "token_filters", -1);
  grn_plugin_command_create(ctx,
                            "tokenize", -1,
                            command_tokenize,
                            6,
                            vars);
}