/* -*- c-basic-offset: 2 -*- */
/*
  Copyright(C) 2009-2016 Brazil

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License version 2.1 as published by the Free Software Foundation.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

#include "../grn_proc.h"
#include "../grn_ctx.h"
#include "../grn_token_cursor.h"

#include <groonga/plugin.h>

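/*
 * Parses "flag_names" as a "|"- or space-separated list of token
 * cursor flag names ("NONE" or "ENABLE_TOKENIZED_DELIMITER") and
 * returns the corresponding GRN_TOKEN_CURSOR_* bits. An unknown name
 * reports GRN_INVALID_ARGUMENT and returns 0.
 */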
static unsigned int
parse_tokenize_flags(grn_ctx *ctx, grn_obj *flag_names)
{
  unsigned int flags = 0;
  const char *names, *names_end;
  int length;

  names = GRN_TEXT_VALUE(flag_names);
  length = GRN_TEXT_LEN(flag_names);
  names_end = names + length;
  while (names < names_end) {
    if (*names == '|' || *names == ' ') {
      names += 1;
      continue;
    }

#define CHECK_FLAG(name)\
    if (((names_end - names) >= (sizeof(#name) - 1)) &&\
        (!memcmp(names, #name, sizeof(#name) - 1))) {\
      flags |= GRN_TOKEN_CURSOR_ ## name;\
      names += sizeof(#name) - 1;\
      continue;\
    }

    CHECK_FLAG(ENABLE_TOKENIZED_DELIMITER);

    /* "NONE" is accepted as a no-op flag name: define a local
       GRN_TOKEN_CURSOR_NONE as 0 so that CHECK_FLAG() can paste it. */
#define GRN_TOKEN_CURSOR_NONE 0
    CHECK_FLAG(NONE);
#undef GRN_TOKEN_CURSOR_NONE

    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[tokenize] invalid flag: <%.*s>",
                     (int)(names_end - names), names);
    return 0;
#undef CHECK_FLAG
  }

  return flags;
}

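/* One produced token: its ID in the lexicon, its position in the
   token sequence, and the token cursor's force_prefix state. */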
typedef struct {
  grn_id id;
  int32_t position;
  grn_bool force_prefix;
} tokenize_token;

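/*
 * Writes "tokens" as an array of maps. Each map holds the token's key
 * ("value"), its "position" and its "force_prefix" flag; when
 * index_column is given, the column's value for the token (the
 * estimated number of matching records) is added as "estimated_size".
 */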
static void
output_tokens(grn_ctx *ctx, grn_obj *tokens, grn_obj *lexicon, grn_obj *index_column)
{
  int i, n_tokens, n_elements;
  grn_obj estimated_size;

  n_tokens = GRN_BULK_VSIZE(tokens) / sizeof(tokenize_token);
  n_elements = 3;
  if (index_column) {
    n_elements++;
    GRN_UINT32_INIT(&estimated_size, 0);
  }

  grn_ctx_output_array_open(ctx, "TOKENS", n_tokens);
  for (i = 0; i < n_tokens; i++) {
    tokenize_token *token;
    char value[GRN_TABLE_MAX_KEY_SIZE];
    unsigned int value_size;

    token = ((tokenize_token *)(GRN_BULK_HEAD(tokens))) + i;

    grn_ctx_output_map_open(ctx, "TOKEN", n_elements);

    grn_ctx_output_cstr(ctx, "value");
    value_size = grn_table_get_key(ctx, lexicon, token->id,
                                   value, GRN_TABLE_MAX_KEY_SIZE);
    grn_ctx_output_str(ctx, value, value_size);

    grn_ctx_output_cstr(ctx, "position");
    grn_ctx_output_int32(ctx, token->position);

    grn_ctx_output_cstr(ctx, "force_prefix");
    grn_ctx_output_bool(ctx, token->force_prefix);

    if (index_column) {
      GRN_BULK_REWIND(&estimated_size);
      grn_obj_get_value(ctx, index_column, token->id, &estimated_size);
      grn_ctx_output_cstr(ctx, "estimated_size");
      grn_ctx_output_int64(ctx, GRN_UINT32_VALUE(&estimated_size));
    }

    grn_ctx_output_map_close(ctx);
  }

  if (index_column) {
    GRN_OBJ_FIN(ctx, &estimated_size);
  }

  grn_ctx_output_array_close(ctx);
}

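/*
 * Creates a temporary GRN_OBJ_TABLE_HASH_KEY table keyed by ShortText
 * that uses the named tokenizer, optional normalizer and token
 * filters. Returns NULL with GRN_INVALID_ARGUMENT when a named object
 * does not exist or is not of the expected type.
 */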
static grn_obj *
create_lexicon_for_tokenize(grn_ctx *ctx,
                            grn_obj *tokenizer_name,
                            grn_obj *normalizer_name,
                            grn_obj *token_filter_names)
{
  grn_obj *lexicon;
  grn_obj *tokenizer;
  grn_obj *normalizer = NULL;

  tokenizer = grn_ctx_get(ctx,
                          GRN_TEXT_VALUE(tokenizer_name),
                          GRN_TEXT_LEN(tokenizer_name));
  if (!tokenizer) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[tokenize] nonexistent tokenizer: <%.*s>",
                     (int)GRN_TEXT_LEN(tokenizer_name),
                     GRN_TEXT_VALUE(tokenizer_name));
    return NULL;
  }

  if (!grn_obj_is_tokenizer_proc(ctx, tokenizer)) {
    grn_obj inspected;
    GRN_TEXT_INIT(&inspected, 0);
    grn_inspect(ctx, &inspected, tokenizer);
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[tokenize] not tokenizer: %.*s",
                     (int)GRN_TEXT_LEN(&inspected),
                     GRN_TEXT_VALUE(&inspected));
    GRN_OBJ_FIN(ctx, &inspected);
    grn_obj_unlink(ctx, tokenizer);
    return NULL;
  }

  if (GRN_TEXT_LEN(normalizer_name) > 0) {
    normalizer = grn_ctx_get(ctx,
                             GRN_TEXT_VALUE(normalizer_name),
                             GRN_TEXT_LEN(normalizer_name));
    if (!normalizer) {
      grn_obj_unlink(ctx, tokenizer);
      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                       "[tokenize] nonexistent normalizer: <%.*s>",
                       (int)GRN_TEXT_LEN(normalizer_name),
                       GRN_TEXT_VALUE(normalizer_name));
      return NULL;
    }

    if (!grn_obj_is_normalizer_proc(ctx, normalizer)) {
      grn_obj inspected;
      grn_obj_unlink(ctx, tokenizer);
      GRN_TEXT_INIT(&inspected, 0);
      grn_inspect(ctx, &inspected, normalizer);
      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                       "[tokenize] not normalizer: %.*s",
                       (int)GRN_TEXT_LEN(&inspected),
                       GRN_TEXT_VALUE(&inspected));
      GRN_OBJ_FIN(ctx, &inspected);
      grn_obj_unlink(ctx, normalizer);
      return NULL;
    }
  }

  lexicon = grn_table_create(ctx, NULL, 0,
                             NULL,
                             GRN_OBJ_TABLE_HASH_KEY,
                             grn_ctx_at(ctx, GRN_DB_SHORT_TEXT),
                             NULL);
  grn_obj_set_info(ctx, lexicon,
                   GRN_INFO_DEFAULT_TOKENIZER, tokenizer);
  grn_obj_unlink(ctx, tokenizer);
  if (normalizer) {
    grn_obj_set_info(ctx, lexicon,
                     GRN_INFO_NORMALIZER, normalizer);
    grn_obj_unlink(ctx, normalizer);
  }
  grn_proc_table_set_token_filters(ctx, lexicon, token_filter_names);

  return lexicon;
}

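/*
 * Tokenizes "string" against "lexicon" and appends one tokenize_token
 * per produced token to the "tokens" bulk. "mode" selects index-time
 * (GRN_TOKEN_ADD) or search-time (GRN_TOKEN_GET) tokenization.
 */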
static void
tokenize(grn_ctx *ctx, grn_obj *lexicon, grn_obj *string, grn_tokenize_mode mode,
         unsigned int flags, grn_obj *tokens)
{
  grn_token_cursor *token_cursor;

  token_cursor =
    grn_token_cursor_open(ctx, lexicon,
                          GRN_TEXT_VALUE(string), GRN_TEXT_LEN(string),
                          mode, flags);
  if (!token_cursor) {
    return;
  }

  while (token_cursor->status == GRN_TOKEN_CURSOR_DOING) {
    grn_id token_id = grn_token_cursor_next(ctx, token_cursor);
    tokenize_token *current_token;
    if (token_id == GRN_ID_NIL) {
      continue;
    }
    /* grn_bulk_space() grows the bulk; the newly reserved slot is the
       one just before GRN_BULK_CURR(). */
    grn_bulk_space(ctx, tokens, sizeof(tokenize_token));
    current_token = ((tokenize_token *)(GRN_BULK_CURR(tokens))) - 1;
    current_token->id = token_id;
    current_token->position = token_cursor->pos;
    current_token->force_prefix = token_cursor->force_prefix;
  }
  grn_token_cursor_close(ctx, token_cursor);
}

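/*
 * table_tokenize command: tokenizes "string" with the tokenizer,
 * normalizer and token filters of the existing table "table". "mode"
 * is "GET" (the default) or "ADD"; "ADD" also registers new tokens in
 * the table. An optional "index_column" on the table adds
 * "estimated_size" to each output token.
 */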
static grn_obj *
command_table_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_obj *table_name;
  grn_obj *string;
  grn_obj *flag_names;
  grn_obj *mode_name;
  grn_obj *index_column_name;

  table_name = grn_plugin_proc_get_var(ctx, user_data, "table", -1);
  string = grn_plugin_proc_get_var(ctx, user_data, "string", -1);
  flag_names = grn_plugin_proc_get_var(ctx, user_data, "flags", -1);
  mode_name = grn_plugin_proc_get_var(ctx, user_data, "mode", -1);
  index_column_name = grn_plugin_proc_get_var(ctx, user_data, "index_column", -1);

  if (GRN_TEXT_LEN(table_name) == 0) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[table_tokenize] table name is missing");
    return NULL;
  }

  if (GRN_TEXT_LEN(string) == 0) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[table_tokenize] string is missing");
    return NULL;
  }

  {
    unsigned int flags;
    grn_obj *lexicon;
    grn_obj *index_column = NULL;

    flags = parse_tokenize_flags(ctx, flag_names);
    if (ctx->rc != GRN_SUCCESS) {
      return NULL;
    }

    lexicon = grn_ctx_get(ctx, GRN_TEXT_VALUE(table_name), GRN_TEXT_LEN(table_name));
    if (!lexicon) {
      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                       "[table_tokenize] nonexistent lexicon: <%.*s>",
                       (int)GRN_TEXT_LEN(table_name),
                       GRN_TEXT_VALUE(table_name));
      return NULL;
    }

#define MODE_NAME_EQUAL(name)\
    (GRN_TEXT_LEN(mode_name) == strlen(name) &&\
     memcmp(GRN_TEXT_VALUE(mode_name), name, strlen(name)) == 0)

    if (GRN_TEXT_LEN(index_column_name) > 0) {
      index_column = grn_obj_column(ctx, lexicon,
                                    GRN_TEXT_VALUE(index_column_name),
                                    GRN_TEXT_LEN(index_column_name));
      if (!index_column) {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "[table_tokenize] nonexistent index column: <%.*s>",
                         (int)GRN_TEXT_LEN(index_column_name),
                         GRN_TEXT_VALUE(index_column_name));
        goto exit;
      }
      if (index_column->header.type != GRN_COLUMN_INDEX) {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "[table_tokenize] index column must be COLUMN_INDEX: <%.*s>",
                         (int)GRN_TEXT_LEN(index_column_name),
                         GRN_TEXT_VALUE(index_column_name));
        goto exit;
      }
    }

    {
      grn_obj tokens;
      GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
      if (GRN_TEXT_LEN(mode_name) == 0 || MODE_NAME_EQUAL("GET")) {
        tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
        output_tokens(ctx, &tokens, lexicon, index_column);
      } else if (MODE_NAME_EQUAL("ADD")) {
        tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
        output_tokens(ctx, &tokens, lexicon, index_column);
      } else {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "[table_tokenize] invalid mode: <%.*s>",
                         (int)GRN_TEXT_LEN(mode_name), GRN_TEXT_VALUE(mode_name));
      }
      GRN_OBJ_FIN(ctx, &tokens);
    }
#undef MODE_NAME_EQUAL

exit:
    grn_obj_unlink(ctx, lexicon);
    if (index_column) {
      grn_obj_unlink(ctx, index_column);
    }
  }

  return NULL;
}

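/* table_tokenize table string [flags [mode [index_column]]] */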
void
grn_proc_init_table_tokenize(grn_ctx *ctx)
{
  grn_expr_var vars[5];

  grn_plugin_expr_var_init(ctx, &(vars[0]), "table", -1);
  grn_plugin_expr_var_init(ctx, &(vars[1]), "string", -1);
  grn_plugin_expr_var_init(ctx, &(vars[2]), "flags", -1);
  grn_plugin_expr_var_init(ctx, &(vars[3]), "mode", -1);
  grn_plugin_expr_var_init(ctx, &(vars[4]), "index_column", -1);
  grn_plugin_command_create(ctx,
                            "table_tokenize", -1,
                            command_table_tokenize,
                            5,
                            vars);
}

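/*
 * tokenize command: tokenizes "string" with a temporary lexicon built
 * from "tokenizer", optional "normalizer" and "token_filters". "mode"
 * is "ADD" (the default) or "GET".
 */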
static grn_obj *
command_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_obj *tokenizer_name;
  grn_obj *string;
  grn_obj *normalizer_name;
  grn_obj *flag_names;
  grn_obj *mode_name;
  grn_obj *token_filter_names;

  tokenizer_name = grn_plugin_proc_get_var(ctx, user_data, "tokenizer", -1);
  string = grn_plugin_proc_get_var(ctx, user_data, "string", -1);
  normalizer_name = grn_plugin_proc_get_var(ctx, user_data, "normalizer", -1);
  flag_names = grn_plugin_proc_get_var(ctx, user_data, "flags", -1);
  mode_name = grn_plugin_proc_get_var(ctx, user_data, "mode", -1);
  token_filter_names = grn_plugin_proc_get_var(ctx, user_data, "token_filters", -1);

  if (GRN_TEXT_LEN(tokenizer_name) == 0) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[tokenize] tokenizer name is missing");
    return NULL;
  }

  if (GRN_TEXT_LEN(string) == 0) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[tokenize] string is missing");
    return NULL;
  }

  {
    unsigned int flags;
    grn_obj *lexicon;

    flags = parse_tokenize_flags(ctx, flag_names);
    if (ctx->rc != GRN_SUCCESS) {
      return NULL;
    }

    lexicon = create_lexicon_for_tokenize(ctx,
                                          tokenizer_name,
                                          normalizer_name,
                                          token_filter_names);
    if (!lexicon) {
      return NULL;
    }

#define MODE_NAME_EQUAL(name)\
    (GRN_TEXT_LEN(mode_name) == strlen(name) &&\
     memcmp(GRN_TEXT_VALUE(mode_name), name, strlen(name)) == 0)

    {
      grn_obj tokens;
      GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
      if (GRN_TEXT_LEN(mode_name) == 0 || MODE_NAME_EQUAL("ADD")) {
        tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
        output_tokens(ctx, &tokens, lexicon, NULL);
      } else if (MODE_NAME_EQUAL("GET")) {
        /* The temporary lexicon is empty, so run an ADD pass first to
           register the tokens, then rewind and tokenize again in GET
           mode so the output reflects search-time tokenization against
           a populated lexicon. */
        tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
        GRN_BULK_REWIND(&tokens);
        tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
        output_tokens(ctx, &tokens, lexicon, NULL);
      } else {
        GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                         "[tokenize] invalid mode: <%.*s>",
                         (int)GRN_TEXT_LEN(mode_name), GRN_TEXT_VALUE(mode_name));
      }
      GRN_OBJ_FIN(ctx, &tokens);
    }
#undef MODE_NAME_EQUAL

    grn_obj_unlink(ctx, lexicon);
  }

  return NULL;
}

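/* tokenize tokenizer string [normalizer [flags [mode [token_filters]]]] */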
void
grn_proc_init_tokenize(grn_ctx *ctx)
{
  grn_expr_var vars[6];

  grn_plugin_expr_var_init(ctx, &(vars[0]), "tokenizer", -1);
  grn_plugin_expr_var_init(ctx, &(vars[1]), "string", -1);
  grn_plugin_expr_var_init(ctx, &(vars[2]), "normalizer", -1);
  grn_plugin_expr_var_init(ctx, &(vars[3]), "flags", -1);
  grn_plugin_expr_var_init(ctx, &(vars[4]), "mode", -1);
  grn_plugin_expr_var_init(ctx, &(vars[5]), "token_filters", -1);
  grn_plugin_command_create(ctx,
                            "tokenize", -1,
                            command_tokenize,
                            6,
                            vars);
}