1 | /* -*- c-basic-offset: 2 -*- */ |
2 | /* |
3 | Copyright(C) 2009-2017 Brazil |
4 | |
5 | This library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License version 2.1 as published by the Free Software Foundation. |
8 | |
9 | This library is distributed in the hope that it will be useful, |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | Lesser General Public License for more details. |
13 | |
14 | You should have received a copy of the GNU Lesser General Public |
15 | License along with this library; if not, write to the Free Software |
16 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
17 | */ |
18 | #include <string.h> |
19 | #include "grn_token_cursor.h" |
20 | #include "grn_string.h" |
21 | #include "grn_plugin.h" |
22 | #include <groonga/tokenizer.h> |
23 | |
/* Tokenizer proc for uvector columns; initialized in grn_tokenizers_init(). */
grn_obj *grn_tokenizer_uvector = NULL;
25 | |
/* State for the uvector tokenizer: walks the raw bytes of the input in
   fixed-size steps, emitting one element per token. */
typedef struct {
  grn_tokenizer_token token; /* reusable token pushed by uvector_next() */
  byte *curr;                /* current read position in the input bytes */
  byte *tail;                /* one past the end of the input bytes */
  uint32_t unit;             /* element size in bytes (sizeof(grn_id)) */
} grn_uvector_tokenizer;
32 | |
33 | static grn_obj * |
34 | uvector_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) |
35 | { |
36 | grn_obj *str, *flags, *mode; |
37 | grn_uvector_tokenizer *tokenizer; |
38 | if (!(flags = grn_ctx_pop(ctx))) { |
39 | ERR(GRN_INVALID_ARGUMENT, "[tokenizer][uvector] missing argument: flags" ); |
40 | return NULL; |
41 | } |
42 | if (!(str = grn_ctx_pop(ctx))) { |
43 | ERR(GRN_INVALID_ARGUMENT, "[tokenizer][uvector] missing argument: string" ); |
44 | return NULL; |
45 | } |
46 | if (!(mode = grn_ctx_pop(ctx))) { |
47 | ERR(GRN_INVALID_ARGUMENT, "[tokenizer][uvector] missing argument: mode" ); |
48 | return NULL; |
49 | } |
50 | if (!(tokenizer = GRN_MALLOC(sizeof(grn_uvector_tokenizer)))) { |
51 | ERR(GRN_NO_MEMORY_AVAILABLE, |
52 | "[tokenizer][uvector] " |
53 | "memory allocation to grn_uvector_tokenizer failed" ); |
54 | return NULL; |
55 | } |
56 | user_data->ptr = tokenizer; |
57 | |
58 | grn_tokenizer_token_init(ctx, &(tokenizer->token)); |
59 | tokenizer->curr = (byte *)GRN_TEXT_VALUE(str); |
60 | tokenizer->tail = tokenizer->curr + GRN_TEXT_LEN(str); |
61 | tokenizer->unit = sizeof(grn_id); |
62 | return NULL; |
63 | } |
64 | |
65 | static grn_obj * |
66 | uvector_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) |
67 | { |
68 | grn_uvector_tokenizer *tokenizer = user_data->ptr; |
69 | byte *p = tokenizer->curr + tokenizer->unit; |
70 | if (tokenizer->tail < p) { |
71 | grn_tokenizer_token_push(ctx, &(tokenizer->token), |
72 | (const char *)tokenizer->curr, 0, |
73 | GRN_TOKEN_LAST); |
74 | } else { |
75 | grn_token_status status; |
76 | if (tokenizer->tail == p) { |
77 | status = GRN_TOKEN_LAST; |
78 | } else { |
79 | status = GRN_TOKEN_CONTINUE; |
80 | } |
81 | grn_tokenizer_token_push(ctx, &(tokenizer->token), |
82 | (const char *)tokenizer->curr, tokenizer->unit, |
83 | status); |
84 | tokenizer->curr = p; |
85 | } |
86 | return NULL; |
87 | } |
88 | |
89 | static grn_obj * |
90 | uvector_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) |
91 | { |
92 | grn_uvector_tokenizer *tokenizer = user_data->ptr; |
93 | if (!tokenizer) { |
94 | return NULL; |
95 | } |
96 | grn_tokenizer_token_fin(ctx, &(tokenizer->token)); |
97 | GRN_FREE(tokenizer); |
98 | return NULL; |
99 | } |
100 | |
/* State for delimiter based tokenizers (TokenDelimit, TokenDelimitNull). */
typedef struct {
  const uint8_t *delimiter;         /* byte sequence separating tokens */
  uint32_t delimiter_len;           /* length of `delimiter` in bytes */
  const unsigned char *next;        /* cursor into the normalized text */
  const unsigned char *end;         /* one past the end of the normalized text */
  grn_tokenizer_token token;        /* reusable token pushed by delimited_next() */
  grn_tokenizer_query *query;       /* open query; closed in delimited_fin() */
  grn_bool have_tokenized_delimiter; /* input contains tokenized-delimiter marks */
} grn_delimited_tokenizer;
110 | |
111 | static grn_obj * |
112 | delimited_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data, |
113 | const uint8_t *delimiter, uint32_t delimiter_len) |
114 | { |
115 | grn_tokenizer_query *query; |
116 | unsigned int normalize_flags = 0; |
117 | const char *normalized; |
118 | unsigned int normalized_length_in_bytes; |
119 | grn_delimited_tokenizer *tokenizer; |
120 | |
121 | query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags); |
122 | if (!query) { |
123 | return NULL; |
124 | } |
125 | |
126 | if (!(tokenizer = GRN_MALLOC(sizeof(grn_delimited_tokenizer)))) { |
127 | ERR(GRN_NO_MEMORY_AVAILABLE, |
128 | "[tokenizer][delimit] " |
129 | "memory allocation to grn_delimited_tokenizer failed" ); |
130 | grn_tokenizer_query_close(ctx, query); |
131 | return NULL; |
132 | } |
133 | user_data->ptr = tokenizer; |
134 | |
135 | tokenizer->query = query; |
136 | |
137 | tokenizer->have_tokenized_delimiter = |
138 | grn_tokenizer_have_tokenized_delimiter(ctx, |
139 | tokenizer->query->ptr, |
140 | tokenizer->query->length, |
141 | tokenizer->query->encoding); |
142 | tokenizer->delimiter = delimiter; |
143 | tokenizer->delimiter_len = delimiter_len; |
144 | grn_string_get_normalized(ctx, tokenizer->query->normalized_query, |
145 | &normalized, &normalized_length_in_bytes, |
146 | NULL); |
147 | tokenizer->next = (const unsigned char *)normalized; |
148 | tokenizer->end = tokenizer->next + normalized_length_in_bytes; |
149 | |
150 | grn_tokenizer_token_init(ctx, &(tokenizer->token)); |
151 | |
152 | return NULL; |
153 | } |
154 | |
/*
 * Emits the next delimiter separated token.
 *
 * When the query carries tokenized-delimiter marks, extraction is
 * delegated to grn_tokenizer_tokenized_delimiter_next().  Otherwise the
 * normalized text is scanned one character at a time; the first run of
 * one or more delimiter sequences ends the current token, and the
 * cursor is placed after the whole run so the next token starts past it.
 */
static grn_obj *
delimited_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_delimited_tokenizer *tokenizer = user_data->ptr;

  if (tokenizer->have_tokenized_delimiter) {
    unsigned int rest_length;
    rest_length = tokenizer->end - tokenizer->next;
    tokenizer->next =
      (unsigned char *)grn_tokenizer_tokenized_delimiter_next(
        ctx,
        &(tokenizer->token),
        (const char *)tokenizer->next,
        rest_length,
        tokenizer->query->encoding);
  } else {
    size_t cl;
    const unsigned char *p = tokenizer->next, *r;
    const unsigned char *e = tokenizer->end;
    grn_token_status status;
    for (r = p; r < e; r += cl) {
      /* Invalid byte sequence: end the token here and move the cursor
         to the end of input so the next call emits the last token. */
      if (!(cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
        tokenizer->next = (unsigned char *)e;
        break;
      }
      {
        grn_bool found_delimiter = GRN_FALSE;
        const unsigned char *current_end = r;
        /* Consume every consecutive delimiter sequence starting at r;
           the cursor ends up after the whole run. */
        while (current_end + tokenizer->delimiter_len <= e &&
               !memcmp(current_end,
                       tokenizer->delimiter, tokenizer->delimiter_len)) {
          current_end += tokenizer->delimiter_len;
          tokenizer->next = current_end;
          found_delimiter = GRN_TRUE;
        }
        if (found_delimiter) {
          break;
        }
      }
    }
    /* r only reaches e when no delimiter ended the scan: final token. */
    if (r == e) {
      status = GRN_TOKEN_LAST;
    } else {
      status = GRN_TOKEN_CONTINUE;
    }
    grn_tokenizer_token_push(ctx,
                             &(tokenizer->token),
                             (const char *)p,
                             r - p,
                             status);
  }

  return NULL;
}
210 | |
211 | static grn_obj * |
212 | delimited_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) |
213 | { |
214 | grn_delimited_tokenizer *tokenizer = user_data->ptr; |
215 | if (!tokenizer) { |
216 | return NULL; |
217 | } |
218 | grn_tokenizer_query_close(ctx, tokenizer->query); |
219 | grn_tokenizer_token_fin(ctx, &(tokenizer->token)); |
220 | GRN_FREE(tokenizer); |
221 | return NULL; |
222 | } |
223 | |
224 | static grn_obj * |
225 | delimit_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) |
226 | { |
227 | static const uint8_t delimiter[1] = {' '}; |
228 | return delimited_init(ctx, nargs, args, user_data, delimiter, 1); |
229 | } |
230 | |
231 | static grn_obj * |
232 | delimit_null_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) |
233 | { |
234 | static const uint8_t delimiter[1] = {'\0'}; |
235 | return delimited_init(ctx, nargs, args, user_data, delimiter, 1); |
236 | } |
237 | |
238 | /* ngram tokenizer */ |
239 | |
/* Set from GRN_NGRAM_TOKENIZER_REMOVE_BLANK_DISABLE in
   grn_db_init_builtin_tokenizers(); when true, ngram_init() drops the
   GRN_STRING_REMOVE_BLANK normalization flag. */
static grn_bool grn_ngram_tokenizer_remove_blank_disable = GRN_FALSE;
241 | |
/* State for the ngram family of tokenizers. */
typedef struct {
  grn_tokenizer_token token;   /* reusable token pushed by ngram_next() */
  grn_tokenizer_query *query;  /* open query; closed in ngram_fin() */
  uint8_t uni_alpha;           /* emit alphabetic runs as whole tokens */
  uint8_t uni_digit;           /* emit digit runs as whole tokens */
  uint8_t uni_symbol;          /* emit symbol runs as whole tokens */
  uint8_t ngram_unit;          /* max token length in characters */
  uint8_t ignore_blank;        /* don't break tokens at blanks */
  uint8_t overlap;             /* last emitted token overlaps the next one */
  int32_t pos;                 /* character position of the current token */
  uint32_t skip;               /* characters to advance before the next token */
  const unsigned char *next;   /* byte cursor into the normalized text */
  const unsigned char *end;    /* one past the end of the normalized text */
  const uint_least8_t *ctypes; /* per-character type array (may be NULL) */
  uint32_t len;                /* set in init to total length, then per-token
                                  character count in ngram_next() */
  uint32_t tail;               /* character position of the token's last char */
} grn_ngram_tokenizer;
259 | |
260 | static grn_obj * |
261 | ngram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data, uint8_t ngram_unit, |
262 | uint8_t uni_alpha, uint8_t uni_digit, uint8_t uni_symbol, uint8_t ignore_blank) |
263 | { |
264 | unsigned int normalize_flags = |
265 | GRN_STRING_REMOVE_BLANK | |
266 | GRN_STRING_WITH_TYPES | |
267 | GRN_STRING_REMOVE_TOKENIZED_DELIMITER; |
268 | grn_tokenizer_query *query; |
269 | const char *normalized; |
270 | unsigned int normalized_length_in_bytes; |
271 | grn_ngram_tokenizer *tokenizer; |
272 | |
273 | if (grn_ngram_tokenizer_remove_blank_disable) { |
274 | normalize_flags &= ~GRN_STRING_REMOVE_BLANK; |
275 | } |
276 | query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags); |
277 | if (!query) { |
278 | return NULL; |
279 | } |
280 | |
281 | if (!(tokenizer = GRN_MALLOC(sizeof(grn_ngram_tokenizer)))) { |
282 | grn_tokenizer_query_close(ctx, query); |
283 | ERR(GRN_NO_MEMORY_AVAILABLE, |
284 | "[tokenizer][ngram] " |
285 | "memory allocation to grn_ngram_tokenizer failed" ); |
286 | return NULL; |
287 | } |
288 | user_data->ptr = tokenizer; |
289 | |
290 | grn_tokenizer_token_init(ctx, &(tokenizer->token)); |
291 | tokenizer->query = query; |
292 | |
293 | tokenizer->uni_alpha = uni_alpha; |
294 | tokenizer->uni_digit = uni_digit; |
295 | tokenizer->uni_symbol = uni_symbol; |
296 | tokenizer->ngram_unit = ngram_unit; |
297 | tokenizer->ignore_blank = ignore_blank; |
298 | tokenizer->overlap = 0; |
299 | tokenizer->pos = 0; |
300 | tokenizer->skip = 0; |
301 | |
302 | grn_string_get_normalized(ctx, tokenizer->query->normalized_query, |
303 | &normalized, &normalized_length_in_bytes, |
304 | &(tokenizer->len)); |
305 | tokenizer->next = (const unsigned char *)normalized; |
306 | tokenizer->end = tokenizer->next + normalized_length_in_bytes; |
307 | tokenizer->ctypes = |
308 | grn_string_get_types(ctx, tokenizer->query->normalized_query); |
309 | return NULL; |
310 | } |
311 | |
/* Thin entry points binding concrete parameters to ngram_init().
   Parameter order after user_data:
   ngram_unit, uni_alpha, uni_digit, uni_symbol, ignore_blank. */

/* TokenUnigram */
static grn_obj *
unigram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{ return ngram_init(ctx, nargs, args, user_data, 1, 1, 1, 1, 0); }

/* TokenBigram */
static grn_obj *
bigram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{ return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 1, 0); }

/* TokenTrigram */
static grn_obj *
trigram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{ return ngram_init(ctx, nargs, args, user_data, 3, 1, 1, 1, 0); }

/* TokenBigramSplitSymbol */
static grn_obj *
bigrams_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{ return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 0, 0); }

/* TokenBigramSplitSymbolAlpha */
static grn_obj *
bigramsa_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{ return ngram_init(ctx, nargs, args, user_data, 2, 0, 1, 0, 0); }

/* TokenBigramSplitSymbolAlphaDigit */
static grn_obj *
bigramsad_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{ return ngram_init(ctx, nargs, args, user_data, 2, 0, 0, 0, 0); }

/* TokenBigramIgnoreBlank */
static grn_obj *
bigrami_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{ return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 1, 1); }

/* TokenBigramIgnoreBlankSplitSymbol */
static grn_obj *
bigramis_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{ return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 0, 1); }

/* TokenBigramIgnoreBlankSplitSymbolAlpha */
static grn_obj *
bigramisa_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{ return ngram_init(ctx, nargs, args, user_data, 2, 0, 1, 0, 1); }

/* TokenBigramIgnoreBlankSplitSymbolAlphaDigit */
static grn_obj *
bigramisad_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{ return ngram_init(ctx, nargs, args, user_data, 2, 0, 0, 0, 1); }
351 | |
/*
 * Emits the next ngram token.
 *
 * `pos` (in characters) is the previous token's position advanced by
 * `skip`: one character when the previous token overlapped, otherwise
 * the previous token's whole length.  A run of characters whose type
 * matches an enabled uni_* flag is emitted as one non-overlapping
 * token; everything else is emitted as up to ngram_unit characters,
 * with GRN_TOKEN_OVERLAP/GRN_TOKEN_UNMATURED set as appropriate.
 */
static grn_obj *
ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  size_t cl;
  grn_ngram_tokenizer *tokenizer = user_data->ptr;
  const unsigned char *p = tokenizer->next, *r = p, *e = tokenizer->end;
  int32_t len = 0, pos = tokenizer->pos + tokenizer->skip;
  grn_token_status status = 0;
  /* cp tracks the character-type entry for the character at r. */
  const uint_least8_t *cp = tokenizer->ctypes ? tokenizer->ctypes + pos : NULL;
  if (cp && tokenizer->uni_alpha && GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) {
    /* Whole run of alphabetic characters becomes a single token. */
    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
      len++;
      r += cl;
      if (/* !tokenizer->ignore_blank && */ GRN_STR_ISBLANK(*cp)) { break; }
      if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_ALPHA) { break; }
    }
    tokenizer->next = r;
    tokenizer->overlap = 0;
  } else if (cp &&
             tokenizer->uni_digit &&
             GRN_STR_CTYPE(*cp) == GRN_CHAR_DIGIT) {
    /* Whole run of digit characters becomes a single token. */
    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
      len++;
      r += cl;
      if (/* !tokenizer->ignore_blank && */ GRN_STR_ISBLANK(*cp)) { break; }
      if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_DIGIT) { break; }
    }
    tokenizer->next = r;
    tokenizer->overlap = 0;
  } else if (cp &&
             tokenizer->uni_symbol &&
             GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL) {
    /* Whole run of symbol characters becomes a single token; blanks
       break the run unless ignore_blank is set. */
    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
      len++;
      r += cl;
      if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*cp)) { break; }
      if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_SYMBOL) { break; }
    }
    tokenizer->next = r;
    tokenizer->overlap = 0;
  } else {
#ifdef PRE_DEFINED_UNSPLIT_WORDS
    /* Dead code kept for reference: lookup of pre-registered unsplit
       words via a symbol table (never compiled in). */
    const unsigned char *key = NULL;
    // todo : grn_pat_lcp_search
    if ((tid = grn_sym_common_prefix_search(sym, p))) {
      if (!(key = _grn_sym_key(sym, tid))) {
        tokenizer->status = GRN_TOKEN_CURSOR_NOT_FOUND;
        return NULL;
      }
      len = grn_str_len(key, tokenizer->query->encoding, NULL);
    }
    r = p + grn_charlen_(ctx, p, e, tokenizer->query->encoding);
    if (tid && (len > 1 || r == p)) {
      if (r != p && pos + len - 1 <= tokenizer->tail) { continue; }
      p += strlen(key);
      if (!*p && tokenizer->mode == GRN_TOKEN_GET) {
        tokenizer->status = GRN_TOKEN_CURSOR_DONE;
      }
    }
#endif /* PRE_DEFINED_UNSPLIT_WORDS */
    /* Collect up to ngram_unit characters, stopping early at a blank
       (unless ignore_blank) or when the next character belongs to an
       enabled uni_* class. */
    if ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
                           tokenizer->query->encoding))) {
      len++;
      r += cl;
      /* The cursor advances only one character: ngrams overlap. */
      tokenizer->next = r;
      while (len < tokenizer->ngram_unit &&
             (cl = grn_charlen_(ctx, (char *)r, (char *)e,
                                tokenizer->query->encoding))) {
        if (cp) {
          if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*cp)) { break; }
          cp++;
          if ((tokenizer->uni_alpha && GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) ||
              (tokenizer->uni_digit && GRN_STR_CTYPE(*cp) == GRN_CHAR_DIGIT) ||
              (tokenizer->uni_symbol && GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL)) {
            break;
          }
        }
        len++;
        r += cl;
      }
      if (tokenizer->overlap) {
        status |= GRN_TOKEN_OVERLAP;
      }
      if (len < tokenizer->ngram_unit) {
        status |= GRN_TOKEN_UNMATURED;
      }
      tokenizer->overlap = (len > 1) ? 1 : 0;
    }
  }
  tokenizer->pos = pos;
  tokenizer->len = len;
  tokenizer->tail = pos + len - 1;
  /* No progress (invalid byte at p) or cursor at end: final token. */
  if (p == r || tokenizer->next == e) {
    tokenizer->skip = 0;
    status |= GRN_TOKEN_LAST;
  } else {
    tokenizer->skip = tokenizer->overlap ? 1 : len;
  }
  if (r == e) { status |= GRN_TOKEN_REACH_END; }
  grn_tokenizer_token_push(ctx,
                           &(tokenizer->token),
                           (const char *)p,
                           r - p,
                           status);
  return NULL;
}
461 | |
462 | static grn_obj * |
463 | ngram_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) |
464 | { |
465 | grn_ngram_tokenizer *tokenizer = user_data->ptr; |
466 | if (!tokenizer) { |
467 | return NULL; |
468 | } |
469 | grn_tokenizer_token_fin(ctx, &(tokenizer->token)); |
470 | grn_tokenizer_query_close(ctx, tokenizer->query); |
471 | GRN_FREE(tokenizer); |
472 | return NULL; |
473 | } |
474 | |
475 | /* regexp tokenizer */ |
476 | |
/* State for the TokenRegexp tokenizer. */
typedef struct {
  grn_tokenizer_token token;     /* reusable token pushed by regexp_next() */
  grn_tokenizer_query *query;    /* open query; closed in regexp_fin() */
  struct {
    int32_t n_skip_tokens;       /* GET mode: overlapping tokens left to skip */
  } get;
  grn_bool is_begin;             /* no token emitted yet */
  grn_bool is_end;               /* input exhausted; emit end mark / stop */
  grn_bool is_start_token;       /* next token starts a new segment */
  grn_bool is_overlapping;       /* last token overlaps the next one */
  const char *next;              /* cursor into the normalized text */
  const char *end;               /* one past the end of the normalized text */
  unsigned int nth_char;         /* character index of `next` (for char_types) */
  const uint_least8_t *char_types; /* per-character type array (may be NULL) */
  grn_obj buffer;                /* accumulates the bytes of the current token */
} grn_regexp_tokenizer;
493 | |
494 | static grn_obj * |
495 | regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) |
496 | { |
497 | unsigned int normalize_flags = GRN_STRING_WITH_TYPES; |
498 | grn_tokenizer_query *query; |
499 | const char *normalized; |
500 | unsigned int normalized_length_in_bytes; |
501 | grn_regexp_tokenizer *tokenizer; |
502 | |
503 | query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags); |
504 | if (!query) { |
505 | return NULL; |
506 | } |
507 | |
508 | tokenizer = GRN_MALLOC(sizeof(grn_regexp_tokenizer)); |
509 | if (!tokenizer) { |
510 | grn_tokenizer_query_close(ctx, query); |
511 | ERR(GRN_NO_MEMORY_AVAILABLE, |
512 | "[tokenizer][regexp] failed to allocate memory" ); |
513 | return NULL; |
514 | } |
515 | user_data->ptr = tokenizer; |
516 | |
517 | grn_tokenizer_token_init(ctx, &(tokenizer->token)); |
518 | tokenizer->query = query; |
519 | |
520 | tokenizer->get.n_skip_tokens = 0; |
521 | |
522 | tokenizer->is_begin = GRN_TRUE; |
523 | tokenizer->is_end = GRN_FALSE; |
524 | tokenizer->is_start_token = GRN_TRUE; |
525 | tokenizer->is_overlapping = GRN_FALSE; |
526 | |
527 | grn_string_get_normalized(ctx, tokenizer->query->normalized_query, |
528 | &normalized, &normalized_length_in_bytes, |
529 | NULL); |
530 | tokenizer->next = normalized; |
531 | tokenizer->end = tokenizer->next + normalized_length_in_bytes; |
532 | tokenizer->nth_char = 0; |
533 | tokenizer->char_types = |
534 | grn_string_get_types(ctx, tokenizer->query->normalized_query); |
535 | |
536 | GRN_TEXT_INIT(&(tokenizer->buffer), 0); |
537 | |
538 | return NULL; |
539 | } |
540 | |
/*
 * Emits the next token for the TokenRegexp tokenizer.
 *
 * Indexing mode (mode != GRN_TOKEN_GET) emits an explicit begin mark
 * first and an end mark last; GET mode instead recognizes those marks
 * inside the query text.  Tokens are built character by character up
 * to ngram_unit (2) characters, breaking early at blanks or at an
 * end mark.  In GET mode, overlapping intermediate tokens are marked
 * GRN_TOKEN_SKIP via get.n_skip_tokens.
 */
static grn_obj *
regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  int char_len;
  grn_token_status status = 0;
  grn_regexp_tokenizer *tokenizer = user_data->ptr;
  unsigned int n_characters = 0;
  int ngram_unit = 2;
  grn_obj *buffer = &(tokenizer->buffer);
  const char *current = tokenizer->next;
  const char *end = tokenizer->end;
  const uint_least8_t *char_types = tokenizer->char_types;
  grn_tokenize_mode mode = tokenizer->query->tokenize_mode;
  grn_bool is_begin = tokenizer->is_begin;
  grn_bool is_start_token = tokenizer->is_start_token;
  grn_bool break_by_blank = GRN_FALSE;
  grn_bool break_by_end_mark = GRN_FALSE;

  GRN_BULK_REWIND(buffer);
  tokenizer->is_begin = GRN_FALSE;
  tokenizer->is_start_token = GRN_FALSE;

  if (char_types) {
    /* Advance to the type entry for the character at `current`. */
    char_types += tokenizer->nth_char;
  }

  if (mode != GRN_TOKEN_GET) {
    /* Indexing: the very first token is the synthetic begin mark. */
    if (is_begin) {
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
                               status);
      return NULL;
    }

    /* Indexing: after the input is exhausted, emit the end mark. */
    if (tokenizer->is_end) {
      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_END_MARK_UTF8,
                               GRN_TOKENIZER_END_MARK_UTF8_LEN,
                               status);
      return NULL;
    }
    if (is_start_token) {
      /* A blank just before a segment start yields an empty SKIP token. */
      if (char_types && GRN_STR_ISBLANK(char_types[-1])) {
        status |= GRN_TOKEN_SKIP;
        grn_tokenizer_token_push(ctx, &(tokenizer->token), "" , 0, status);
        return NULL;
      }
    }
  }

  char_len = grn_charlen_(ctx, current, end, tokenizer->query->encoding);
  if (char_len == 0) {
    /* Nothing (valid) left to read: emit an empty terminating token. */
    status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
    grn_tokenizer_token_push(ctx, &(tokenizer->token), "" , 0, status);
    return NULL;
  }

  if (mode == GRN_TOKEN_GET) {
    /* GET: recognize a literal begin mark at the head of the query. */
    if (is_begin &&
        char_len == GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN &&
        memcmp(current, GRN_TOKENIZER_BEGIN_MARK_UTF8, char_len) == 0) {
      tokenizer->is_start_token = GRN_TRUE;
      n_characters++;
      GRN_TEXT_PUT(ctx, buffer, current, char_len);
      current += char_len;
      tokenizer->next = current;
      tokenizer->nth_char++;
      if (current == end) {
        status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      }
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
                               status);
      return NULL;
    }

    /* GET: recognize a literal end mark as the final character. */
    if (current + char_len == end &&
        char_len == GRN_TOKENIZER_END_MARK_UTF8_LEN &&
        memcmp(current, GRN_TOKENIZER_END_MARK_UTF8, char_len) == 0) {
      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_END_MARK_UTF8,
                               GRN_TOKENIZER_END_MARK_UTF8_LEN,
                               status);
      return NULL;
    }
  }

  /* Accumulate characters into `buffer` until the token is full
     (ngram_unit), input ends, a blank is hit, or an end mark follows. */
  while (GRN_TRUE) {
    n_characters++;
    GRN_TEXT_PUT(ctx, buffer, current, char_len);
    current += char_len;
    if (n_characters == 1) {
      /* The cursor advances only one character: tokens overlap. */
      tokenizer->next = current;
      tokenizer->nth_char++;
    }

    if (char_types) {
      uint_least8_t char_type;
      char_type = char_types[0];
      char_types++;
      if (GRN_STR_ISBLANK(char_type)) {
        break_by_blank = GRN_TRUE;
      }
    }

    char_len = grn_charlen_(ctx, (const char *)current, (const char *)end,
                            tokenizer->query->encoding);
    if (char_len == 0) {
      break;
    }

    if (mode == GRN_TOKEN_GET &&
        current + char_len == end &&
        char_len == GRN_TOKENIZER_END_MARK_UTF8_LEN &&
        memcmp(current, GRN_TOKENIZER_END_MARK_UTF8, char_len) == 0) {
      break_by_end_mark = GRN_TRUE;
    }

    if (break_by_blank || break_by_end_mark) {
      break;
    }

    if (n_characters == ngram_unit) {
      break;
    }
  }

  if (tokenizer->is_overlapping) {
    status |= GRN_TOKEN_OVERLAP;
  }
  if (n_characters < ngram_unit) {
    status |= GRN_TOKEN_UNMATURED;
  }
  tokenizer->is_overlapping = (n_characters > 1);

  if (mode == GRN_TOKEN_GET) {
    if (current == end) {
      tokenizer->is_end = GRN_TRUE;
      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      if (status & GRN_TOKEN_UNMATURED) {
        status |= GRN_TOKEN_FORCE_PREFIX;
      }
    } else {
      if (break_by_blank) {
        /* A new segment starts: stop skipping and restart counting. */
        tokenizer->get.n_skip_tokens = 0;
        tokenizer->is_start_token = GRN_TRUE;
      } else if (break_by_end_mark) {
        if (!is_start_token && (status & GRN_TOKEN_UNMATURED)) {
          status |= GRN_TOKEN_SKIP;
        }
      } else if (tokenizer->get.n_skip_tokens > 0) {
        /* Skip overlapping tokens between full-width matches. */
        tokenizer->get.n_skip_tokens--;
        status |= GRN_TOKEN_SKIP;
      } else {
        tokenizer->get.n_skip_tokens = ngram_unit - 1;
      }
    }
  } else {
    if (tokenizer->next == end) {
      tokenizer->is_end = GRN_TRUE;
    }
    if (break_by_blank) {
      tokenizer->is_start_token = GRN_TRUE;
    }
  }

  grn_tokenizer_token_push(ctx,
                           &(tokenizer->token),
                           GRN_TEXT_VALUE(buffer),
                           GRN_TEXT_LEN(buffer),
                           status);

  return NULL;
}
723 | |
724 | static grn_obj * |
725 | regexp_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) |
726 | { |
727 | grn_regexp_tokenizer *tokenizer = user_data->ptr; |
728 | if (!tokenizer) { |
729 | return NULL; |
730 | } |
731 | grn_tokenizer_token_fin(ctx, &(tokenizer->token)); |
732 | grn_tokenizer_query_close(ctx, tokenizer->query); |
733 | GRN_OBJ_FIN(ctx, &(tokenizer->buffer)); |
734 | GRN_FREE(tokenizer); |
735 | return NULL; |
736 | } |
737 | |
738 | /* external */ |
739 | |
740 | grn_rc |
741 | grn_tokenizers_init(void) |
742 | { |
743 | static grn_proc _grn_tokenizer_uvector; |
744 | _grn_tokenizer_uvector.obj.db = NULL; |
745 | _grn_tokenizer_uvector.obj.id = GRN_ID_NIL; |
746 | _grn_tokenizer_uvector.obj.header.domain = GRN_ID_NIL; |
747 | _grn_tokenizer_uvector.obj.range = GRN_ID_NIL; |
748 | _grn_tokenizer_uvector.funcs[PROC_INIT] = uvector_init; |
749 | _grn_tokenizer_uvector.funcs[PROC_NEXT] = uvector_next; |
750 | _grn_tokenizer_uvector.funcs[PROC_FIN] = uvector_fin; |
751 | grn_tokenizer_uvector = (grn_obj *)&_grn_tokenizer_uvector; |
752 | return GRN_SUCCESS; |
753 | } |
754 | |
grn_rc
grn_tokenizers_fin(void)
{
  /* Nothing to release: grn_tokenizer_uvector points at static storage. */
  return GRN_SUCCESS;
}
760 | |
/*
 * Registers the MeCab tokenizer for encodings MeCab can handle
 * (EUC-JP, UTF-8, Shift_JIS); other encodings are rejected with
 * GRN_OPERATION_NOT_SUPPORTED.  In embedded builds the statically
 * linked plugin is initialized and registered directly; otherwise the
 * tokenizers/mecab plugin is located on disk and registered, returning
 * GRN_NO_SUCH_FILE_OR_DIRECTORY when it is not installed.
 */
grn_rc
grn_db_init_mecab_tokenizer(grn_ctx *ctx)
{
  switch (GRN_CTX_GET_ENCODING(ctx)) {
  case GRN_ENC_EUC_JP :
  case GRN_ENC_UTF8 :
  case GRN_ENC_SJIS :
#if defined(GRN_EMBEDDED) && defined(GRN_WITH_MECAB)
    {
      GRN_PLUGIN_DECLARE_FUNCTIONS(tokenizers_mecab);
      grn_rc rc;
      rc = GRN_PLUGIN_IMPL_NAME_TAGGED(init, tokenizers_mecab)(ctx);
      if (rc == GRN_SUCCESS) {
        rc = GRN_PLUGIN_IMPL_NAME_TAGGED(register, tokenizers_mecab)(ctx);
        if (rc != GRN_SUCCESS) {
          /* Registration failed: undo the plugin initialization. */
          GRN_PLUGIN_IMPL_NAME_TAGGED(fin, tokenizers_mecab)(ctx);
        }
      }
      return rc;
    }
#else /* defined(GRN_EMBEDDED) && defined(GRN_WITH_MECAB) */
    {
      const char *mecab_plugin_name = "tokenizers/mecab" ;
      char *path;
      path = grn_plugin_find_path(ctx, mecab_plugin_name);
      if (path) {
        GRN_FREE(path);
        return grn_plugin_register(ctx, mecab_plugin_name);
      } else {
        return GRN_NO_SUCH_FILE_OR_DIRECTORY;
      }
    }
#endif /* defined(GRN_EMBEDDED) && defined(GRN_WITH_MECAB) */
    break;
  default :
    return GRN_OPERATION_NOT_SUPPORTED;
  }
}
799 | |
/* Registers a built-in tokenizer proc with the three variables the
   caller defines (see `vars` in grn_db_init_builtin_tokenizers()). */
#define DEF_TOKENIZER(name, init, next, fin, vars)\
(grn_proc_create(ctx, (name), (sizeof(name) - 1),\
                 GRN_PROC_TOKENIZER, (init), (next), (fin), 3, (vars)))
803 | |
/*
 * Registers all built-in tokenizers.
 *
 * The first four registrations must receive the fixed object IDs
 * GRN_DB_DELIMIT/UNIGRAM/BIGRAM/TRIGRAM, so their order matters and a
 * mismatch means the database file is corrupt.  The remaining
 * tokenizers have no reserved IDs and their results are not checked.
 */
grn_rc
grn_db_init_builtin_tokenizers(grn_ctx *ctx)
{
  grn_obj *obj;
  /* Three tokenizer variables: mode/string as text, flags as uint32. */
  grn_expr_var vars[] = {
    {NULL, 0},
    {NULL, 0},
    {NULL, 0}
  };
  GRN_TEXT_INIT(&vars[0].value, 0);
  GRN_TEXT_INIT(&vars[1].value, 0);
  GRN_UINT32_INIT(&vars[2].value, 0);

  {
    /* Honor the environment switch that keeps blanks in ngram input. */
    char grn_ngram_tokenizer_remove_blank_disable_env[GRN_ENV_BUFFER_SIZE];

    grn_getenv("GRN_NGRAM_TOKENIZER_REMOVE_BLANK_DISABLE" ,
               grn_ngram_tokenizer_remove_blank_disable_env,
               GRN_ENV_BUFFER_SIZE);
    if (grn_ngram_tokenizer_remove_blank_disable_env[0]) {
      grn_ngram_tokenizer_remove_blank_disable = GRN_TRUE;
    }
  }

  obj = DEF_TOKENIZER("TokenDelimit" ,
                      delimit_init, delimited_next, delimited_fin, vars);
  if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_DELIMIT) { return GRN_FILE_CORRUPT; }
  obj = DEF_TOKENIZER("TokenUnigram" ,
                      unigram_init, ngram_next, ngram_fin, vars);
  if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_UNIGRAM) { return GRN_FILE_CORRUPT; }
  obj = DEF_TOKENIZER("TokenBigram" ,
                      bigram_init, ngram_next, ngram_fin, vars);
  if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_BIGRAM) { return GRN_FILE_CORRUPT; }
  obj = DEF_TOKENIZER("TokenTrigram" ,
                      trigram_init, ngram_next, ngram_fin, vars);
  if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_TRIGRAM) { return GRN_FILE_CORRUPT; }

  DEF_TOKENIZER("TokenBigramSplitSymbol" ,
                bigrams_init, ngram_next, ngram_fin, vars);
  DEF_TOKENIZER("TokenBigramSplitSymbolAlpha" ,
                bigramsa_init, ngram_next, ngram_fin, vars);
  DEF_TOKENIZER("TokenBigramSplitSymbolAlphaDigit" ,
                bigramsad_init, ngram_next, ngram_fin, vars);
  DEF_TOKENIZER("TokenBigramIgnoreBlank" ,
                bigrami_init, ngram_next, ngram_fin, vars);
  DEF_TOKENIZER("TokenBigramIgnoreBlankSplitSymbol" ,
                bigramis_init, ngram_next, ngram_fin, vars);
  DEF_TOKENIZER("TokenBigramIgnoreBlankSplitSymbolAlpha" ,
                bigramisa_init, ngram_next, ngram_fin, vars);
  DEF_TOKENIZER("TokenBigramIgnoreBlankSplitSymbolAlphaDigit" ,
                bigramisad_init, ngram_next, ngram_fin, vars);
  DEF_TOKENIZER("TokenDelimitNull" ,
                delimit_null_init, delimited_next, delimited_fin, vars);
  DEF_TOKENIZER("TokenRegexp" ,
                regexp_init, regexp_next, regexp_fin, vars);
  return GRN_SUCCESS;
}
861 | |