1/* -*- c-basic-offset: 2 -*- */
2/*
3 Copyright(C) 2009-2017 Brazil
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License version 2.1 as published by the Free Software Foundation.
8
9 This library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
13
14 You should have received a copy of the GNU Lesser General Public
15 License along with this library; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17*/
18#include <string.h>
19#include "grn_token_cursor.h"
20#include "grn_string.h"
21#include "grn_plugin.h"
22#include <groonga/tokenizer.h>
23
/* Built-in tokenizer proc for uvector (vector of fixed-size IDs) values.
   Assigned in grn_tokenizers_init(); shared, statically allocated object. */
grn_obj *grn_tokenizer_uvector = NULL;

/* Per-cursor state of the uvector tokenizer: walks a packed byte array of
   grn_id values and emits one fixed-size element per token. */
typedef struct {
  grn_tokenizer_token token; /* reusable token pushed on each next() call */
  byte *curr;                /* cursor into the raw uvector bytes */
  byte *tail;                /* one past the last input byte */
  uint32_t unit;             /* bytes per element; set to sizeof(grn_id) */
} grn_uvector_tokenizer;
32
/*
 * PROC_INIT handler for the uvector tokenizer.
 *
 * NOTE: the three grn_ctx_pop() calls consume the tokenizer arguments from
 * the context stack in a fixed order (flags, string, mode); do not reorder
 * them. Only `str` is used afterwards; `flags` and `mode` are popped to
 * keep the stack balanced.
 *
 * On success, stores a freshly allocated grn_uvector_tokenizer in
 * user_data->ptr (released by uvector_fin()). Always returns NULL; errors
 * are reported through ERR() on the context.
 */
static grn_obj *
uvector_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_obj *str, *flags, *mode;
  grn_uvector_tokenizer *tokenizer;
  if (!(flags = grn_ctx_pop(ctx))) {
    ERR(GRN_INVALID_ARGUMENT, "[tokenizer][uvector] missing argument: flags");
    return NULL;
  }
  if (!(str = grn_ctx_pop(ctx))) {
    ERR(GRN_INVALID_ARGUMENT, "[tokenizer][uvector] missing argument: string");
    return NULL;
  }
  if (!(mode = grn_ctx_pop(ctx))) {
    ERR(GRN_INVALID_ARGUMENT, "[tokenizer][uvector] missing argument: mode");
    return NULL;
  }
  if (!(tokenizer = GRN_MALLOC(sizeof(grn_uvector_tokenizer)))) {
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][uvector] "
        "memory allocation to grn_uvector_tokenizer failed");
    return NULL;
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));
  /* Tokenize the raw value bytes in place; no copy is made, so the input
     string must outlive the tokenizer (the token cursor guarantees this). */
  tokenizer->curr = (byte *)GRN_TEXT_VALUE(str);
  tokenizer->tail = tokenizer->curr + GRN_TEXT_LEN(str);
  tokenizer->unit = sizeof(grn_id);
  return NULL;
}
64
65static grn_obj *
66uvector_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
67{
68 grn_uvector_tokenizer *tokenizer = user_data->ptr;
69 byte *p = tokenizer->curr + tokenizer->unit;
70 if (tokenizer->tail < p) {
71 grn_tokenizer_token_push(ctx, &(tokenizer->token),
72 (const char *)tokenizer->curr, 0,
73 GRN_TOKEN_LAST);
74 } else {
75 grn_token_status status;
76 if (tokenizer->tail == p) {
77 status = GRN_TOKEN_LAST;
78 } else {
79 status = GRN_TOKEN_CONTINUE;
80 }
81 grn_tokenizer_token_push(ctx, &(tokenizer->token),
82 (const char *)tokenizer->curr, tokenizer->unit,
83 status);
84 tokenizer->curr = p;
85 }
86 return NULL;
87}
88
89static grn_obj *
90uvector_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
91{
92 grn_uvector_tokenizer *tokenizer = user_data->ptr;
93 if (!tokenizer) {
94 return NULL;
95 }
96 grn_tokenizer_token_fin(ctx, &(tokenizer->token));
97 GRN_FREE(tokenizer);
98 return NULL;
99}
100
/* State for the delimiter-based tokenizers (TokenDelimit/TokenDelimitNull):
   splits the normalized query on a fixed byte sequence. */
typedef struct {
  const uint8_t *delimiter;         /* delimiter byte sequence (not owned) */
  uint32_t delimiter_len;           /* length of delimiter in bytes */
  const unsigned char *next;        /* cursor into the normalized string */
  const unsigned char *end;         /* one past the normalized string */
  grn_tokenizer_token token;        /* reusable token buffer */
  grn_tokenizer_query *query;       /* owned; closed in delimited_fin() */
  grn_bool have_tokenized_delimiter; /* input already split by U+FFFE marks */
} grn_delimited_tokenizer;
110
/*
 * Common PROC_INIT logic for delimiter tokenizers.
 *
 * `delimiter`/`delimiter_len` describe the split byte sequence; the pointer
 * is stored as-is, so callers must pass storage with static lifetime.
 * Opens the tokenizer query (no normalizer flags), allocates the state in
 * user_data->ptr, and positions the cursor at the start of the normalized
 * query. Errors are reported via ERR(); the query is closed on failure.
 */
static grn_obj *
delimited_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data,
               const uint8_t *delimiter, uint32_t delimiter_len)
{
  grn_tokenizer_query *query;
  unsigned int normalize_flags = 0;
  const char *normalized;
  unsigned int normalized_length_in_bytes;
  grn_delimited_tokenizer *tokenizer;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }

  if (!(tokenizer = GRN_MALLOC(sizeof(grn_delimited_tokenizer)))) {
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][delimit] "
        "memory allocation to grn_delimited_tokenizer failed");
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }
  user_data->ptr = tokenizer;

  tokenizer->query = query;

  /* When the raw input carries explicit tokenized-delimiter marks
     (U+FFFE), delimited_next() switches to mark-based splitting. */
  tokenizer->have_tokenized_delimiter =
    grn_tokenizer_have_tokenized_delimiter(ctx,
                                           tokenizer->query->ptr,
                                           tokenizer->query->length,
                                           tokenizer->query->encoding);
  tokenizer->delimiter = delimiter;
  tokenizer->delimiter_len = delimiter_len;
  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
                            &normalized, &normalized_length_in_bytes,
                            NULL);
  tokenizer->next = (const unsigned char *)normalized;
  tokenizer->end = tokenizer->next + normalized_length_in_bytes;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}
154
/*
 * Common PROC_NEXT logic for delimiter tokenizers.
 *
 * Two modes:
 *  - tokenized-delimiter mode: delegate to
 *    grn_tokenizer_tokenized_delimiter_next(), which pushes the token and
 *    returns the new cursor position;
 *  - byte-delimiter mode: scan character by character until the delimiter
 *    sequence is found (consuming consecutive delimiters as one gap), then
 *    push the span [p, r). Reaching the end of input yields GRN_TOKEN_LAST.
 */
static grn_obj *
delimited_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_delimited_tokenizer *tokenizer = user_data->ptr;

  if (tokenizer->have_tokenized_delimiter) {
    unsigned int rest_length;
    rest_length = tokenizer->end - tokenizer->next;
    tokenizer->next =
      (unsigned char *)grn_tokenizer_tokenized_delimiter_next(
        ctx,
        &(tokenizer->token),
        (const char *)tokenizer->next,
        rest_length,
        tokenizer->query->encoding);
  } else {
    size_t cl;
    const unsigned char *p = tokenizer->next, *r;
    const unsigned char *e = tokenizer->end;
    grn_token_status status;
    for (r = p; r < e; r += cl) {
      /* Invalid byte sequence: stop scanning and treat the rest as done. */
      if (!(cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
        tokenizer->next = (unsigned char *)e;
        break;
      }
      {
        grn_bool found_delimiter = GRN_FALSE;
        const unsigned char *current_end = r;
        /* Consume a run of consecutive delimiters so empty tokens are
           never produced between adjacent delimiters. */
        while (current_end + tokenizer->delimiter_len <= e &&
               !memcmp(current_end,
                       tokenizer->delimiter, tokenizer->delimiter_len)) {
          current_end += tokenizer->delimiter_len;
          tokenizer->next = current_end;
          found_delimiter = GRN_TRUE;
        }
        if (found_delimiter) {
          break;
        }
      }
    }
    if (r == e) {
      status = GRN_TOKEN_LAST;
    } else {
      status = GRN_TOKEN_CONTINUE;
    }
    grn_tokenizer_token_push(ctx,
                             &(tokenizer->token),
                             (const char *)p,
                             r - p,
                             status);
  }

  return NULL;
}
210
211static grn_obj *
212delimited_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
213{
214 grn_delimited_tokenizer *tokenizer = user_data->ptr;
215 if (!tokenizer) {
216 return NULL;
217 }
218 grn_tokenizer_query_close(ctx, tokenizer->query);
219 grn_tokenizer_token_fin(ctx, &(tokenizer->token));
220 GRN_FREE(tokenizer);
221 return NULL;
222}
223
224static grn_obj *
225delimit_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
226{
227 static const uint8_t delimiter[1] = {' '};
228 return delimited_init(ctx, nargs, args, user_data, delimiter, 1);
229}
230
231static grn_obj *
232delimit_null_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
233{
234 static const uint8_t delimiter[1] = {'\0'};
235 return delimited_init(ctx, nargs, args, user_data, delimiter, 1);
236}
237
/* ngram tokenizer */

/* Set via GRN_NGRAM_TOKENIZER_REMOVE_BLANK_DISABLE environment variable in
   grn_db_init_builtin_tokenizers(); when true, blanks are kept in the
   normalized text instead of being removed. */
static grn_bool grn_ngram_tokenizer_remove_blank_disable = GRN_FALSE;

/* State for the N-gram family of tokenizers (TokenUnigram/Bigram/Trigram
   and their SplitSymbol/IgnoreBlank variants). */
typedef struct {
  grn_tokenizer_token token;  /* reusable token buffer */
  grn_tokenizer_query *query; /* owned; closed in ngram_fin() */
  uint8_t uni_alpha;          /* group whole alphabetic runs into one token */
  uint8_t uni_digit;          /* group whole digit runs into one token */
  uint8_t uni_symbol;         /* group whole symbol runs into one token */
  uint8_t ngram_unit;         /* N: characters per n-gram */
  uint8_t ignore_blank;       /* don't break grouped runs at blanks */
  uint8_t overlap;            /* previous token overlaps the next one */
  int32_t pos;                /* character position of the current token */
  uint32_t skip;              /* characters to advance before next token */
  const unsigned char *next;  /* byte cursor into the normalized string */
  const unsigned char *end;   /* one past the normalized string */
  const uint_least8_t *ctypes; /* per-character type array (may be NULL) */
  uint32_t len;               /* character length of the last token */
  uint32_t tail;              /* character position of the last token's end */
} grn_ngram_tokenizer;
259
/*
 * Common PROC_INIT logic for the N-gram tokenizer family.
 *
 * The uni_* / ignore_blank parameters select the variant behavior (see the
 * thin wrappers below). Normalization removes blanks (unless disabled via
 * environment), records character types, and strips tokenized-delimiter
 * marks. Allocates the state in user_data->ptr; errors via ERR().
 */
static grn_obj *
ngram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data, uint8_t ngram_unit,
           uint8_t uni_alpha, uint8_t uni_digit, uint8_t uni_symbol, uint8_t ignore_blank)
{
  unsigned int normalize_flags =
    GRN_STRING_REMOVE_BLANK |
    GRN_STRING_WITH_TYPES |
    GRN_STRING_REMOVE_TOKENIZED_DELIMITER;
  grn_tokenizer_query *query;
  const char *normalized;
  unsigned int normalized_length_in_bytes;
  grn_ngram_tokenizer *tokenizer;

  if (grn_ngram_tokenizer_remove_blank_disable) {
    normalize_flags &= ~GRN_STRING_REMOVE_BLANK;
  }
  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }

  if (!(tokenizer = GRN_MALLOC(sizeof(grn_ngram_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][ngram] "
        "memory allocation to grn_ngram_tokenizer failed");
    return NULL;
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));
  tokenizer->query = query;

  tokenizer->uni_alpha = uni_alpha;
  tokenizer->uni_digit = uni_digit;
  tokenizer->uni_symbol = uni_symbol;
  tokenizer->ngram_unit = ngram_unit;
  tokenizer->ignore_blank = ignore_blank;
  tokenizer->overlap = 0;
  tokenizer->pos = 0;
  tokenizer->skip = 0;

  /* tokenizer->len is initialized here with the character count of the
     whole normalized string; ngram_next() overwrites it per token. */
  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
                            &normalized, &normalized_length_in_bytes,
                            &(tokenizer->len));
  tokenizer->next = (const unsigned char *)normalized;
  tokenizer->end = tokenizer->next + normalized_length_in_bytes;
  tokenizer->ctypes =
    grn_string_get_types(ctx, tokenizer->query->normalized_query);
  return NULL;
}
311
312static grn_obj *
313unigram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
314{ return ngram_init(ctx, nargs, args, user_data, 1, 1, 1, 1, 0); }
315
316static grn_obj *
317bigram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
318{ return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 1, 0); }
319
320static grn_obj *
321trigram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
322{ return ngram_init(ctx, nargs, args, user_data, 3, 1, 1, 1, 0); }
323
324static grn_obj *
325bigrams_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
326{ return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 0, 0); }
327
328static grn_obj *
329bigramsa_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
330{ return ngram_init(ctx, nargs, args, user_data, 2, 0, 1, 0, 0); }
331
332static grn_obj *
333bigramsad_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
334{ return ngram_init(ctx, nargs, args, user_data, 2, 0, 0, 0, 0); }
335
336static grn_obj *
337bigrami_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
338{ return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 1, 1); }
339
340static grn_obj *
341bigramis_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
342{ return ngram_init(ctx, nargs, args, user_data, 2, 1, 1, 0, 1); }
343
344static grn_obj *
345bigramisa_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
346{ return ngram_init(ctx, nargs, args, user_data, 2, 0, 1, 0, 1); }
347
348static grn_obj *
349bigramisad_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
350{ return ngram_init(ctx, nargs, args, user_data, 2, 0, 0, 0, 1); }
351
/*
 * Common PROC_NEXT logic for the N-gram tokenizer family.
 *
 * When the uni_* flag of the current character's type is set, a whole
 * contiguous run of that type (alpha/digit/symbol) is emitted as a single
 * non-overlapping token. Otherwise up to ngram_unit characters are emitted,
 * stopping early at blanks (unless ignore_blank) or at a type that is
 * grouped by a uni_* flag; such n-gram tokens overlap by advancing only one
 * character when the previous token had length > 1.
 *
 * NOTE: the /* !tokenizer->ignore_blank && *​/ comments in the grouped-run
 * loops are deliberate upstream remnants: grouped runs always break at
 * blanks regardless of ignore_blank, except symbol runs which honor it.
 */
static grn_obj *
ngram_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  size_t cl;
  grn_ngram_tokenizer *tokenizer = user_data->ptr;
  const unsigned char *p = tokenizer->next, *r = p, *e = tokenizer->end;
  int32_t len = 0, pos = tokenizer->pos + tokenizer->skip;
  grn_token_status status = 0;
  const uint_least8_t *cp = tokenizer->ctypes ? tokenizer->ctypes + pos : NULL;
  if (cp && tokenizer->uni_alpha && GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) {
    /* Whole alphabetic run becomes one token. */
    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
      len++;
      r += cl;
      if (/* !tokenizer->ignore_blank && */ GRN_STR_ISBLANK(*cp)) { break; }
      if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_ALPHA) { break; }
    }
    tokenizer->next = r;
    tokenizer->overlap = 0;
  } else if (cp &&
             tokenizer->uni_digit &&
             GRN_STR_CTYPE(*cp) == GRN_CHAR_DIGIT) {
    /* Whole digit run becomes one token. */
    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
      len++;
      r += cl;
      if (/* !tokenizer->ignore_blank && */ GRN_STR_ISBLANK(*cp)) { break; }
      if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_DIGIT) { break; }
    }
    tokenizer->next = r;
    tokenizer->overlap = 0;
  } else if (cp &&
             tokenizer->uni_symbol &&
             GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL) {
    /* Whole symbol run becomes one token (ignore_blank is honored here). */
    while ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
                              tokenizer->query->encoding))) {
      len++;
      r += cl;
      if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*cp)) { break; }
      if (GRN_STR_CTYPE(*++cp) != GRN_CHAR_SYMBOL) { break; }
    }
    tokenizer->next = r;
    tokenizer->overlap = 0;
  } else {
    /* Dead code kept from the legacy sym-based tokenizer; never compiled. */
#ifdef PRE_DEFINED_UNSPLIT_WORDS
    const unsigned char *key = NULL;
    // todo : grn_pat_lcp_search
    if ((tid = grn_sym_common_prefix_search(sym, p))) {
      if (!(key = _grn_sym_key(sym, tid))) {
        tokenizer->status = GRN_TOKEN_CURSOR_NOT_FOUND;
        return NULL;
      }
      len = grn_str_len(key, tokenizer->query->encoding, NULL);
    }
    r = p + grn_charlen_(ctx, p, e, tokenizer->query->encoding);
    if (tid && (len > 1 || r == p)) {
      if (r != p && pos + len - 1 <= tokenizer->tail) { continue; }
      p += strlen(key);
      if (!*p && tokenizer->mode == GRN_TOKEN_GET) {
        tokenizer->status = GRN_TOKEN_CURSOR_DONE;
      }
    }
#endif /* PRE_DEFINED_UNSPLIT_WORDS */
    /* Collect up to ngram_unit characters for an overlapping n-gram. */
    if ((cl = grn_charlen_(ctx, (char *)r, (char *)e,
                           tokenizer->query->encoding))) {
      len++;
      r += cl;
      /* Next token starts one character after this one (overlap). */
      tokenizer->next = r;
      while (len < tokenizer->ngram_unit &&
             (cl = grn_charlen_(ctx, (char *)r, (char *)e,
                                tokenizer->query->encoding))) {
        if (cp) {
          if (!tokenizer->ignore_blank && GRN_STR_ISBLANK(*cp)) { break; }
          cp++;
          /* Stop before a character that starts a grouped run. */
          if ((tokenizer->uni_alpha && GRN_STR_CTYPE(*cp) == GRN_CHAR_ALPHA) ||
              (tokenizer->uni_digit && GRN_STR_CTYPE(*cp) == GRN_CHAR_DIGIT) ||
              (tokenizer->uni_symbol && GRN_STR_CTYPE(*cp) == GRN_CHAR_SYMBOL)) {
            break;
          }
        }
        len++;
        r += cl;
      }
      if (tokenizer->overlap) {
        status |= GRN_TOKEN_OVERLAP;
      }
      if (len < tokenizer->ngram_unit) {
        status |= GRN_TOKEN_UNMATURED;
      }
      tokenizer->overlap = (len > 1) ? 1 : 0;
    }
  }
  tokenizer->pos = pos;
  tokenizer->len = len;
  tokenizer->tail = pos + len - 1;
  /* p == r means no character could be read (end or invalid byte). */
  if (p == r || tokenizer->next == e) {
    tokenizer->skip = 0;
    status |= GRN_TOKEN_LAST;
  } else {
    tokenizer->skip = tokenizer->overlap ? 1 : len;
  }
  if (r == e) { status |= GRN_TOKEN_REACH_END; }
  grn_tokenizer_token_push(ctx,
                           &(tokenizer->token),
                           (const char *)p,
                           r - p,
                           status);
  return NULL;
}
461
462static grn_obj *
463ngram_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
464{
465 grn_ngram_tokenizer *tokenizer = user_data->ptr;
466 if (!tokenizer) {
467 return NULL;
468 }
469 grn_tokenizer_token_fin(ctx, &(tokenizer->token));
470 grn_tokenizer_query_close(ctx, tokenizer->query);
471 GRN_FREE(tokenizer);
472 return NULL;
473}
474
/* regexp tokenizer */

/* State for TokenRegexp: a bigram-style tokenizer that also emits begin/end
   mark tokens so indexed data supports regular-expression search. */
typedef struct {
  grn_tokenizer_token token;   /* reusable token buffer */
  grn_tokenizer_query *query;  /* owned; closed in regexp_fin() */
  struct {
    int32_t n_skip_tokens;     /* GET mode: overlapping tokens left to skip */
  } get;
  grn_bool is_begin;           /* next call emits the begin mark (ADD mode) */
  grn_bool is_end;             /* input exhausted; next call emits end mark */
  grn_bool is_start_token;     /* next token starts a new segment */
  grn_bool is_overlapping;     /* previous token overlaps the next one */
  const char *next;            /* byte cursor into the normalized string */
  const char *end;             /* one past the normalized string */
  unsigned int nth_char;       /* character index of `next` (for char_types) */
  const uint_least8_t *char_types; /* per-character type array (may be NULL) */
  grn_obj buffer;              /* scratch text buffer for the current token */
} grn_regexp_tokenizer;
493
/*
 * PROC_INIT handler for TokenRegexp.
 *
 * Opens the query with character-type tracking (blanks are kept so segment
 * boundaries can be detected), allocates the state in user_data->ptr, and
 * initializes the cursor and scratch buffer. Errors via ERR(); the query
 * is closed on allocation failure.
 */
static grn_obj *
regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  unsigned int normalize_flags = GRN_STRING_WITH_TYPES;
  grn_tokenizer_query *query;
  const char *normalized;
  unsigned int normalized_length_in_bytes;
  grn_regexp_tokenizer *tokenizer;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
  if (!query) {
    return NULL;
  }

  tokenizer = GRN_MALLOC(sizeof(grn_regexp_tokenizer));
  if (!tokenizer) {
    grn_tokenizer_query_close(ctx, query);
    ERR(GRN_NO_MEMORY_AVAILABLE,
        "[tokenizer][regexp] failed to allocate memory");
    return NULL;
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));
  tokenizer->query = query;

  tokenizer->get.n_skip_tokens = 0;

  tokenizer->is_begin = GRN_TRUE;
  tokenizer->is_end = GRN_FALSE;
  tokenizer->is_start_token = GRN_TRUE;
  tokenizer->is_overlapping = GRN_FALSE;

  grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
                            &normalized, &normalized_length_in_bytes,
                            NULL);
  tokenizer->next = normalized;
  tokenizer->end = tokenizer->next + normalized_length_in_bytes;
  tokenizer->nth_char = 0;
  tokenizer->char_types =
    grn_string_get_types(ctx, tokenizer->query->normalized_query);

  GRN_TEXT_INIT(&(tokenizer->buffer), 0);

  return NULL;
}
540
/*
 * PROC_NEXT handler for TokenRegexp.
 *
 * ADD/index mode: emits a begin-mark token first and an end-mark token
 * last, with overlapping bigrams in between (segments split at blanks).
 * GET/search mode: recognizes begin/end marks embedded in the query, skips
 * redundant overlapping tokens (n_skip_tokens), and flags unmatured final
 * tokens with GRN_TOKEN_FORCE_PREFIX for prefix matching.
 */
static grn_obj *
regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  int char_len;
  grn_token_status status = 0;
  grn_regexp_tokenizer *tokenizer = user_data->ptr;
  unsigned int n_characters = 0;
  int ngram_unit = 2;
  grn_obj *buffer = &(tokenizer->buffer);
  const char *current = tokenizer->next;
  const char *end = tokenizer->end;
  const uint_least8_t *char_types = tokenizer->char_types;
  grn_tokenize_mode mode = tokenizer->query->tokenize_mode;
  grn_bool is_begin = tokenizer->is_begin;
  grn_bool is_start_token = tokenizer->is_start_token;
  grn_bool break_by_blank = GRN_FALSE;
  grn_bool break_by_end_mark = GRN_FALSE;

  GRN_BULK_REWIND(buffer);
  /* Snapshot then clear the one-shot flags; they may be re-set below. */
  tokenizer->is_begin = GRN_FALSE;
  tokenizer->is_start_token = GRN_FALSE;

  if (char_types) {
    char_types += tokenizer->nth_char;
  }

  if (mode != GRN_TOKEN_GET) {
    /* Index mode: surround the data with begin/end mark tokens. */
    if (is_begin) {
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
                               status);
      return NULL;
    }

    if (tokenizer->is_end) {
      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_END_MARK_UTF8,
                               GRN_TOKENIZER_END_MARK_UTF8_LEN,
                               status);
      return NULL;
    }
    if (is_start_token) {
      /* char_types[-1] is the previous character's type; valid because a
         start token after the beginning implies nth_char >= 1. */
      if (char_types && GRN_STR_ISBLANK(char_types[-1])) {
        status |= GRN_TOKEN_SKIP;
        grn_tokenizer_token_push(ctx, &(tokenizer->token), "", 0, status);
        return NULL;
      }
    }
  }

  char_len = grn_charlen_(ctx, current, end, tokenizer->query->encoding);
  if (char_len == 0) {
    status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
    grn_tokenizer_token_push(ctx, &(tokenizer->token), "", 0, status);
    return NULL;
  }

  if (mode == GRN_TOKEN_GET) {
    /* Search mode: the query itself may carry begin/end marks. */
    if (is_begin &&
        char_len == GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN &&
        memcmp(current, GRN_TOKENIZER_BEGIN_MARK_UTF8, char_len) == 0) {
      tokenizer->is_start_token = GRN_TRUE;
      n_characters++;
      GRN_TEXT_PUT(ctx, buffer, current, char_len);
      current += char_len;
      tokenizer->next = current;
      tokenizer->nth_char++;
      if (current == end) {
        status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      }
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_BEGIN_MARK_UTF8,
                               GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN,
                               status);
      return NULL;
    }

    if (current + char_len == end &&
        char_len == GRN_TOKENIZER_END_MARK_UTF8_LEN &&
        memcmp(current, GRN_TOKENIZER_END_MARK_UTF8, char_len) == 0) {
      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      grn_tokenizer_token_push(ctx,
                               &(tokenizer->token),
                               GRN_TOKENIZER_END_MARK_UTF8,
                               GRN_TOKENIZER_END_MARK_UTF8_LEN,
                               status);
      return NULL;
    }
  }

  /* Accumulate up to ngram_unit characters into the scratch buffer. */
  while (GRN_TRUE) {
    n_characters++;
    GRN_TEXT_PUT(ctx, buffer, current, char_len);
    current += char_len;
    if (n_characters == 1) {
      /* Next token starts one character ahead (overlapping bigram). */
      tokenizer->next = current;
      tokenizer->nth_char++;
    }

    if (char_types) {
      uint_least8_t char_type;
      char_type = char_types[0];
      char_types++;
      if (GRN_STR_ISBLANK(char_type)) {
        break_by_blank = GRN_TRUE;
      }
    }

    char_len = grn_charlen_(ctx, (const char *)current, (const char *)end,
                            tokenizer->query->encoding);
    if (char_len == 0) {
      break;
    }

    if (mode == GRN_TOKEN_GET &&
        current + char_len == end &&
        char_len == GRN_TOKENIZER_END_MARK_UTF8_LEN &&
        memcmp(current, GRN_TOKENIZER_END_MARK_UTF8, char_len) == 0) {
      break_by_end_mark = GRN_TRUE;
    }

    if (break_by_blank || break_by_end_mark) {
      break;
    }

    if (n_characters == ngram_unit) {
      break;
    }
  }

  if (tokenizer->is_overlapping) {
    status |= GRN_TOKEN_OVERLAP;
  }
  if (n_characters < ngram_unit) {
    status |= GRN_TOKEN_UNMATURED;
  }
  tokenizer->is_overlapping = (n_characters > 1);

  if (mode == GRN_TOKEN_GET) {
    if (current == end) {
      tokenizer->is_end = GRN_TRUE;
      status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
      if (status & GRN_TOKEN_UNMATURED) {
        status |= GRN_TOKEN_FORCE_PREFIX;
      }
    } else {
      if (break_by_blank) {
        tokenizer->get.n_skip_tokens = 0;
        tokenizer->is_start_token = GRN_TRUE;
      } else if (break_by_end_mark) {
        if (!is_start_token && (status & GRN_TOKEN_UNMATURED)) {
          status |= GRN_TOKEN_SKIP;
        }
      } else if (tokenizer->get.n_skip_tokens > 0) {
        /* Only every ngram_unit-th overlapping token is needed to match. */
        tokenizer->get.n_skip_tokens--;
        status |= GRN_TOKEN_SKIP;
      } else {
        tokenizer->get.n_skip_tokens = ngram_unit - 1;
      }
    }
  } else {
    if (tokenizer->next == end) {
      tokenizer->is_end = GRN_TRUE;
    }
    if (break_by_blank) {
      tokenizer->is_start_token = GRN_TRUE;
    }
  }

  grn_tokenizer_token_push(ctx,
                           &(tokenizer->token),
                           GRN_TEXT_VALUE(buffer),
                           GRN_TEXT_LEN(buffer),
                           status);

  return NULL;
}
723
724static grn_obj *
725regexp_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
726{
727 grn_regexp_tokenizer *tokenizer = user_data->ptr;
728 if (!tokenizer) {
729 return NULL;
730 }
731 grn_tokenizer_token_fin(ctx, &(tokenizer->token));
732 grn_tokenizer_query_close(ctx, tokenizer->query);
733 GRN_OBJ_FIN(ctx, &(tokenizer->buffer));
734 GRN_FREE(tokenizer);
735 return NULL;
736}
737
738/* external */
739
740grn_rc
741grn_tokenizers_init(void)
742{
743 static grn_proc _grn_tokenizer_uvector;
744 _grn_tokenizer_uvector.obj.db = NULL;
745 _grn_tokenizer_uvector.obj.id = GRN_ID_NIL;
746 _grn_tokenizer_uvector.obj.header.domain = GRN_ID_NIL;
747 _grn_tokenizer_uvector.obj.range = GRN_ID_NIL;
748 _grn_tokenizer_uvector.funcs[PROC_INIT] = uvector_init;
749 _grn_tokenizer_uvector.funcs[PROC_NEXT] = uvector_next;
750 _grn_tokenizer_uvector.funcs[PROC_FIN] = uvector_fin;
751 grn_tokenizer_uvector = (grn_obj *)&_grn_tokenizer_uvector;
752 return GRN_SUCCESS;
753}
754
/* Counterpart of grn_tokenizers_init(): nothing to release because the
   built-in uvector tokenizer proc uses static storage. */
grn_rc
grn_tokenizers_fin(void)
{
  return GRN_SUCCESS;
}
760
/*
 * Registers the MeCab tokenizer for encodings MeCab supports
 * (EUC-JP, UTF-8, Shift_JIS); other encodings get
 * GRN_OPERATION_NOT_SUPPORTED.
 *
 * Embedded builds call the statically linked plugin's init/register/fin
 * entry points directly; otherwise the tokenizers/mecab plugin is located
 * on disk and registered dynamically.
 */
grn_rc
grn_db_init_mecab_tokenizer(grn_ctx *ctx)
{
  switch (GRN_CTX_GET_ENCODING(ctx)) {
  case GRN_ENC_EUC_JP :
  case GRN_ENC_UTF8 :
  case GRN_ENC_SJIS :
#if defined(GRN_EMBEDDED) && defined(GRN_WITH_MECAB)
    {
      GRN_PLUGIN_DECLARE_FUNCTIONS(tokenizers_mecab);
      grn_rc rc;
      rc = GRN_PLUGIN_IMPL_NAME_TAGGED(init, tokenizers_mecab)(ctx);
      if (rc == GRN_SUCCESS) {
        rc = GRN_PLUGIN_IMPL_NAME_TAGGED(register, tokenizers_mecab)(ctx);
        if (rc != GRN_SUCCESS) {
          /* Registration failed: undo the plugin's init. */
          GRN_PLUGIN_IMPL_NAME_TAGGED(fin, tokenizers_mecab)(ctx);
        }
      }
      return rc;
    }
#else /* defined(GRN_EMBEDDED) && defined(GRN_WITH_MECAB) */
    {
      const char *mecab_plugin_name = "tokenizers/mecab";
      char *path;
      path = grn_plugin_find_path(ctx, mecab_plugin_name);
      if (path) {
        GRN_FREE(path);
        return grn_plugin_register(ctx, mecab_plugin_name);
      } else {
        return GRN_NO_SUCH_FILE_OR_DIRECTORY;
      }
    }
#endif /* defined(GRN_EMBEDDED) && defined(GRN_WITH_MECAB) */
    break;
  default :
    return GRN_OPERATION_NOT_SUPPORTED;
  }
}
799
/* Registers one built-in tokenizer proc; `vars` is the 3-element
   (string, token, mode) descriptor array declared in
   grn_db_init_builtin_tokenizers(). Expands to the created grn_obj*. */
#define DEF_TOKENIZER(name, init, next, fin, vars)\
  (grn_proc_create(ctx, (name), (sizeof(name) - 1),\
                   GRN_PROC_TOKENIZER, (init), (next), (fin), 3, (vars)))
803
/*
 * Registers all built-in tokenizers in the current database.
 *
 * The first four (Delimit/Unigram/Bigram/Trigram) must receive their
 * reserved builtin object IDs; a mismatch means the database layout is
 * broken and GRN_FILE_CORRUPT is returned. The remaining variants have no
 * reserved IDs, so their registration results are intentionally ignored.
 */
grn_rc
grn_db_init_builtin_tokenizers(grn_ctx *ctx)
{
  grn_obj *obj;
  grn_expr_var vars[] = {
    {NULL, 0},
    {NULL, 0},
    {NULL, 0}
  };
  GRN_TEXT_INIT(&vars[0].value, 0);
  GRN_TEXT_INIT(&vars[1].value, 0);
  GRN_UINT32_INIT(&vars[2].value, 0);

  {
    /* Opt-out switch for blank removal in the n-gram normalizer flags. */
    char grn_ngram_tokenizer_remove_blank_disable_env[GRN_ENV_BUFFER_SIZE];

    grn_getenv("GRN_NGRAM_TOKENIZER_REMOVE_BLANK_DISABLE",
               grn_ngram_tokenizer_remove_blank_disable_env,
               GRN_ENV_BUFFER_SIZE);
    if (grn_ngram_tokenizer_remove_blank_disable_env[0]) {
      grn_ngram_tokenizer_remove_blank_disable = GRN_TRUE;
    }
  }

  obj = DEF_TOKENIZER("TokenDelimit",
                      delimit_init, delimited_next, delimited_fin, vars);
  if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_DELIMIT) { return GRN_FILE_CORRUPT; }
  obj = DEF_TOKENIZER("TokenUnigram",
                      unigram_init, ngram_next, ngram_fin, vars);
  if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_UNIGRAM) { return GRN_FILE_CORRUPT; }
  obj = DEF_TOKENIZER("TokenBigram",
                      bigram_init, ngram_next, ngram_fin, vars);
  if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_BIGRAM) { return GRN_FILE_CORRUPT; }
  obj = DEF_TOKENIZER("TokenTrigram",
                      trigram_init, ngram_next, ngram_fin, vars);
  if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_TRIGRAM) { return GRN_FILE_CORRUPT; }

  DEF_TOKENIZER("TokenBigramSplitSymbol",
                bigrams_init, ngram_next, ngram_fin, vars);
  DEF_TOKENIZER("TokenBigramSplitSymbolAlpha",
                bigramsa_init, ngram_next, ngram_fin, vars);
  DEF_TOKENIZER("TokenBigramSplitSymbolAlphaDigit",
                bigramsad_init, ngram_next, ngram_fin, vars);
  DEF_TOKENIZER("TokenBigramIgnoreBlank",
                bigrami_init, ngram_next, ngram_fin, vars);
  DEF_TOKENIZER("TokenBigramIgnoreBlankSplitSymbol",
                bigramis_init, ngram_next, ngram_fin, vars);
  DEF_TOKENIZER("TokenBigramIgnoreBlankSplitSymbolAlpha",
                bigramisa_init, ngram_next, ngram_fin, vars);
  DEF_TOKENIZER("TokenBigramIgnoreBlankSplitSymbolAlphaDigit",
                bigramisad_init, ngram_next, ngram_fin, vars);
  DEF_TOKENIZER("TokenDelimitNull",
                delimit_null_init, delimited_next, delimited_fin, vars);
  DEF_TOKENIZER("TokenRegexp",
                regexp_init, regexp_next, regexp_fin, vars);
  return GRN_SUCCESS;
}
861