1/* -*- c-basic-offset: 2 -*- */
2/*
3 Copyright(C) 2009-2017 Brazil
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License version 2.1 as published by the Free Software Foundation.
8
9 This library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
13
14 You should have received a copy of the GNU Lesser General Public
15 License along with this library; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17*/
18#include "grn_token_cursor.h"
19#include "grn_string.h"
20#include "grn_pat.h"
21#include "grn_dat.h"
22
23static void
24grn_token_cursor_open_initialize_token_filters(grn_ctx *ctx,
25 grn_token_cursor *token_cursor)
26{
27 grn_obj *token_filters = token_cursor->token_filter.objects;
28 unsigned int i, n_token_filters;
29
30 token_cursor->token_filter.data = NULL;
31
32 if (token_filters) {
33 n_token_filters = GRN_BULK_VSIZE(token_filters) / sizeof(grn_obj *);
34 } else {
35 n_token_filters = 0;
36 }
37
38 if (n_token_filters == 0) {
39 return;
40 }
41
42 token_cursor->token_filter.data = GRN_CALLOC(sizeof(void *) * n_token_filters);
43 if (!token_cursor->token_filter.data) {
44 return;
45 }
46
47 for (i = 0; i < n_token_filters; i++) {
48 grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i);
49 grn_proc *token_filter = (grn_proc *)token_filter_object;
50
51 token_cursor->token_filter.data[i] =
52 token_filter->callbacks.token_filter.init(ctx,
53 token_cursor->table,
54 token_cursor->mode);
55 }
56}
57
58grn_token_cursor *
59grn_token_cursor_open(grn_ctx *ctx, grn_obj *table,
60 const char *str, size_t str_len,
61 grn_tokenize_mode mode, unsigned int flags)
62{
63 grn_token_cursor *token_cursor;
64 grn_encoding encoding;
65 grn_obj *tokenizer;
66 grn_obj *normalizer;
67 grn_obj *token_filters;
68 grn_table_flags table_flags;
69 if (grn_table_get_info(ctx, table, &table_flags, &encoding, &tokenizer,
70 &normalizer, &token_filters)) {
71 return NULL;
72 }
73 if (!(token_cursor = GRN_MALLOC(sizeof(grn_token_cursor)))) { return NULL; }
74 token_cursor->table = table;
75 token_cursor->mode = mode;
76 token_cursor->encoding = encoding;
77 token_cursor->tokenizer = tokenizer;
78 token_cursor->token_filter.objects = token_filters;
79 token_cursor->token_filter.data = NULL;
80 token_cursor->orig = (const unsigned char *)str;
81 token_cursor->orig_blen = str_len;
82 token_cursor->curr = NULL;
83 token_cursor->nstr = NULL;
84 token_cursor->curr_size = 0;
85 token_cursor->pos = -1;
86 token_cursor->status = GRN_TOKEN_CURSOR_DOING;
87 token_cursor->force_prefix = GRN_FALSE;
88 if (tokenizer) {
89 grn_obj str_, flags_, mode_;
90 GRN_TEXT_INIT(&str_, GRN_OBJ_DO_SHALLOW_COPY);
91 GRN_TEXT_SET_REF(&str_, str, str_len);
92 GRN_UINT32_INIT(&flags_, 0);
93 GRN_UINT32_SET(ctx, &flags_, flags);
94 GRN_UINT32_INIT(&mode_, 0);
95 GRN_UINT32_SET(ctx, &mode_, mode);
96 token_cursor->pctx.caller = NULL;
97 token_cursor->pctx.user_data.ptr = NULL;
98 token_cursor->pctx.proc = (grn_proc *)tokenizer;
99 token_cursor->pctx.hooks = NULL;
100 token_cursor->pctx.currh = NULL;
101 token_cursor->pctx.phase = PROC_INIT;
102 grn_ctx_push(ctx, &mode_);
103 grn_ctx_push(ctx, &str_);
104 grn_ctx_push(ctx, &flags_);
105 ((grn_proc *)tokenizer)->funcs[PROC_INIT](ctx, 1, &table, &token_cursor->pctx.user_data);
106 grn_obj_close(ctx, &flags_);
107 grn_obj_close(ctx, &str_);
108 grn_obj_close(ctx, &mode_);
109 } else {
110 int nflags = 0;
111 token_cursor->nstr = grn_string_open_(ctx, str, str_len,
112 normalizer,
113 nflags,
114 token_cursor->encoding);
115 if (token_cursor->nstr) {
116 const char *normalized;
117 grn_string_get_normalized(ctx, token_cursor->nstr,
118 &normalized, &(token_cursor->curr_size), NULL);
119 token_cursor->curr = (const unsigned char *)normalized;
120 } else {
121 ERR(GRN_TOKENIZER_ERROR,
122 "[token-cursor][open] failed to grn_string_open()");
123 }
124 }
125
126 if (ctx->rc == GRN_SUCCESS) {
127 grn_token_cursor_open_initialize_token_filters(ctx, token_cursor);
128 }
129
130 if (ctx->rc) {
131 grn_token_cursor_close(ctx, token_cursor);
132 token_cursor = NULL;
133 }
134 return token_cursor;
135}
136
137static int
138grn_token_cursor_next_apply_token_filters(grn_ctx *ctx,
139 grn_token_cursor *token_cursor,
140 grn_obj *current_token_data,
141 grn_obj *status)
142{
143 grn_obj *token_filters = token_cursor->token_filter.objects;
144 unsigned int i, n_token_filters;
145 grn_token current_token;
146 grn_token next_token;
147
148 if (token_filters) {
149 n_token_filters = GRN_BULK_VSIZE(token_filters) / sizeof(grn_obj *);
150 } else {
151 n_token_filters = 0;
152 }
153
154 GRN_TEXT_INIT(&(current_token.data), GRN_OBJ_DO_SHALLOW_COPY);
155 GRN_TEXT_SET(ctx, &(current_token.data),
156 GRN_TEXT_VALUE(current_token_data),
157 GRN_TEXT_LEN(current_token_data));
158 current_token.status = GRN_INT32_VALUE(status);
159 GRN_TEXT_INIT(&(next_token.data), GRN_OBJ_DO_SHALLOW_COPY);
160 GRN_TEXT_SET(ctx, &(next_token.data),
161 GRN_TEXT_VALUE(&(current_token.data)),
162 GRN_TEXT_LEN(&(current_token.data)));
163 next_token.status = current_token.status;
164
165 for (i = 0; i < n_token_filters; i++) {
166 grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i);
167 grn_proc *token_filter = (grn_proc *)token_filter_object;
168 void *data = token_cursor->token_filter.data[i];
169
170#define SKIP_FLAGS\
171 (GRN_TOKEN_SKIP |\
172 GRN_TOKEN_SKIP_WITH_POSITION)
173 if (current_token.status & SKIP_FLAGS) {
174 break;
175 }
176#undef SKIP_FLAGS
177
178 token_filter->callbacks.token_filter.filter(ctx,
179 &current_token,
180 &next_token,
181 data);
182 GRN_TEXT_SET(ctx, &(current_token.data),
183 GRN_TEXT_VALUE(&(next_token.data)),
184 GRN_TEXT_LEN(&(next_token.data)));
185 current_token.status = next_token.status;
186 }
187
188 token_cursor->curr =
189 (const unsigned char *)GRN_TEXT_VALUE(&(current_token.data));
190 token_cursor->curr_size = GRN_TEXT_LEN(&(current_token.data));
191
192 return current_token.status;
193}
194
195grn_id
196grn_token_cursor_next(grn_ctx *ctx, grn_token_cursor *token_cursor)
197{
198 int status;
199 grn_id tid = GRN_ID_NIL;
200 grn_obj *table = token_cursor->table;
201 grn_obj *tokenizer = token_cursor->tokenizer;
202 while (token_cursor->status != GRN_TOKEN_CURSOR_DONE) {
203 if (tokenizer) {
204 grn_obj *curr_, *stat_;
205 ((grn_proc *)tokenizer)->funcs[PROC_NEXT](ctx, 1, &table, &token_cursor->pctx.user_data);
206 stat_ = grn_ctx_pop(ctx);
207 curr_ = grn_ctx_pop(ctx);
208 status = grn_token_cursor_next_apply_token_filters(ctx, token_cursor,
209 curr_, stat_);
210 token_cursor->status =
211 ((status & GRN_TOKEN_LAST) ||
212 (token_cursor->mode == GRN_TOKENIZE_GET &&
213 (status & GRN_TOKEN_REACH_END)))
214 ? GRN_TOKEN_CURSOR_DONE : GRN_TOKEN_CURSOR_DOING;
215 token_cursor->force_prefix = GRN_FALSE;
216#define SKIP_FLAGS \
217 (GRN_TOKEN_SKIP | GRN_TOKEN_SKIP_WITH_POSITION)
218 if (status & SKIP_FLAGS) {
219 if (status & GRN_TOKEN_SKIP) {
220 token_cursor->pos++;
221 }
222 if (token_cursor->status == GRN_TOKEN_CURSOR_DONE && tid == GRN_ID_NIL) {
223 token_cursor->status = GRN_TOKEN_CURSOR_DONE_SKIP;
224 break;
225 } else {
226 continue;
227 }
228 }
229#undef SKIP_FLAGS
230 if (status & GRN_TOKEN_FORCE_PREFIX) {
231 token_cursor->force_prefix = GRN_TRUE;
232 }
233 if (token_cursor->curr_size == 0) {
234 if (token_cursor->status != GRN_TOKEN_CURSOR_DONE) {
235 char tokenizer_name[GRN_TABLE_MAX_KEY_SIZE];
236 int tokenizer_name_length;
237 tokenizer_name_length =
238 grn_obj_name(ctx, token_cursor->tokenizer,
239 tokenizer_name, GRN_TABLE_MAX_KEY_SIZE);
240 GRN_LOG(ctx, GRN_WARN,
241 "[token_next] ignore an empty token: <%.*s>: <%.*s>",
242 tokenizer_name_length, tokenizer_name,
243 token_cursor->orig_blen, token_cursor->orig);
244 }
245 continue;
246 }
247 if (token_cursor->curr_size > GRN_TABLE_MAX_KEY_SIZE) {
248 GRN_LOG(ctx, GRN_WARN,
249 "[token_next] ignore too long token. "
250 "Token must be less than or equal to %d: <%d>(<%.*s>)",
251 GRN_TABLE_MAX_KEY_SIZE,
252 token_cursor->curr_size,
253 token_cursor->curr_size, token_cursor->curr);
254 continue;
255 }
256 if (status & GRN_TOKEN_UNMATURED) {
257 if (status & GRN_TOKEN_OVERLAP) {
258 if (token_cursor->mode == GRN_TOKENIZE_GET) {
259 token_cursor->pos++;
260 continue;
261 }
262 } else {
263 if (status & GRN_TOKEN_REACH_END) {
264 token_cursor->force_prefix = GRN_TRUE;
265 }
266 }
267 }
268 } else {
269 token_cursor->status = GRN_TOKEN_CURSOR_DONE;
270 }
271 if (token_cursor->mode == GRN_TOKENIZE_ADD) {
272 switch (table->header.type) {
273 case GRN_TABLE_PAT_KEY :
274 if (grn_io_lock(ctx, ((grn_pat *)table)->io, grn_lock_timeout)) {
275 tid = GRN_ID_NIL;
276 } else {
277 tid = grn_pat_add(ctx, (grn_pat *)table, token_cursor->curr, token_cursor->curr_size,
278 NULL, NULL);
279 grn_io_unlock(((grn_pat *)table)->io);
280 }
281 break;
282 case GRN_TABLE_DAT_KEY :
283 if (grn_io_lock(ctx, ((grn_dat *)table)->io, grn_lock_timeout)) {
284 tid = GRN_ID_NIL;
285 } else {
286 tid = grn_dat_add(ctx, (grn_dat *)table, token_cursor->curr, token_cursor->curr_size,
287 NULL, NULL);
288 grn_io_unlock(((grn_dat *)table)->io);
289 }
290 break;
291 case GRN_TABLE_HASH_KEY :
292 if (grn_io_lock(ctx, ((grn_hash *)table)->io, grn_lock_timeout)) {
293 tid = GRN_ID_NIL;
294 } else {
295 tid = grn_hash_add(ctx, (grn_hash *)table, token_cursor->curr, token_cursor->curr_size,
296 NULL, NULL);
297 grn_io_unlock(((grn_hash *)table)->io);
298 }
299 break;
300 case GRN_TABLE_NO_KEY :
301 if (token_cursor->curr_size == sizeof(grn_id)) {
302 tid = *((grn_id *)token_cursor->curr);
303 } else {
304 tid = GRN_ID_NIL;
305 }
306 break;
307 }
308 } else if (token_cursor->mode != GRN_TOKENIZE_ONLY) {
309 switch (table->header.type) {
310 case GRN_TABLE_PAT_KEY :
311 tid = grn_pat_get(ctx, (grn_pat *)table, token_cursor->curr, token_cursor->curr_size, NULL);
312 break;
313 case GRN_TABLE_DAT_KEY :
314 tid = grn_dat_get(ctx, (grn_dat *)table, token_cursor->curr, token_cursor->curr_size, NULL);
315 break;
316 case GRN_TABLE_HASH_KEY :
317 tid = grn_hash_get(ctx, (grn_hash *)table, token_cursor->curr, token_cursor->curr_size, NULL);
318 break;
319 case GRN_TABLE_NO_KEY :
320 if (token_cursor->curr_size == sizeof(grn_id)) {
321 tid = *((grn_id *)token_cursor->curr);
322 } else {
323 tid = GRN_ID_NIL;
324 }
325 break;
326 }
327 }
328 if (token_cursor->mode != GRN_TOKENIZE_ONLY &&
329 tid == GRN_ID_NIL && token_cursor->status != GRN_TOKEN_CURSOR_DONE) {
330 token_cursor->status = GRN_TOKEN_CURSOR_NOT_FOUND;
331 }
332 token_cursor->pos++;
333 break;
334 }
335 return tid;
336}
337
338static void
339grn_token_cursor_close_token_filters(grn_ctx *ctx,
340 grn_token_cursor *token_cursor)
341{
342 grn_obj *token_filters = token_cursor->token_filter.objects;
343 unsigned int i, n_token_filters;
344
345 if (!token_cursor->token_filter.data) {
346 return;
347 }
348
349 if (token_filters) {
350 n_token_filters = GRN_BULK_VSIZE(token_filters) / sizeof(grn_obj *);
351 } else {
352 n_token_filters = 0;
353 }
354
355 if (n_token_filters == 0) {
356 return;
357 }
358
359 for (i = 0; i < n_token_filters; i++) {
360 grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i);
361 grn_proc *token_filter = (grn_proc *)token_filter_object;
362 void *data = token_cursor->token_filter.data[i];
363
364 token_filter->callbacks.token_filter.fin(ctx, data);
365 }
366 GRN_FREE(token_cursor->token_filter.data);
367}
368
369grn_rc
370grn_token_cursor_close(grn_ctx *ctx, grn_token_cursor *token_cursor)
371{
372 if (token_cursor) {
373 if (token_cursor->tokenizer) {
374 ((grn_proc *)token_cursor->tokenizer)->funcs[PROC_FIN](ctx, 1, &token_cursor->table,
375 &token_cursor->pctx.user_data);
376 }
377 grn_token_cursor_close_token_filters(ctx, token_cursor);
378 if (token_cursor->nstr) {
379 grn_obj_close(ctx, token_cursor->nstr);
380 }
381 GRN_FREE(token_cursor);
382 return GRN_SUCCESS;
383 } else {
384 return GRN_INVALID_ARGUMENT;
385 }
386}
387