1 | /* -*- c-basic-offset: 2 -*- */ |
2 | /* |
3 | Copyright(C) 2009-2017 Brazil |
4 | |
5 | This library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License version 2.1 as published by the Free Software Foundation. |
8 | |
9 | This library is distributed in the hope that it will be useful, |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | Lesser General Public License for more details. |
13 | |
14 | You should have received a copy of the GNU Lesser General Public |
15 | License along with this library; if not, write to the Free Software |
16 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
17 | */ |
18 | #include "grn_token_cursor.h" |
19 | #include "grn_string.h" |
20 | #include "grn_pat.h" |
21 | #include "grn_dat.h" |
22 | |
23 | static void |
24 | grn_token_cursor_open_initialize_token_filters(grn_ctx *ctx, |
25 | grn_token_cursor *token_cursor) |
26 | { |
27 | grn_obj *token_filters = token_cursor->token_filter.objects; |
28 | unsigned int i, n_token_filters; |
29 | |
30 | token_cursor->token_filter.data = NULL; |
31 | |
32 | if (token_filters) { |
33 | n_token_filters = GRN_BULK_VSIZE(token_filters) / sizeof(grn_obj *); |
34 | } else { |
35 | n_token_filters = 0; |
36 | } |
37 | |
38 | if (n_token_filters == 0) { |
39 | return; |
40 | } |
41 | |
42 | token_cursor->token_filter.data = GRN_CALLOC(sizeof(void *) * n_token_filters); |
43 | if (!token_cursor->token_filter.data) { |
44 | return; |
45 | } |
46 | |
47 | for (i = 0; i < n_token_filters; i++) { |
48 | grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i); |
49 | grn_proc *token_filter = (grn_proc *)token_filter_object; |
50 | |
51 | token_cursor->token_filter.data[i] = |
52 | token_filter->callbacks.token_filter.init(ctx, |
53 | token_cursor->table, |
54 | token_cursor->mode); |
55 | } |
56 | } |
57 | |
58 | grn_token_cursor * |
59 | grn_token_cursor_open(grn_ctx *ctx, grn_obj *table, |
60 | const char *str, size_t str_len, |
61 | grn_tokenize_mode mode, unsigned int flags) |
62 | { |
63 | grn_token_cursor *token_cursor; |
64 | grn_encoding encoding; |
65 | grn_obj *tokenizer; |
66 | grn_obj *normalizer; |
67 | grn_obj *token_filters; |
68 | grn_table_flags table_flags; |
69 | if (grn_table_get_info(ctx, table, &table_flags, &encoding, &tokenizer, |
70 | &normalizer, &token_filters)) { |
71 | return NULL; |
72 | } |
73 | if (!(token_cursor = GRN_MALLOC(sizeof(grn_token_cursor)))) { return NULL; } |
74 | token_cursor->table = table; |
75 | token_cursor->mode = mode; |
76 | token_cursor->encoding = encoding; |
77 | token_cursor->tokenizer = tokenizer; |
78 | token_cursor->token_filter.objects = token_filters; |
79 | token_cursor->token_filter.data = NULL; |
80 | token_cursor->orig = (const unsigned char *)str; |
81 | token_cursor->orig_blen = str_len; |
82 | token_cursor->curr = NULL; |
83 | token_cursor->nstr = NULL; |
84 | token_cursor->curr_size = 0; |
85 | token_cursor->pos = -1; |
86 | token_cursor->status = GRN_TOKEN_CURSOR_DOING; |
87 | token_cursor->force_prefix = GRN_FALSE; |
88 | if (tokenizer) { |
89 | grn_obj str_, flags_, mode_; |
90 | GRN_TEXT_INIT(&str_, GRN_OBJ_DO_SHALLOW_COPY); |
91 | GRN_TEXT_SET_REF(&str_, str, str_len); |
92 | GRN_UINT32_INIT(&flags_, 0); |
93 | GRN_UINT32_SET(ctx, &flags_, flags); |
94 | GRN_UINT32_INIT(&mode_, 0); |
95 | GRN_UINT32_SET(ctx, &mode_, mode); |
96 | token_cursor->pctx.caller = NULL; |
97 | token_cursor->pctx.user_data.ptr = NULL; |
98 | token_cursor->pctx.proc = (grn_proc *)tokenizer; |
99 | token_cursor->pctx.hooks = NULL; |
100 | token_cursor->pctx.currh = NULL; |
101 | token_cursor->pctx.phase = PROC_INIT; |
102 | grn_ctx_push(ctx, &mode_); |
103 | grn_ctx_push(ctx, &str_); |
104 | grn_ctx_push(ctx, &flags_); |
105 | ((grn_proc *)tokenizer)->funcs[PROC_INIT](ctx, 1, &table, &token_cursor->pctx.user_data); |
106 | grn_obj_close(ctx, &flags_); |
107 | grn_obj_close(ctx, &str_); |
108 | grn_obj_close(ctx, &mode_); |
109 | } else { |
110 | int nflags = 0; |
111 | token_cursor->nstr = grn_string_open_(ctx, str, str_len, |
112 | normalizer, |
113 | nflags, |
114 | token_cursor->encoding); |
115 | if (token_cursor->nstr) { |
116 | const char *normalized; |
117 | grn_string_get_normalized(ctx, token_cursor->nstr, |
118 | &normalized, &(token_cursor->curr_size), NULL); |
119 | token_cursor->curr = (const unsigned char *)normalized; |
120 | } else { |
121 | ERR(GRN_TOKENIZER_ERROR, |
122 | "[token-cursor][open] failed to grn_string_open()" ); |
123 | } |
124 | } |
125 | |
126 | if (ctx->rc == GRN_SUCCESS) { |
127 | grn_token_cursor_open_initialize_token_filters(ctx, token_cursor); |
128 | } |
129 | |
130 | if (ctx->rc) { |
131 | grn_token_cursor_close(ctx, token_cursor); |
132 | token_cursor = NULL; |
133 | } |
134 | return token_cursor; |
135 | } |
136 | |
137 | static int |
138 | grn_token_cursor_next_apply_token_filters(grn_ctx *ctx, |
139 | grn_token_cursor *token_cursor, |
140 | grn_obj *current_token_data, |
141 | grn_obj *status) |
142 | { |
143 | grn_obj *token_filters = token_cursor->token_filter.objects; |
144 | unsigned int i, n_token_filters; |
145 | grn_token current_token; |
146 | grn_token next_token; |
147 | |
148 | if (token_filters) { |
149 | n_token_filters = GRN_BULK_VSIZE(token_filters) / sizeof(grn_obj *); |
150 | } else { |
151 | n_token_filters = 0; |
152 | } |
153 | |
154 | GRN_TEXT_INIT(&(current_token.data), GRN_OBJ_DO_SHALLOW_COPY); |
155 | GRN_TEXT_SET(ctx, &(current_token.data), |
156 | GRN_TEXT_VALUE(current_token_data), |
157 | GRN_TEXT_LEN(current_token_data)); |
158 | current_token.status = GRN_INT32_VALUE(status); |
159 | GRN_TEXT_INIT(&(next_token.data), GRN_OBJ_DO_SHALLOW_COPY); |
160 | GRN_TEXT_SET(ctx, &(next_token.data), |
161 | GRN_TEXT_VALUE(&(current_token.data)), |
162 | GRN_TEXT_LEN(&(current_token.data))); |
163 | next_token.status = current_token.status; |
164 | |
165 | for (i = 0; i < n_token_filters; i++) { |
166 | grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i); |
167 | grn_proc *token_filter = (grn_proc *)token_filter_object; |
168 | void *data = token_cursor->token_filter.data[i]; |
169 | |
170 | #define SKIP_FLAGS\ |
171 | (GRN_TOKEN_SKIP |\ |
172 | GRN_TOKEN_SKIP_WITH_POSITION) |
173 | if (current_token.status & SKIP_FLAGS) { |
174 | break; |
175 | } |
176 | #undef SKIP_FLAGS |
177 | |
178 | token_filter->callbacks.token_filter.filter(ctx, |
179 | ¤t_token, |
180 | &next_token, |
181 | data); |
182 | GRN_TEXT_SET(ctx, &(current_token.data), |
183 | GRN_TEXT_VALUE(&(next_token.data)), |
184 | GRN_TEXT_LEN(&(next_token.data))); |
185 | current_token.status = next_token.status; |
186 | } |
187 | |
188 | token_cursor->curr = |
189 | (const unsigned char *)GRN_TEXT_VALUE(&(current_token.data)); |
190 | token_cursor->curr_size = GRN_TEXT_LEN(&(current_token.data)); |
191 | |
192 | return current_token.status; |
193 | } |
194 | |
195 | grn_id |
196 | grn_token_cursor_next(grn_ctx *ctx, grn_token_cursor *token_cursor) |
197 | { |
198 | int status; |
199 | grn_id tid = GRN_ID_NIL; |
200 | grn_obj *table = token_cursor->table; |
201 | grn_obj *tokenizer = token_cursor->tokenizer; |
202 | while (token_cursor->status != GRN_TOKEN_CURSOR_DONE) { |
203 | if (tokenizer) { |
204 | grn_obj *curr_, *stat_; |
205 | ((grn_proc *)tokenizer)->funcs[PROC_NEXT](ctx, 1, &table, &token_cursor->pctx.user_data); |
206 | stat_ = grn_ctx_pop(ctx); |
207 | curr_ = grn_ctx_pop(ctx); |
208 | status = grn_token_cursor_next_apply_token_filters(ctx, token_cursor, |
209 | curr_, stat_); |
210 | token_cursor->status = |
211 | ((status & GRN_TOKEN_LAST) || |
212 | (token_cursor->mode == GRN_TOKENIZE_GET && |
213 | (status & GRN_TOKEN_REACH_END))) |
214 | ? GRN_TOKEN_CURSOR_DONE : GRN_TOKEN_CURSOR_DOING; |
215 | token_cursor->force_prefix = GRN_FALSE; |
216 | #define SKIP_FLAGS \ |
217 | (GRN_TOKEN_SKIP | GRN_TOKEN_SKIP_WITH_POSITION) |
218 | if (status & SKIP_FLAGS) { |
219 | if (status & GRN_TOKEN_SKIP) { |
220 | token_cursor->pos++; |
221 | } |
222 | if (token_cursor->status == GRN_TOKEN_CURSOR_DONE && tid == GRN_ID_NIL) { |
223 | token_cursor->status = GRN_TOKEN_CURSOR_DONE_SKIP; |
224 | break; |
225 | } else { |
226 | continue; |
227 | } |
228 | } |
229 | #undef SKIP_FLAGS |
230 | if (status & GRN_TOKEN_FORCE_PREFIX) { |
231 | token_cursor->force_prefix = GRN_TRUE; |
232 | } |
233 | if (token_cursor->curr_size == 0) { |
234 | if (token_cursor->status != GRN_TOKEN_CURSOR_DONE) { |
235 | char tokenizer_name[GRN_TABLE_MAX_KEY_SIZE]; |
236 | int tokenizer_name_length; |
237 | tokenizer_name_length = |
238 | grn_obj_name(ctx, token_cursor->tokenizer, |
239 | tokenizer_name, GRN_TABLE_MAX_KEY_SIZE); |
240 | GRN_LOG(ctx, GRN_WARN, |
241 | "[token_next] ignore an empty token: <%.*s>: <%.*s>" , |
242 | tokenizer_name_length, tokenizer_name, |
243 | token_cursor->orig_blen, token_cursor->orig); |
244 | } |
245 | continue; |
246 | } |
247 | if (token_cursor->curr_size > GRN_TABLE_MAX_KEY_SIZE) { |
248 | GRN_LOG(ctx, GRN_WARN, |
249 | "[token_next] ignore too long token. " |
250 | "Token must be less than or equal to %d: <%d>(<%.*s>)" , |
251 | GRN_TABLE_MAX_KEY_SIZE, |
252 | token_cursor->curr_size, |
253 | token_cursor->curr_size, token_cursor->curr); |
254 | continue; |
255 | } |
256 | if (status & GRN_TOKEN_UNMATURED) { |
257 | if (status & GRN_TOKEN_OVERLAP) { |
258 | if (token_cursor->mode == GRN_TOKENIZE_GET) { |
259 | token_cursor->pos++; |
260 | continue; |
261 | } |
262 | } else { |
263 | if (status & GRN_TOKEN_REACH_END) { |
264 | token_cursor->force_prefix = GRN_TRUE; |
265 | } |
266 | } |
267 | } |
268 | } else { |
269 | token_cursor->status = GRN_TOKEN_CURSOR_DONE; |
270 | } |
271 | if (token_cursor->mode == GRN_TOKENIZE_ADD) { |
272 | switch (table->header.type) { |
273 | case GRN_TABLE_PAT_KEY : |
274 | if (grn_io_lock(ctx, ((grn_pat *)table)->io, grn_lock_timeout)) { |
275 | tid = GRN_ID_NIL; |
276 | } else { |
277 | tid = grn_pat_add(ctx, (grn_pat *)table, token_cursor->curr, token_cursor->curr_size, |
278 | NULL, NULL); |
279 | grn_io_unlock(((grn_pat *)table)->io); |
280 | } |
281 | break; |
282 | case GRN_TABLE_DAT_KEY : |
283 | if (grn_io_lock(ctx, ((grn_dat *)table)->io, grn_lock_timeout)) { |
284 | tid = GRN_ID_NIL; |
285 | } else { |
286 | tid = grn_dat_add(ctx, (grn_dat *)table, token_cursor->curr, token_cursor->curr_size, |
287 | NULL, NULL); |
288 | grn_io_unlock(((grn_dat *)table)->io); |
289 | } |
290 | break; |
291 | case GRN_TABLE_HASH_KEY : |
292 | if (grn_io_lock(ctx, ((grn_hash *)table)->io, grn_lock_timeout)) { |
293 | tid = GRN_ID_NIL; |
294 | } else { |
295 | tid = grn_hash_add(ctx, (grn_hash *)table, token_cursor->curr, token_cursor->curr_size, |
296 | NULL, NULL); |
297 | grn_io_unlock(((grn_hash *)table)->io); |
298 | } |
299 | break; |
300 | case GRN_TABLE_NO_KEY : |
301 | if (token_cursor->curr_size == sizeof(grn_id)) { |
302 | tid = *((grn_id *)token_cursor->curr); |
303 | } else { |
304 | tid = GRN_ID_NIL; |
305 | } |
306 | break; |
307 | } |
308 | } else if (token_cursor->mode != GRN_TOKENIZE_ONLY) { |
309 | switch (table->header.type) { |
310 | case GRN_TABLE_PAT_KEY : |
311 | tid = grn_pat_get(ctx, (grn_pat *)table, token_cursor->curr, token_cursor->curr_size, NULL); |
312 | break; |
313 | case GRN_TABLE_DAT_KEY : |
314 | tid = grn_dat_get(ctx, (grn_dat *)table, token_cursor->curr, token_cursor->curr_size, NULL); |
315 | break; |
316 | case GRN_TABLE_HASH_KEY : |
317 | tid = grn_hash_get(ctx, (grn_hash *)table, token_cursor->curr, token_cursor->curr_size, NULL); |
318 | break; |
319 | case GRN_TABLE_NO_KEY : |
320 | if (token_cursor->curr_size == sizeof(grn_id)) { |
321 | tid = *((grn_id *)token_cursor->curr); |
322 | } else { |
323 | tid = GRN_ID_NIL; |
324 | } |
325 | break; |
326 | } |
327 | } |
328 | if (token_cursor->mode != GRN_TOKENIZE_ONLY && |
329 | tid == GRN_ID_NIL && token_cursor->status != GRN_TOKEN_CURSOR_DONE) { |
330 | token_cursor->status = GRN_TOKEN_CURSOR_NOT_FOUND; |
331 | } |
332 | token_cursor->pos++; |
333 | break; |
334 | } |
335 | return tid; |
336 | } |
337 | |
338 | static void |
339 | grn_token_cursor_close_token_filters(grn_ctx *ctx, |
340 | grn_token_cursor *token_cursor) |
341 | { |
342 | grn_obj *token_filters = token_cursor->token_filter.objects; |
343 | unsigned int i, n_token_filters; |
344 | |
345 | if (!token_cursor->token_filter.data) { |
346 | return; |
347 | } |
348 | |
349 | if (token_filters) { |
350 | n_token_filters = GRN_BULK_VSIZE(token_filters) / sizeof(grn_obj *); |
351 | } else { |
352 | n_token_filters = 0; |
353 | } |
354 | |
355 | if (n_token_filters == 0) { |
356 | return; |
357 | } |
358 | |
359 | for (i = 0; i < n_token_filters; i++) { |
360 | grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i); |
361 | grn_proc *token_filter = (grn_proc *)token_filter_object; |
362 | void *data = token_cursor->token_filter.data[i]; |
363 | |
364 | token_filter->callbacks.token_filter.fin(ctx, data); |
365 | } |
366 | GRN_FREE(token_cursor->token_filter.data); |
367 | } |
368 | |
369 | grn_rc |
370 | grn_token_cursor_close(grn_ctx *ctx, grn_token_cursor *token_cursor) |
371 | { |
372 | if (token_cursor) { |
373 | if (token_cursor->tokenizer) { |
374 | ((grn_proc *)token_cursor->tokenizer)->funcs[PROC_FIN](ctx, 1, &token_cursor->table, |
375 | &token_cursor->pctx.user_data); |
376 | } |
377 | grn_token_cursor_close_token_filters(ctx, token_cursor); |
378 | if (token_cursor->nstr) { |
379 | grn_obj_close(ctx, token_cursor->nstr); |
380 | } |
381 | GRN_FREE(token_cursor); |
382 | return GRN_SUCCESS; |
383 | } else { |
384 | return GRN_INVALID_ARGUMENT; |
385 | } |
386 | } |
387 | |