/* -*- c-basic-offset: 2 -*- */
/*
  Copyright(C) 2009-2017 Brazil

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License version 2.1 as published by the Free Software Foundation.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
*/
| 18 | #include "grn_token_cursor.h" |
| 19 | #include "grn_string.h" |
| 20 | #include "grn_pat.h" |
| 21 | #include "grn_dat.h" |
| 22 | |
| 23 | static void |
| 24 | grn_token_cursor_open_initialize_token_filters(grn_ctx *ctx, |
| 25 | grn_token_cursor *token_cursor) |
| 26 | { |
| 27 | grn_obj *token_filters = token_cursor->token_filter.objects; |
| 28 | unsigned int i, n_token_filters; |
| 29 | |
| 30 | token_cursor->token_filter.data = NULL; |
| 31 | |
| 32 | if (token_filters) { |
| 33 | n_token_filters = GRN_BULK_VSIZE(token_filters) / sizeof(grn_obj *); |
| 34 | } else { |
| 35 | n_token_filters = 0; |
| 36 | } |
| 37 | |
| 38 | if (n_token_filters == 0) { |
| 39 | return; |
| 40 | } |
| 41 | |
| 42 | token_cursor->token_filter.data = GRN_CALLOC(sizeof(void *) * n_token_filters); |
| 43 | if (!token_cursor->token_filter.data) { |
| 44 | return; |
| 45 | } |
| 46 | |
| 47 | for (i = 0; i < n_token_filters; i++) { |
| 48 | grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i); |
| 49 | grn_proc *token_filter = (grn_proc *)token_filter_object; |
| 50 | |
| 51 | token_cursor->token_filter.data[i] = |
| 52 | token_filter->callbacks.token_filter.init(ctx, |
| 53 | token_cursor->table, |
| 54 | token_cursor->mode); |
| 55 | } |
| 56 | } |
| 57 | |
| 58 | grn_token_cursor * |
| 59 | grn_token_cursor_open(grn_ctx *ctx, grn_obj *table, |
| 60 | const char *str, size_t str_len, |
| 61 | grn_tokenize_mode mode, unsigned int flags) |
| 62 | { |
| 63 | grn_token_cursor *token_cursor; |
| 64 | grn_encoding encoding; |
| 65 | grn_obj *tokenizer; |
| 66 | grn_obj *normalizer; |
| 67 | grn_obj *token_filters; |
| 68 | grn_table_flags table_flags; |
| 69 | if (grn_table_get_info(ctx, table, &table_flags, &encoding, &tokenizer, |
| 70 | &normalizer, &token_filters)) { |
| 71 | return NULL; |
| 72 | } |
| 73 | if (!(token_cursor = GRN_MALLOC(sizeof(grn_token_cursor)))) { return NULL; } |
| 74 | token_cursor->table = table; |
| 75 | token_cursor->mode = mode; |
| 76 | token_cursor->encoding = encoding; |
| 77 | token_cursor->tokenizer = tokenizer; |
| 78 | token_cursor->token_filter.objects = token_filters; |
| 79 | token_cursor->token_filter.data = NULL; |
| 80 | token_cursor->orig = (const unsigned char *)str; |
| 81 | token_cursor->orig_blen = str_len; |
| 82 | token_cursor->curr = NULL; |
| 83 | token_cursor->nstr = NULL; |
| 84 | token_cursor->curr_size = 0; |
| 85 | token_cursor->pos = -1; |
| 86 | token_cursor->status = GRN_TOKEN_CURSOR_DOING; |
| 87 | token_cursor->force_prefix = GRN_FALSE; |
| 88 | if (tokenizer) { |
| 89 | grn_obj str_, flags_, mode_; |
| 90 | GRN_TEXT_INIT(&str_, GRN_OBJ_DO_SHALLOW_COPY); |
| 91 | GRN_TEXT_SET_REF(&str_, str, str_len); |
| 92 | GRN_UINT32_INIT(&flags_, 0); |
| 93 | GRN_UINT32_SET(ctx, &flags_, flags); |
| 94 | GRN_UINT32_INIT(&mode_, 0); |
| 95 | GRN_UINT32_SET(ctx, &mode_, mode); |
| 96 | token_cursor->pctx.caller = NULL; |
| 97 | token_cursor->pctx.user_data.ptr = NULL; |
| 98 | token_cursor->pctx.proc = (grn_proc *)tokenizer; |
| 99 | token_cursor->pctx.hooks = NULL; |
| 100 | token_cursor->pctx.currh = NULL; |
| 101 | token_cursor->pctx.phase = PROC_INIT; |
| 102 | grn_ctx_push(ctx, &mode_); |
| 103 | grn_ctx_push(ctx, &str_); |
| 104 | grn_ctx_push(ctx, &flags_); |
| 105 | ((grn_proc *)tokenizer)->funcs[PROC_INIT](ctx, 1, &table, &token_cursor->pctx.user_data); |
| 106 | grn_obj_close(ctx, &flags_); |
| 107 | grn_obj_close(ctx, &str_); |
| 108 | grn_obj_close(ctx, &mode_); |
| 109 | } else { |
| 110 | int nflags = 0; |
| 111 | token_cursor->nstr = grn_string_open_(ctx, str, str_len, |
| 112 | normalizer, |
| 113 | nflags, |
| 114 | token_cursor->encoding); |
| 115 | if (token_cursor->nstr) { |
| 116 | const char *normalized; |
| 117 | grn_string_get_normalized(ctx, token_cursor->nstr, |
| 118 | &normalized, &(token_cursor->curr_size), NULL); |
| 119 | token_cursor->curr = (const unsigned char *)normalized; |
| 120 | } else { |
| 121 | ERR(GRN_TOKENIZER_ERROR, |
| 122 | "[token-cursor][open] failed to grn_string_open()" ); |
| 123 | } |
| 124 | } |
| 125 | |
| 126 | if (ctx->rc == GRN_SUCCESS) { |
| 127 | grn_token_cursor_open_initialize_token_filters(ctx, token_cursor); |
| 128 | } |
| 129 | |
| 130 | if (ctx->rc) { |
| 131 | grn_token_cursor_close(ctx, token_cursor); |
| 132 | token_cursor = NULL; |
| 133 | } |
| 134 | return token_cursor; |
| 135 | } |
| 136 | |
| 137 | static int |
| 138 | grn_token_cursor_next_apply_token_filters(grn_ctx *ctx, |
| 139 | grn_token_cursor *token_cursor, |
| 140 | grn_obj *current_token_data, |
| 141 | grn_obj *status) |
| 142 | { |
| 143 | grn_obj *token_filters = token_cursor->token_filter.objects; |
| 144 | unsigned int i, n_token_filters; |
| 145 | grn_token current_token; |
| 146 | grn_token next_token; |
| 147 | |
| 148 | if (token_filters) { |
| 149 | n_token_filters = GRN_BULK_VSIZE(token_filters) / sizeof(grn_obj *); |
| 150 | } else { |
| 151 | n_token_filters = 0; |
| 152 | } |
| 153 | |
| 154 | GRN_TEXT_INIT(&(current_token.data), GRN_OBJ_DO_SHALLOW_COPY); |
| 155 | GRN_TEXT_SET(ctx, &(current_token.data), |
| 156 | GRN_TEXT_VALUE(current_token_data), |
| 157 | GRN_TEXT_LEN(current_token_data)); |
| 158 | current_token.status = GRN_INT32_VALUE(status); |
| 159 | GRN_TEXT_INIT(&(next_token.data), GRN_OBJ_DO_SHALLOW_COPY); |
| 160 | GRN_TEXT_SET(ctx, &(next_token.data), |
| 161 | GRN_TEXT_VALUE(&(current_token.data)), |
| 162 | GRN_TEXT_LEN(&(current_token.data))); |
| 163 | next_token.status = current_token.status; |
| 164 | |
| 165 | for (i = 0; i < n_token_filters; i++) { |
| 166 | grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i); |
| 167 | grn_proc *token_filter = (grn_proc *)token_filter_object; |
| 168 | void *data = token_cursor->token_filter.data[i]; |
| 169 | |
| 170 | #define SKIP_FLAGS\ |
| 171 | (GRN_TOKEN_SKIP |\ |
| 172 | GRN_TOKEN_SKIP_WITH_POSITION) |
| 173 | if (current_token.status & SKIP_FLAGS) { |
| 174 | break; |
| 175 | } |
| 176 | #undef SKIP_FLAGS |
| 177 | |
| 178 | token_filter->callbacks.token_filter.filter(ctx, |
| 179 | ¤t_token, |
| 180 | &next_token, |
| 181 | data); |
| 182 | GRN_TEXT_SET(ctx, &(current_token.data), |
| 183 | GRN_TEXT_VALUE(&(next_token.data)), |
| 184 | GRN_TEXT_LEN(&(next_token.data))); |
| 185 | current_token.status = next_token.status; |
| 186 | } |
| 187 | |
| 188 | token_cursor->curr = |
| 189 | (const unsigned char *)GRN_TEXT_VALUE(&(current_token.data)); |
| 190 | token_cursor->curr_size = GRN_TEXT_LEN(&(current_token.data)); |
| 191 | |
| 192 | return current_token.status; |
| 193 | } |
| 194 | |
| 195 | grn_id |
| 196 | grn_token_cursor_next(grn_ctx *ctx, grn_token_cursor *token_cursor) |
| 197 | { |
| 198 | int status; |
| 199 | grn_id tid = GRN_ID_NIL; |
| 200 | grn_obj *table = token_cursor->table; |
| 201 | grn_obj *tokenizer = token_cursor->tokenizer; |
| 202 | while (token_cursor->status != GRN_TOKEN_CURSOR_DONE) { |
| 203 | if (tokenizer) { |
| 204 | grn_obj *curr_, *stat_; |
| 205 | ((grn_proc *)tokenizer)->funcs[PROC_NEXT](ctx, 1, &table, &token_cursor->pctx.user_data); |
| 206 | stat_ = grn_ctx_pop(ctx); |
| 207 | curr_ = grn_ctx_pop(ctx); |
| 208 | status = grn_token_cursor_next_apply_token_filters(ctx, token_cursor, |
| 209 | curr_, stat_); |
| 210 | token_cursor->status = |
| 211 | ((status & GRN_TOKEN_LAST) || |
| 212 | (token_cursor->mode == GRN_TOKENIZE_GET && |
| 213 | (status & GRN_TOKEN_REACH_END))) |
| 214 | ? GRN_TOKEN_CURSOR_DONE : GRN_TOKEN_CURSOR_DOING; |
| 215 | token_cursor->force_prefix = GRN_FALSE; |
| 216 | #define SKIP_FLAGS \ |
| 217 | (GRN_TOKEN_SKIP | GRN_TOKEN_SKIP_WITH_POSITION) |
| 218 | if (status & SKIP_FLAGS) { |
| 219 | if (status & GRN_TOKEN_SKIP) { |
| 220 | token_cursor->pos++; |
| 221 | } |
| 222 | if (token_cursor->status == GRN_TOKEN_CURSOR_DONE && tid == GRN_ID_NIL) { |
| 223 | token_cursor->status = GRN_TOKEN_CURSOR_DONE_SKIP; |
| 224 | break; |
| 225 | } else { |
| 226 | continue; |
| 227 | } |
| 228 | } |
| 229 | #undef SKIP_FLAGS |
| 230 | if (status & GRN_TOKEN_FORCE_PREFIX) { |
| 231 | token_cursor->force_prefix = GRN_TRUE; |
| 232 | } |
| 233 | if (token_cursor->curr_size == 0) { |
| 234 | if (token_cursor->status != GRN_TOKEN_CURSOR_DONE) { |
| 235 | char tokenizer_name[GRN_TABLE_MAX_KEY_SIZE]; |
| 236 | int tokenizer_name_length; |
| 237 | tokenizer_name_length = |
| 238 | grn_obj_name(ctx, token_cursor->tokenizer, |
| 239 | tokenizer_name, GRN_TABLE_MAX_KEY_SIZE); |
| 240 | GRN_LOG(ctx, GRN_WARN, |
| 241 | "[token_next] ignore an empty token: <%.*s>: <%.*s>" , |
| 242 | tokenizer_name_length, tokenizer_name, |
| 243 | token_cursor->orig_blen, token_cursor->orig); |
| 244 | } |
| 245 | continue; |
| 246 | } |
| 247 | if (token_cursor->curr_size > GRN_TABLE_MAX_KEY_SIZE) { |
| 248 | GRN_LOG(ctx, GRN_WARN, |
| 249 | "[token_next] ignore too long token. " |
| 250 | "Token must be less than or equal to %d: <%d>(<%.*s>)" , |
| 251 | GRN_TABLE_MAX_KEY_SIZE, |
| 252 | token_cursor->curr_size, |
| 253 | token_cursor->curr_size, token_cursor->curr); |
| 254 | continue; |
| 255 | } |
| 256 | if (status & GRN_TOKEN_UNMATURED) { |
| 257 | if (status & GRN_TOKEN_OVERLAP) { |
| 258 | if (token_cursor->mode == GRN_TOKENIZE_GET) { |
| 259 | token_cursor->pos++; |
| 260 | continue; |
| 261 | } |
| 262 | } else { |
| 263 | if (status & GRN_TOKEN_REACH_END) { |
| 264 | token_cursor->force_prefix = GRN_TRUE; |
| 265 | } |
| 266 | } |
| 267 | } |
| 268 | } else { |
| 269 | token_cursor->status = GRN_TOKEN_CURSOR_DONE; |
| 270 | } |
| 271 | if (token_cursor->mode == GRN_TOKENIZE_ADD) { |
| 272 | switch (table->header.type) { |
| 273 | case GRN_TABLE_PAT_KEY : |
| 274 | if (grn_io_lock(ctx, ((grn_pat *)table)->io, grn_lock_timeout)) { |
| 275 | tid = GRN_ID_NIL; |
| 276 | } else { |
| 277 | tid = grn_pat_add(ctx, (grn_pat *)table, token_cursor->curr, token_cursor->curr_size, |
| 278 | NULL, NULL); |
| 279 | grn_io_unlock(((grn_pat *)table)->io); |
| 280 | } |
| 281 | break; |
| 282 | case GRN_TABLE_DAT_KEY : |
| 283 | if (grn_io_lock(ctx, ((grn_dat *)table)->io, grn_lock_timeout)) { |
| 284 | tid = GRN_ID_NIL; |
| 285 | } else { |
| 286 | tid = grn_dat_add(ctx, (grn_dat *)table, token_cursor->curr, token_cursor->curr_size, |
| 287 | NULL, NULL); |
| 288 | grn_io_unlock(((grn_dat *)table)->io); |
| 289 | } |
| 290 | break; |
| 291 | case GRN_TABLE_HASH_KEY : |
| 292 | if (grn_io_lock(ctx, ((grn_hash *)table)->io, grn_lock_timeout)) { |
| 293 | tid = GRN_ID_NIL; |
| 294 | } else { |
| 295 | tid = grn_hash_add(ctx, (grn_hash *)table, token_cursor->curr, token_cursor->curr_size, |
| 296 | NULL, NULL); |
| 297 | grn_io_unlock(((grn_hash *)table)->io); |
| 298 | } |
| 299 | break; |
| 300 | case GRN_TABLE_NO_KEY : |
| 301 | if (token_cursor->curr_size == sizeof(grn_id)) { |
| 302 | tid = *((grn_id *)token_cursor->curr); |
| 303 | } else { |
| 304 | tid = GRN_ID_NIL; |
| 305 | } |
| 306 | break; |
| 307 | } |
| 308 | } else if (token_cursor->mode != GRN_TOKENIZE_ONLY) { |
| 309 | switch (table->header.type) { |
| 310 | case GRN_TABLE_PAT_KEY : |
| 311 | tid = grn_pat_get(ctx, (grn_pat *)table, token_cursor->curr, token_cursor->curr_size, NULL); |
| 312 | break; |
| 313 | case GRN_TABLE_DAT_KEY : |
| 314 | tid = grn_dat_get(ctx, (grn_dat *)table, token_cursor->curr, token_cursor->curr_size, NULL); |
| 315 | break; |
| 316 | case GRN_TABLE_HASH_KEY : |
| 317 | tid = grn_hash_get(ctx, (grn_hash *)table, token_cursor->curr, token_cursor->curr_size, NULL); |
| 318 | break; |
| 319 | case GRN_TABLE_NO_KEY : |
| 320 | if (token_cursor->curr_size == sizeof(grn_id)) { |
| 321 | tid = *((grn_id *)token_cursor->curr); |
| 322 | } else { |
| 323 | tid = GRN_ID_NIL; |
| 324 | } |
| 325 | break; |
| 326 | } |
| 327 | } |
| 328 | if (token_cursor->mode != GRN_TOKENIZE_ONLY && |
| 329 | tid == GRN_ID_NIL && token_cursor->status != GRN_TOKEN_CURSOR_DONE) { |
| 330 | token_cursor->status = GRN_TOKEN_CURSOR_NOT_FOUND; |
| 331 | } |
| 332 | token_cursor->pos++; |
| 333 | break; |
| 334 | } |
| 335 | return tid; |
| 336 | } |
| 337 | |
| 338 | static void |
| 339 | grn_token_cursor_close_token_filters(grn_ctx *ctx, |
| 340 | grn_token_cursor *token_cursor) |
| 341 | { |
| 342 | grn_obj *token_filters = token_cursor->token_filter.objects; |
| 343 | unsigned int i, n_token_filters; |
| 344 | |
| 345 | if (!token_cursor->token_filter.data) { |
| 346 | return; |
| 347 | } |
| 348 | |
| 349 | if (token_filters) { |
| 350 | n_token_filters = GRN_BULK_VSIZE(token_filters) / sizeof(grn_obj *); |
| 351 | } else { |
| 352 | n_token_filters = 0; |
| 353 | } |
| 354 | |
| 355 | if (n_token_filters == 0) { |
| 356 | return; |
| 357 | } |
| 358 | |
| 359 | for (i = 0; i < n_token_filters; i++) { |
| 360 | grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i); |
| 361 | grn_proc *token_filter = (grn_proc *)token_filter_object; |
| 362 | void *data = token_cursor->token_filter.data[i]; |
| 363 | |
| 364 | token_filter->callbacks.token_filter.fin(ctx, data); |
| 365 | } |
| 366 | GRN_FREE(token_cursor->token_filter.data); |
| 367 | } |
| 368 | |
| 369 | grn_rc |
| 370 | grn_token_cursor_close(grn_ctx *ctx, grn_token_cursor *token_cursor) |
| 371 | { |
| 372 | if (token_cursor) { |
| 373 | if (token_cursor->tokenizer) { |
| 374 | ((grn_proc *)token_cursor->tokenizer)->funcs[PROC_FIN](ctx, 1, &token_cursor->table, |
| 375 | &token_cursor->pctx.user_data); |
| 376 | } |
| 377 | grn_token_cursor_close_token_filters(ctx, token_cursor); |
| 378 | if (token_cursor->nstr) { |
| 379 | grn_obj_close(ctx, token_cursor->nstr); |
| 380 | } |
| 381 | GRN_FREE(token_cursor); |
| 382 | return GRN_SUCCESS; |
| 383 | } else { |
| 384 | return GRN_INVALID_ARGUMENT; |
| 385 | } |
| 386 | } |
| 387 | |