/* -*- c-basic-offset: 2 -*- */
/*
  Copyright(C) 2012-2014 Brazil

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License version 2.1 as published by the Free Software Foundation.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
*/
#include "grn.h"
#include <groonga/tokenizer.h>

#include <string.h>

#include "grn_ctx.h"
#include "grn_db.h"
#include "grn_str.h"
#include "grn_string.h"
#include "grn_token_cursor.h"

/*
  Just for backward compatibility. See grn_plugin_charlen() instead.
 */
int
grn_tokenizer_charlen(grn_ctx *ctx, const char *str_ptr,
                      unsigned int str_length, grn_encoding encoding)
{
  return grn_plugin_charlen(ctx, str_ptr, str_length, encoding);
}

/*
  Just for backward compatibility. See grn_plugin_isspace() instead.
 */
int
grn_tokenizer_isspace(grn_ctx *ctx, const char *str_ptr,
                      unsigned int str_length, grn_encoding encoding)
{
  return grn_plugin_isspace(ctx, str_ptr, str_length, encoding);
}

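/*
  A minimal usage sketch (not part of the library): scanning a buffer one
  character at a time with the compatibility wrappers above. New code should
  call grn_plugin_charlen()/grn_plugin_isspace() directly; the identifiers
  `rest' and `rest_length' below are illustrative only.

    const char *rest = str_ptr;
    unsigned int rest_length = str_length;
    while (rest_length > 0) {
      int space_length = grn_tokenizer_isspace(ctx, rest, rest_length, encoding);
      int char_length = (space_length > 0)
        ? space_length
        : grn_tokenizer_charlen(ctx, rest, rest_length, encoding);
      if (char_length == 0) {
        break;  // invalid byte sequence for the encoding
      }
      // ... inspect the character at `rest' (length `char_length') ...
      rest += char_length;
      rest_length -= char_length;
    }
 */
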
grn_bool
grn_tokenizer_is_tokenized_delimiter(grn_ctx *ctx,
                                     const char *str_ptr,
                                     unsigned int str_length,
                                     grn_encoding encoding)
{
  if (encoding != GRN_ENC_UTF8) {
    return GRN_FALSE;
  }

  if (str_length != GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8_LEN) {
    return GRN_FALSE;
  }

  return memcmp(str_ptr,
                GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8,
                GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8_LEN) == 0;
}

grn_bool
grn_tokenizer_have_tokenized_delimiter(grn_ctx *ctx,
                                       const char *str_ptr,
                                       unsigned int str_length,
                                       grn_encoding encoding)
{
  int char_length;
  const char *current = str_ptr;
  const char *end = str_ptr + str_length;

  if (encoding != GRN_ENC_UTF8) {
    return GRN_FALSE;
  }

  if (str_length == 0) {
    return GRN_FALSE;
  }

  while ((char_length = grn_charlen_(ctx, current, end, encoding)) > 0) {
    if (grn_tokenizer_is_tokenized_delimiter(ctx,
                                             current, char_length,
                                             encoding)) {
      return GRN_TRUE;
    }
    current += char_length;
  }
  return GRN_FALSE;
}

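/*
  An illustrative check (a sketch, not part of the library): the tokenized
  delimiter is assumed here to be the UTF-8 sequence for U+FFFE, so a query
  that embeds it between words is reported as already tokenized.

    const char *query = "Hello\xef\xbf\xbe" "World";  // U+FFFE between tokens
    grn_bool have =
      grn_tokenizer_have_tokenized_delimiter(ctx, query,
                                             (unsigned int)strlen(query),
                                             GRN_ENC_UTF8);
    // have == GRN_TRUE; any encoding other than UTF-8 always yields GRN_FALSE
 */
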
grn_tokenizer_query *
grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
                         unsigned int normalize_flags)
{
  /* The caller passes the tokenizer arguments on the ctx stack; they are
     popped here as flags, query string, then tokenize mode. */
  grn_obj *flags = grn_ctx_pop(ctx);
  grn_obj *query_str = grn_ctx_pop(ctx);
  grn_obj *tokenize_mode = grn_ctx_pop(ctx);

  if (query_str == NULL) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }

  if ((num_args < 1) || (args == NULL) || (args[0] == NULL)) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer");
    return NULL;
  }

  {
    grn_tokenizer_query * const query =
        GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_query));
    if (query == NULL) {
      return NULL;
    }
    query->normalized_query = NULL;
    query->query_buf = NULL;
    if (flags) {
      query->flags = GRN_UINT32_VALUE(flags);
    } else {
      query->flags = 0;
    }
    if (tokenize_mode) {
      query->tokenize_mode = GRN_UINT32_VALUE(tokenize_mode);
    } else {
      query->tokenize_mode = GRN_TOKENIZE_ADD;
    }
    query->token_mode = query->tokenize_mode;

    {
      /* args[0] is the lexicon table; its encoding and normalizer determine
         how the query string is normalized. */
      grn_obj * const table = args[0];
      grn_table_flags table_flags;
      grn_encoding table_encoding;
      unsigned int query_length = GRN_TEXT_LEN(query_str);
      char *query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, query_length + 1);
      grn_obj *normalizer = NULL;

      if (query_buf == NULL) {
        GRN_PLUGIN_FREE(ctx, query);
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer] failed to duplicate query");
        return NULL;
      }
      grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL,
                         &normalizer, NULL);
      {
        grn_obj *normalized_query;
        if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
          normalizer = GRN_NORMALIZER_AUTO;
        }
        normalized_query = grn_string_open_(ctx,
                                            GRN_TEXT_VALUE(query_str),
                                            GRN_TEXT_LEN(query_str),
                                            normalizer,
                                            normalize_flags,
                                            table_encoding);
        if (!normalized_query) {
          GRN_PLUGIN_FREE(ctx, query_buf);
          GRN_PLUGIN_FREE(ctx, query);
          GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                           "[tokenizer] failed to open normalized string");
          return NULL;
        }
        query->normalized_query = normalized_query;
        /* Keep a NUL-terminated copy of the raw query alongside the
           normalized string. */
        grn_memcpy(query_buf, GRN_TEXT_VALUE(query_str), query_length);
        query_buf[query_length] = '\0';
        query->query_buf = query_buf;
        query->ptr = query_buf;
        query->length = query_length;
      }
      query->encoding = table_encoding;

      if (query->flags & GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER) {
        const char *normalized_string;
        unsigned int normalized_string_length;

        grn_string_get_normalized(ctx,
                                  query->normalized_query,
                                  &normalized_string,
                                  &normalized_string_length,
                                  NULL);
        query->have_tokenized_delimiter =
          grn_tokenizer_have_tokenized_delimiter(ctx,
                                                 normalized_string,
                                                 normalized_string_length,
                                                 query->encoding);
      } else {
        query->have_tokenized_delimiter = GRN_FALSE;
      }
    }
    return query;
  }
}

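/*
  A sketch of how a tokenizer plugin's init callback would typically use
  grn_tokenizer_query_open(); the `my_tokenizer_init' name is hypothetical
  and real tokenizers usually store the query inside their own state struct.

    static grn_obj *
    my_tokenizer_init(grn_ctx *ctx, int nargs, grn_obj **args,
                      grn_user_data *user_data)
    {
      grn_tokenizer_query *query =
        grn_tokenizer_query_open(ctx, nargs, args, 0);
      if (!query) {
        return NULL;
      }
      // Keep the query for the next/fin callbacks. It exposes the raw query
      // via query->ptr/query->length and the normalized form via
      // query->normalized_query, plus query->encoding and
      // query->have_tokenized_delimiter.
      user_data->ptr = query;
      return NULL;
    }
 */
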
grn_tokenizer_query *
grn_tokenizer_query_create(grn_ctx *ctx, int num_args, grn_obj **args)
{
  return grn_tokenizer_query_open(ctx, num_args, args, 0);
}

void
grn_tokenizer_query_close(grn_ctx *ctx, grn_tokenizer_query *query)
{
  if (query != NULL) {
    if (query->normalized_query != NULL) {
      grn_obj_unlink(ctx, query->normalized_query);
    }
    if (query->query_buf != NULL) {
      GRN_PLUGIN_FREE(ctx, query->query_buf);
    }
    GRN_PLUGIN_FREE(ctx, query);
  }
}

void
grn_tokenizer_query_destroy(grn_ctx *ctx, grn_tokenizer_query *query)
{
  grn_tokenizer_query_close(ctx, query);
}

void
grn_tokenizer_token_init(grn_ctx *ctx, grn_tokenizer_token *token)
{
  GRN_TEXT_INIT(&token->str, GRN_OBJ_DO_SHALLOW_COPY);
  GRN_UINT32_INIT(&token->status, 0);
}

void
grn_tokenizer_token_fin(grn_ctx *ctx, grn_tokenizer_token *token)
{
  GRN_OBJ_FIN(ctx, &(token->str));
  GRN_OBJ_FIN(ctx, &(token->status));
}

void
grn_tokenizer_token_push(grn_ctx *ctx, grn_tokenizer_token *token,
                         const char *str_ptr, unsigned int str_length,
                         grn_token_status status)
{
  GRN_TEXT_SET_REF(&token->str, str_ptr, str_length);
  GRN_UINT32_SET(ctx, &token->status, status);
  grn_ctx_push(ctx, &token->str);
  grn_ctx_push(ctx, &token->status);
}

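/*
  A sketch of a matching `next' callback (hypothetical names and placeholder
  variables): it pulls the state stored by init, emits one token with
  grn_tokenizer_token_push(), and marks the final one with
  GRN_TOKENIZER_LAST.

    static grn_obj *
    my_tokenizer_next(grn_ctx *ctx, int nargs, grn_obj **args,
                      grn_user_data *user_data)
    {
      my_tokenizer *tokenizer = user_data->ptr;
      // ... locate the next token in tokenizer->query->ptr, filling
      //     token_start, token_length and is_last ...
      grn_tokenizer_token_push(ctx, &tokenizer->token,
                               token_start, token_length,
                               is_last ? GRN_TOKENIZER_LAST
                                       : GRN_TOKENIZER_CONTINUE);
      return NULL;
    }

  The grn_tokenizer_token passed to grn_tokenizer_token_push() must have been
  prepared with grn_tokenizer_token_init() and must stay alive until
  grn_tokenizer_token_fin() runs in the `fin' callback, because only a
  shallow reference to the token text is pushed.
 */
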
/*
  Extracts the next token from `str_ptr' when the text uses the tokenized
  delimiter (U+FFFE) as an explicit token separator. The token is pushed via
  grn_tokenizer_token_push(), and the return value points just past the
  delimiter, or is NULL once the last token has been pushed.
 */
const char *
grn_tokenizer_tokenized_delimiter_next(grn_ctx *ctx,
                                       grn_tokenizer_token *token,
                                       const char *str_ptr,
                                       unsigned int str_length,
                                       grn_encoding encoding)
{
  size_t char_length = 0;
  const char *start = str_ptr;
  const char *current;
  const char *end = str_ptr + str_length;
  const char *next_start = NULL;
  unsigned int token_length;
  grn_token_status status;

  for (current = start; current < end; current += char_length) {
    char_length = grn_charlen_(ctx, current, end, encoding);
    if (char_length == 0) {
      break;
    }
    if (grn_tokenizer_is_tokenized_delimiter(ctx, current, char_length,
                                             encoding)) {
      next_start = str_ptr + (current - start + char_length);
      break;
    }
  }

  token_length = current - start;
  if (current == end) {
    status = GRN_TOKENIZER_LAST;
  } else {
    status = GRN_TOKENIZER_CONTINUE;
  }
  grn_tokenizer_token_push(ctx, token, start, token_length, status);

  return next_start;
}

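/*
  When grn_tokenizer_query_open() reports have_tokenized_delimiter, a `next'
  callback can delegate splitting entirely to the helper above. A rough
  sketch, assuming `rest' and `rest_end' track the unconsumed part of the
  input:

    rest = grn_tokenizer_tokenized_delimiter_next(ctx, &tokenizer->token,
                                                  rest,
                                                  (unsigned int)(rest_end - rest),
                                                  query->encoding);
    // rest now points just past the consumed delimiter,
    // or is NULL once the final token has been pushed.
 */
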
grn_rc
grn_tokenizer_register(grn_ctx *ctx, const char *plugin_name_ptr,
                       unsigned int plugin_name_length,
                       grn_proc_func *init, grn_proc_func *next,
                       grn_proc_func *fin)
{
  grn_expr_var vars[] = {
    { NULL, 0 },
    { NULL, 0 },
    { NULL, 0 }
  };
  GRN_TEXT_INIT(&vars[0].value, 0);
  GRN_TEXT_INIT(&vars[1].value, 0);
  GRN_UINT32_INIT(&vars[2].value, 0);

  {
    /*
      grn_proc_create() registers the tokenizer proc in the database
      associated with `ctx'. The returned object must not be finalized here.
     */
    grn_obj * const obj = grn_proc_create(ctx, plugin_name_ptr,
                                          plugin_name_length,
                                          GRN_PROC_TOKENIZER,
                                          init, next, fin, 3, vars);
    if (obj == NULL) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, "grn_proc_create() failed");
      return ctx->rc;
    }
  }
  return GRN_SUCCESS;
}

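/*
  A registration sketch: a tokenizer plugin typically calls
  grn_tokenizer_register() from its GRN_PLUGIN_REGISTER entry point with its
  three callbacks (the "my_tokenizer" name and callbacks are illustrative):

    grn_rc
    GRN_PLUGIN_REGISTER(grn_ctx *ctx)
    {
      return grn_tokenizer_register(ctx, "my_tokenizer",
                                    (unsigned int)strlen("my_tokenizer"),
                                    my_tokenizer_init,
                                    my_tokenizer_next,
                                    my_tokenizer_fin);
    }
 */
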
grn_obj *
grn_token_get_data(grn_ctx *ctx, grn_token *token)
{
  GRN_API_ENTER;
  if (!token) {
    ERR(GRN_INVALID_ARGUMENT, "token must not be NULL");
    GRN_API_RETURN(NULL);
  }
  GRN_API_RETURN(&(token->data));
}

grn_rc
grn_token_set_data(grn_ctx *ctx,
                   grn_token *token,
                   const char *str_ptr,
                   int str_length)
{
  GRN_API_ENTER;
  if (!token) {
    ERR(GRN_INVALID_ARGUMENT, "token must not be NULL");
    goto exit;
  }
  if (str_length == -1) {
    str_length = strlen(str_ptr);
  }
  GRN_TEXT_SET(ctx, &(token->data), str_ptr, str_length);
exit:
  GRN_API_RETURN(ctx->rc);
}

grn_token_status
grn_token_get_status(grn_ctx *ctx, grn_token *token)
{
  GRN_API_ENTER;
  if (!token) {
    ERR(GRN_INVALID_ARGUMENT, "token must not be NULL");
    GRN_API_RETURN(GRN_TOKEN_CONTINUE);
  }
  GRN_API_RETURN(token->status);
}

grn_rc
grn_token_set_status(grn_ctx *ctx,
                     grn_token *token,
                     grn_token_status status)
{
  GRN_API_ENTER;
  if (!token) {
    ERR(GRN_INVALID_ARGUMENT, "token must not be NULL");
    goto exit;
  }
  token->status = status;
exit:
  GRN_API_RETURN(ctx->rc);
}
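
/*
  A brief sketch of the grn_token accessors above, as used from code that
  receives a grn_token pointer (the surrounding callback signature is omitted
  and `token' is assumed to be valid):

    grn_obj *data = grn_token_get_data(ctx, token);   // token text as a grn_obj
    grn_token_set_data(ctx, token, "groonga", -1);    // -1 means NUL-terminated
    grn_token_set_status(ctx, token, GRN_TOKEN_CONTINUE);
    if (grn_token_get_status(ctx, token) == GRN_TOKEN_LAST) {
      // no more tokens will follow
    }
 */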