| 1 | /* -*- c-basic-offset: 2 -*- */ |
| 2 | /* |
| 3 | Copyright(C) 2012-2014 Brazil |
| 4 | |
| 5 | This library is free software; you can redistribute it and/or |
| 6 | modify it under the terms of the GNU Lesser General Public |
| 7 | License version 2.1 as published by the Free Software Foundation. |
| 8 | |
| 9 | This library is distributed in the hope that it will be useful, |
| 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 12 | Lesser General Public License for more details. |
| 13 | |
| 14 | You should have received a copy of the GNU Lesser General Public |
| 15 | License along with this library; if not, write to the Free Software |
| 16 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 17 | */ |
| 18 | #include "grn.h" |
| 19 | #include <groonga/tokenizer.h> |
| 20 | |
| 21 | #include <string.h> |
| 22 | |
| 23 | #include "grn_ctx.h" |
| 24 | #include "grn_db.h" |
| 25 | #include "grn_str.h" |
| 26 | #include "grn_string.h" |
| 27 | #include "grn_token_cursor.h" |
| 28 | |
| 29 | /* |
| 30 | Just for backward compatibility. See grn_plugin_charlen() instead. |
| 31 | */ |
| 32 | int |
| 33 | grn_tokenizer_charlen(grn_ctx *ctx, const char *str_ptr, |
| 34 | unsigned int str_length, grn_encoding encoding) |
| 35 | { |
| 36 | return grn_plugin_charlen(ctx, str_ptr, str_length, encoding); |
| 37 | } |
| 38 | |
| 39 | /* |
| 40 | Just for backward compatibility. See grn_plugin_isspace() instead. |
| 41 | */ |
| 42 | int |
| 43 | grn_tokenizer_isspace(grn_ctx *ctx, const char *str_ptr, |
| 44 | unsigned int str_length, grn_encoding encoding) |
| 45 | { |
| 46 | return grn_plugin_isspace(ctx, str_ptr, str_length, encoding); |
| 47 | } |
| 48 | |
| 49 | grn_bool |
| 50 | grn_tokenizer_is_tokenized_delimiter(grn_ctx *ctx, |
| 51 | const char *str_ptr, |
| 52 | unsigned int str_length, |
| 53 | grn_encoding encoding) |
| 54 | { |
| 55 | if (encoding != GRN_ENC_UTF8) { |
| 56 | return GRN_FALSE; |
| 57 | } |
| 58 | |
| 59 | if (str_length != GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8_LEN) { |
| 60 | return GRN_FALSE; |
| 61 | } |
| 62 | |
| 63 | return memcmp(str_ptr, |
| 64 | GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8, |
| 65 | GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8_LEN) == 0; |
| 66 | } |
| 67 | |
| 68 | grn_bool |
| 69 | grn_tokenizer_have_tokenized_delimiter(grn_ctx *ctx, |
| 70 | const char *str_ptr, |
| 71 | unsigned int str_length, |
| 72 | grn_encoding encoding) |
| 73 | { |
| 74 | int char_length; |
| 75 | const char *current = str_ptr; |
| 76 | const char *end = str_ptr + str_length; |
| 77 | |
| 78 | if (encoding != GRN_ENC_UTF8) { |
| 79 | return GRN_FALSE; |
| 80 | } |
| 81 | |
| 82 | if (str_length == 0) { |
| 83 | return GRN_FALSE; |
| 84 | } |
| 85 | |
| 86 | while ((char_length = grn_charlen_(ctx, current, end, encoding)) > 0) { |
| 87 | if (grn_tokenizer_is_tokenized_delimiter(ctx, |
| 88 | current, char_length, |
| 89 | encoding)) { |
| 90 | return GRN_TRUE; |
| 91 | } |
| 92 | current += char_length; |
| 93 | } |
| 94 | return GRN_FALSE; |
| 95 | } |
| 96 | |
| 97 | grn_tokenizer_query * |
| 98 | grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args, |
| 99 | unsigned int normalize_flags) |
| 100 | { |
| 101 | grn_obj *flags = grn_ctx_pop(ctx); |
| 102 | grn_obj *query_str = grn_ctx_pop(ctx); |
| 103 | grn_obj *tokenize_mode = grn_ctx_pop(ctx); |
| 104 | |
| 105 | if (query_str == NULL) { |
| 106 | GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument" ); |
| 107 | return NULL; |
| 108 | } |
| 109 | |
| 110 | if ((num_args < 1) || (args == NULL) || (args[0] == NULL)) { |
| 111 | GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer" ); |
| 112 | return NULL; |
| 113 | } |
| 114 | |
| 115 | { |
| 116 | grn_tokenizer_query * const query = |
| 117 | GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_query)); |
| 118 | if (query == NULL) { |
| 119 | return NULL; |
| 120 | } |
| 121 | query->normalized_query = NULL; |
| 122 | query->query_buf = NULL; |
| 123 | if (flags) { |
| 124 | query->flags = GRN_UINT32_VALUE(flags); |
| 125 | } else { |
| 126 | query->flags = 0; |
| 127 | } |
| 128 | if (tokenize_mode) { |
| 129 | query->tokenize_mode = GRN_UINT32_VALUE(tokenize_mode); |
| 130 | } else { |
| 131 | query->tokenize_mode = GRN_TOKENIZE_ADD; |
| 132 | } |
| 133 | query->token_mode = query->tokenize_mode; |
| 134 | |
| 135 | { |
| 136 | grn_obj * const table = args[0]; |
| 137 | grn_table_flags table_flags; |
| 138 | grn_encoding table_encoding; |
| 139 | unsigned int query_length = GRN_TEXT_LEN(query_str); |
| 140 | char *query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, query_length + 1); |
| 141 | grn_obj *normalizer = NULL; |
| 142 | |
| 143 | if (query_buf == NULL) { |
| 144 | GRN_PLUGIN_FREE(ctx, query); |
| 145 | GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, |
| 146 | "[tokenizer] failed to duplicate query" ); |
| 147 | return NULL; |
| 148 | } |
| 149 | grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL, |
| 150 | &normalizer, NULL); |
| 151 | { |
| 152 | grn_obj *normalized_query; |
| 153 | if (table_flags & GRN_OBJ_KEY_NORMALIZE) { |
| 154 | normalizer = GRN_NORMALIZER_AUTO; |
| 155 | } |
| 156 | normalized_query = grn_string_open_(ctx, |
| 157 | GRN_TEXT_VALUE(query_str), |
| 158 | GRN_TEXT_LEN(query_str), |
| 159 | normalizer, |
| 160 | normalize_flags, |
| 161 | table_encoding); |
| 162 | if (!normalized_query) { |
| 163 | GRN_PLUGIN_FREE(ctx, query_buf); |
| 164 | GRN_PLUGIN_FREE(ctx, query); |
| 165 | GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, |
| 166 | "[tokenizer] failed to open normalized string" ); |
| 167 | return NULL; |
| 168 | } |
| 169 | query->normalized_query = normalized_query; |
| 170 | grn_memcpy(query_buf, GRN_TEXT_VALUE(query_str), query_length); |
| 171 | query_buf[query_length] = '\0'; |
| 172 | query->query_buf = query_buf; |
| 173 | query->ptr = query_buf; |
| 174 | query->length = query_length; |
| 175 | } |
| 176 | query->encoding = table_encoding; |
| 177 | |
| 178 | if (query->flags & GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER) { |
| 179 | const char *normalized_string; |
| 180 | unsigned int normalized_string_length; |
| 181 | |
| 182 | grn_string_get_normalized(ctx, |
| 183 | query->normalized_query, |
| 184 | &normalized_string, |
| 185 | &normalized_string_length, |
| 186 | NULL); |
| 187 | query->have_tokenized_delimiter = |
| 188 | grn_tokenizer_have_tokenized_delimiter(ctx, |
| 189 | normalized_string, |
| 190 | normalized_string_length, |
| 191 | query->encoding); |
| 192 | } else { |
| 193 | query->have_tokenized_delimiter = GRN_FALSE; |
| 194 | } |
| 195 | } |
| 196 | return query; |
| 197 | } |
| 198 | } |
| 199 | |
| 200 | grn_tokenizer_query * |
| 201 | grn_tokenizer_query_create(grn_ctx *ctx, int num_args, grn_obj **args) |
| 202 | { |
| 203 | return grn_tokenizer_query_open(ctx, num_args, args, 0); |
| 204 | } |
| 205 | |
| 206 | void |
| 207 | grn_tokenizer_query_close(grn_ctx *ctx, grn_tokenizer_query *query) |
| 208 | { |
| 209 | if (query != NULL) { |
| 210 | if (query->normalized_query != NULL) { |
| 211 | grn_obj_unlink(ctx, query->normalized_query); |
| 212 | } |
| 213 | if (query->query_buf != NULL) { |
| 214 | GRN_PLUGIN_FREE(ctx, query->query_buf); |
| 215 | } |
| 216 | GRN_PLUGIN_FREE(ctx, query); |
| 217 | } |
| 218 | } |
| 219 | |
| 220 | void |
| 221 | grn_tokenizer_query_destroy(grn_ctx *ctx, grn_tokenizer_query *query) |
| 222 | { |
| 223 | grn_tokenizer_query_close(ctx, query); |
| 224 | } |
| 225 | |
| 226 | void |
| 227 | grn_tokenizer_token_init(grn_ctx *ctx, grn_tokenizer_token *token) |
| 228 | { |
| 229 | GRN_TEXT_INIT(&token->str, GRN_OBJ_DO_SHALLOW_COPY); |
| 230 | GRN_UINT32_INIT(&token->status, 0); |
| 231 | } |
| 232 | |
| 233 | void |
| 234 | grn_tokenizer_token_fin(grn_ctx *ctx, grn_tokenizer_token *token) |
| 235 | { |
| 236 | GRN_OBJ_FIN(ctx, &(token->str)); |
| 237 | GRN_OBJ_FIN(ctx, &(token->status)); |
| 238 | } |
| 239 | |
| 240 | void |
| 241 | grn_tokenizer_token_push(grn_ctx *ctx, grn_tokenizer_token *token, |
| 242 | const char *str_ptr, unsigned int str_length, |
| 243 | grn_token_status status) |
| 244 | { |
| 245 | GRN_TEXT_SET_REF(&token->str, str_ptr, str_length); |
| 246 | GRN_UINT32_SET(ctx, &token->status, status); |
| 247 | grn_ctx_push(ctx, &token->str); |
| 248 | grn_ctx_push(ctx, &token->status); |
| 249 | } |
| 250 | |
| 251 | const char * |
| 252 | grn_tokenizer_tokenized_delimiter_next(grn_ctx *ctx, |
| 253 | grn_tokenizer_token *token, |
| 254 | const char *str_ptr, |
| 255 | unsigned int str_length, |
| 256 | grn_encoding encoding) |
| 257 | { |
| 258 | size_t char_length = 0; |
| 259 | const char *start = str_ptr; |
| 260 | const char *current; |
| 261 | const char *end = str_ptr + str_length; |
| 262 | const char *next_start = NULL; |
| 263 | unsigned int token_length; |
| 264 | grn_token_status status; |
| 265 | |
| 266 | for (current = start; current < end; current += char_length) { |
| 267 | char_length = grn_charlen_(ctx, current, end, encoding); |
| 268 | if (char_length == 0) { |
| 269 | break; |
| 270 | } |
| 271 | if (grn_tokenizer_is_tokenized_delimiter(ctx, current, char_length, |
| 272 | encoding)) { |
| 273 | next_start = str_ptr + (current - start + char_length); |
| 274 | break; |
| 275 | } |
| 276 | } |
| 277 | |
| 278 | token_length = current - start; |
| 279 | if (current == end) { |
| 280 | status = GRN_TOKENIZER_LAST; |
| 281 | } else { |
| 282 | status = GRN_TOKENIZER_CONTINUE; |
| 283 | } |
| 284 | grn_tokenizer_token_push(ctx, token, start, token_length, status); |
| 285 | |
| 286 | return next_start; |
| 287 | } |
| 288 | |
| 289 | grn_rc |
| 290 | grn_tokenizer_register(grn_ctx *ctx, const char *plugin_name_ptr, |
| 291 | unsigned int plugin_name_length, |
| 292 | grn_proc_func *init, grn_proc_func *next, |
| 293 | grn_proc_func *fin) |
| 294 | { |
| 295 | grn_expr_var vars[] = { |
| 296 | { NULL, 0 }, |
| 297 | { NULL, 0 }, |
| 298 | { NULL, 0 } |
| 299 | }; |
| 300 | GRN_TEXT_INIT(&vars[0].value, 0); |
| 301 | GRN_TEXT_INIT(&vars[1].value, 0); |
| 302 | GRN_UINT32_INIT(&vars[2].value, 0); |
| 303 | |
| 304 | { |
| 305 | /* |
| 306 | grn_proc_create() registers a plugin to the database which is associated |
| 307 | with `ctx'. A returned object must not be finalized here. |
| 308 | */ |
| 309 | grn_obj * const obj = grn_proc_create(ctx, plugin_name_ptr, |
| 310 | plugin_name_length, |
| 311 | GRN_PROC_TOKENIZER, |
| 312 | init, next, fin, 3, vars); |
| 313 | if (obj == NULL) { |
| 314 | GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, "grn_proc_create() failed" ); |
| 315 | return ctx->rc; |
| 316 | } |
| 317 | } |
| 318 | return GRN_SUCCESS; |
| 319 | } |
| 320 | |
| 321 | grn_obj * |
| 322 | grn_token_get_data(grn_ctx *ctx, grn_token *token) |
| 323 | { |
| 324 | GRN_API_ENTER; |
| 325 | if (!token) { |
| 326 | ERR(GRN_INVALID_ARGUMENT, "token must not be NULL" ); |
| 327 | GRN_API_RETURN(NULL); |
| 328 | } |
| 329 | GRN_API_RETURN(&(token->data)); |
| 330 | } |
| 331 | |
| 332 | grn_rc |
| 333 | grn_token_set_data(grn_ctx *ctx, |
| 334 | grn_token *token, |
| 335 | const char *str_ptr, |
| 336 | int str_length) |
| 337 | { |
| 338 | GRN_API_ENTER; |
| 339 | if (!token) { |
| 340 | ERR(GRN_INVALID_ARGUMENT, "token must not be NULL" ); |
| 341 | goto exit; |
| 342 | } |
| 343 | if (str_length == -1) { |
| 344 | str_length = strlen(str_ptr); |
| 345 | } |
| 346 | GRN_TEXT_SET(ctx, &(token->data), str_ptr, str_length); |
| 347 | exit: |
| 348 | GRN_API_RETURN(ctx->rc); |
| 349 | } |
| 350 | |
| 351 | grn_token_status |
| 352 | grn_token_get_status(grn_ctx *ctx, grn_token *token) |
| 353 | { |
| 354 | GRN_API_ENTER; |
| 355 | if (!token) { |
| 356 | ERR(GRN_INVALID_ARGUMENT, "token must not be NULL" ); |
| 357 | GRN_API_RETURN(GRN_TOKEN_CONTINUE); |
| 358 | } |
| 359 | GRN_API_RETURN(token->status); |
| 360 | } |
| 361 | |
| 362 | grn_rc |
| 363 | grn_token_set_status(grn_ctx *ctx, |
| 364 | grn_token *token, |
| 365 | grn_token_status status) |
| 366 | { |
| 367 | GRN_API_ENTER; |
| 368 | if (!token) { |
| 369 | ERR(GRN_INVALID_ARGUMENT, "token must not be NULL" ); |
| 370 | goto exit; |
| 371 | } |
| 372 | token->status = status; |
| 373 | exit: |
| 374 | GRN_API_RETURN(ctx->rc); |
| 375 | } |
| 376 | |