| 1 | /* -*- c-basic-offset: 2 -*- */ |
| 2 | /* Copyright(C) 2009-2014 Brazil |
| 3 | |
| 4 | This library is free software; you can redistribute it and/or |
| 5 | modify it under the terms of the GNU Lesser General Public |
| 6 | License version 2.1 as published by the Free Software Foundation. |
| 7 | |
| 8 | This library is distributed in the hope that it will be useful, |
| 9 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 11 | Lesser General Public License for more details. |
| 12 | |
| 13 | You should have received a copy of the GNU Lesser General Public |
| 14 | License along with this library; if not, write to the Free Software |
| 15 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 16 | */ |
| 17 | #include "grn.h" |
| 18 | #include <string.h> |
| 19 | #include <stddef.h> |
| 20 | #include "grn_snip.h" |
| 21 | #include "grn_ctx.h" |
| 22 | |
/* Fallback definition of MAX, used only when no earlier header provides one.
   Yields the larger of a and b. Either argument may be evaluated twice, so
   do not pass expressions with side effects. */
#if !defined MAX
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif

/* Fallback definition of MIN, used only when no earlier header provides one.
   Yields the smaller of a and b. Either argument may be evaluated twice, so
   do not pass expressions with side effects. */
#if !defined MIN
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
| 30 | |
/*
 * Parity check on the run of high-bit bytes that ends just before offset y.
 *
 * Counts how many consecutive bytes >= 0x80 sit immediately before x[y]
 * (scanning backwards, never looking before x[0]) and returns 1 when that
 * count is even, 0 when it is odd. grn_snip_find_firstbyte() keeps moving
 * its offset while this function returns 0.
 *
 * x: start of the byte buffer.
 * y: offset to test; only x[0] .. x[y - 1] are read. y == 0 yields 1.
 */
static int
grn_bm_check_euc(const unsigned char *x, const size_t y)
{
  size_t trailing = 0;

  /* Index-based scan: never forms a pointer before the start of x. */
  while (trailing < y && x[y - 1 - trailing] >= 0x80U) {
    trailing++;
  }
  return (int) ((trailing + 1) & 1);
}
| 38 | |
/*
 * Parity check on the run of bytes in the ranges 0x81-0x9f / 0xe0-0xfc
 * that ends just before offset y.
 *
 * Counts how many consecutive bytes in those two ranges sit immediately
 * before x[y] (scanning backwards, never looking before x[0]) and returns 1
 * when that count is even, 0 when it is odd. grn_snip_find_firstbyte()
 * shifts its offset once when this function returns 0.
 *
 * x: start of the byte buffer.
 * y: offset to test; only x[0] .. x[y - 1] are read. y == 0 yields 1.
 */
static int
grn_bm_check_sjis(const unsigned char *x, const size_t y)
{
  size_t trailing = 0;

  /* Index-based scan: never forms a pointer before the start of x. */
  while (trailing < y) {
    const unsigned char c = x[y - 1 - trailing];
    const int in_range =
      (c >= 0x81U && c <= 0x9fU) || (c >= 0xe0U && c <= 0xfcU);
    if (!in_range) {
      break;
    }
    trailing++;
  }
  return (int) ((trailing + 1) & 1);
}
| 48 | |
| 49 | /* |
| 50 | static void |
| 51 | grn_bm_suffixes(const unsigned char *x, size_t m, size_t *suff) |
| 52 | { |
| 53 | size_t f, g; |
| 54 | intptr_t i; |
| 55 | f = 0; |
| 56 | suff[m - 1] = m; |
| 57 | g = m - 1; |
| 58 | for (i = m - 2; i >= 0; --i) { |
| 59 | if (i > (intptr_t) g && suff[i + m - 1 - f] < i - g) |
| 60 | suff[i] = suff[i + m - 1 - f]; |
| 61 | else { |
| 62 | if (i < (intptr_t) g) |
| 63 | g = i; |
| 64 | f = i; |
| 65 | while (g > 0 && x[g] == x[g + m - 1 - f]) |
| 66 | --g; |
| 67 | suff[i] = f - g; |
| 68 | } |
| 69 | } |
| 70 | } |
| 71 | */ |
| 72 | |
| 73 | static void |
| 74 | grn_bm_preBmBc(const unsigned char *x, size_t m, size_t *bmBc) |
| 75 | { |
| 76 | size_t i; |
| 77 | for (i = 0; i < ASIZE; ++i) { |
| 78 | bmBc[i] = m; |
| 79 | } |
| 80 | for (i = 0; i < m - 1; ++i) { |
| 81 | bmBc[(unsigned int) x[i]] = m - (i + 1); |
| 82 | } |
| 83 | } |
| 84 | |
/*
 * GRN_BM_COMPARE: record a match at normalized position `found` into `cond`
 * and return from the enclosing (void) function.
 *
 * Expects these names in the expanding scope: cond, found, shift, m, i,
 * flags, string_checks, string_original, string_original_length_in_bytes
 * and string_encoding.
 *
 * Nothing happens when string_checks[found] is 0. Otherwise:
 *  - the positive string_checks entries in [cond->last_found, found) are
 *    added to cond->last_offset to obtain the start offset
 *    (presumably each positive entry is the byte length, in the original
 *    string, of the character beginning there -- TODO confirm against
 *    grn_string_get_checks());
 *  - when string_checks[found] is negative, the start offset is moved back
 *    by string_checks[found_alpha_head], where found_alpha_head is the most
 *    recent position whose entry was positive;
 *  - with GRN_SNIP_SKIP_LEADING_SPACES set, cond->start_offset is advanced
 *    while grn_isspace() returns non-zero, by the value it returns;
 *  - the positive entries up to found + m are added to obtain
 *    cond->end_offset;
 *  - cond->found is set to found + shift so the next search resumes there.
 */
#define GRN_BM_COMPARE do { \
  if (string_checks[found]) { \
    size_t offset = cond->last_offset, found_alpha_head = cond->found_alpha_head; \
    /* calc real offset */\
    for (i = cond->last_found; i < found; i++) { \
      if (string_checks[i] > 0) { \
        found_alpha_head = i; \
        offset += string_checks[i]; \
      } \
    } \
    /* if real offset is in a character, move it to the head of the character */ \
    if (string_checks[found] < 0) { \
      offset -= string_checks[found_alpha_head]; \
      cond->last_found = found_alpha_head; \
    } else { \
      cond->last_found = found; \
    } \
    cond->start_offset = cond->last_offset = offset; \
    if (flags & GRN_SNIP_SKIP_LEADING_SPACES) { \
      while (cond->start_offset < string_original_length_in_bytes && \
             (i = grn_isspace(string_original + cond->start_offset, \
                              string_encoding))) { cond->start_offset += i; } \
    } \
    /* accumulate up to the end of the match to get the end offset */ \
    for (i = cond->last_found; i < found + m; i++) { \
      if (string_checks[i] > 0) { \
        offset += string_checks[i]; \
      } \
    } \
    cond->end_offset = offset; \
    cond->found = found + shift; \
    cond->found_alpha_head = found_alpha_head; \
    /* printf("bm: cond:%p found:%zd last_found:%zd st_off:%zd ed_off:%zd\n", cond, cond->found,cond->last_found,cond->start_offset,cond->end_offset); */ \
    return; \
  } \
} while (0)
| 120 | |
/*
 * GRN_BM_BM_COMPARE: verify a candidate match that ends just before `p`.
 *
 * Expects p, cp, ck, i, m, y and found in the expanding scope, plus
 * everything GRN_BM_COMPARE needs. Callers reach this macro only after
 * bmBc[p[-1]] was 0. The macro compares p[-2] with ck, then the remaining
 * bytes backwards against the bytes before cp. When all m bytes agree it
 * sets found = p - y - m and expands GRN_BM_COMPARE, which may return from
 * the enclosing function.
 */
#define GRN_BM_BM_COMPARE do { \
  if (p[-2] == ck) { \
    for (i = 3; i <= m && p[-(intptr_t)i] == cp[-(intptr_t)i]; ++i) { \
    } \
    if (i > m) { \
      found = p - y - m; \
      GRN_BM_COMPARE; \
    } \
  } \
} while (0)
| 131 | |
/*
 * Find the next occurrence of cond's keyword in `string`.
 *
 * The search runs over the normalized text of `string`, starting at
 * normalized position cond->found, and looks for the normalized text of
 * cond->keyword. On a hit, GRN_BM_COMPARE stores start_offset, end_offset,
 * last_found, last_offset, found and found_alpha_head in `cond` and returns
 * from this function. When nothing more is found, cond->stopflag is set to
 * SNIPCOND_STOP.
 *
 * ctx:    context passed to the grn_string accessors.
 * cond:   search state; also supplies the precomputed bmBc table and shift.
 * string: grn_string object to search.
 * flags:  only GRN_SNIP_SKIP_LEADING_SPACES is consulted (in GRN_BM_COMPARE).
 */
void
grn_bm_tunedbm(grn_ctx *ctx, snip_cond *cond, grn_obj *string, int flags)
{
  register unsigned char *limit, ck;
  register const unsigned char *p, *cp;
  register size_t *bmBc, delta1, i;

  const unsigned char *x;
  unsigned char *y;
  size_t shift, found;

  const char *string_original;
  unsigned int string_original_length_in_bytes;
  const short *string_checks;
  grn_encoding string_encoding;
  const char *string_norm, *keyword_norm;
  unsigned int n, m;

  grn_string_get_original(ctx, string,
                          &string_original, &string_original_length_in_bytes);
  string_checks = grn_string_get_checks(ctx, string);
  string_encoding = grn_string_get_encoding(ctx, string);
  /* n: normalized text length, m: normalized keyword length (both in bytes) */
  grn_string_get_normalized(ctx, string, &string_norm, &n, NULL);
  grn_string_get_normalized(ctx, cond->keyword, &keyword_norm, &m, NULL);

  y = (unsigned char *)string_norm;
  /* One-byte keyword: no shift table is built for it, so use memchr(). */
  if (m == 1) {
    if (n > cond->found) {
      shift = 1;
      p = memchr(y + cond->found, keyword_norm[0], n - cond->found);
      if (p != NULL) {
        found = p - y;
        /* Returns on success. NOTE(review): when string_checks[found] is 0
           the macro does nothing and the search stops below instead of
           trying a later occurrence -- confirm this is intended. */
        GRN_BM_COMPARE;
      }
    }
    cond->stopflag = SNIPCOND_STOP;
    return;
  }

  x = (unsigned char *)keyword_norm;
  bmBc = cond->bmBc;
  shift = cond->shift;

  /* Restart */
  /* p points one past the end of the current candidate window;
     cp points one past the end of the keyword;
     ck is the second-to-last keyword byte. */
  p = y + m + cond->found;
  cp = x + m;
  ck = cp[-2];

  /* 12 means 1(initial offset) + 10 (in loop) + 1 (shift) */
  if (n - cond->found > 12 * m) {
    /* Unrolled fast loop: up to ten table-driven skips per iteration.
       `limit` leaves enough room that these skips stay inside the text. */
    limit = y + n - 11 * m;
    while (p <= limit) {
      p += bmBc[p[-1]];
      if(!(delta1 = bmBc[p[-1]])) {
        goto check;
      }
      p += delta1;
      p += bmBc[p[-1]];
      p += bmBc[p[-1]];
      if(!(delta1 = bmBc[p[-1]])) {
        goto check;
      }
      p += delta1;
      p += bmBc[p[-1]];
      p += bmBc[p[-1]];
      if(!(delta1 = bmBc[p[-1]])) {
        goto check;
      }
      p += delta1;
      p += bmBc[p[-1]];
      p += bmBc[p[-1]];
      continue;
    check:
      /* A zero table entry means p[-1] equals the last keyword byte:
         verify the rest, then skip by `shift` if it was not a match. */
      GRN_BM_BM_COMPARE;
      p += shift;
    }
  }
  /* limit check + search */
  limit = y + n;
  while(p <= limit) {
    if (!(delta1 = bmBc[p[-1]])) {
      GRN_BM_BM_COMPARE;
      p += shift;
    }
    p += delta1;
  }
  /* Ran off the end of the text without a match. */
  cond->stopflag = SNIPCOND_STOP;
}
| 220 | |
/*
 * Compute how many bytes the range [str, end) occupies once the characters
 * '<', '>', '&' and '"' are each replaced by a longer sequence.
 *
 * '<' and '>' count as 4 bytes, '&' as 5, '"' as 6; every other byte
 * counts as 1. Returns 0 for an empty range.
 */
static size_t
count_mapped_chars(const char *str, const char *end)
{
  size_t total = 0;
  const char *cursor = str;

  while (cursor != end) {
    const char c = *cursor++;

    if (c == '<' || c == '>') {
      total += 4;
    } else if (c == '&') {
      total += 5;
    } else if (c == '"') {
      total += 6;
    } else {
      total += 1;
    }
  }
  return total;
}
| 247 | |
| 248 | grn_rc |
| 249 | grn_snip_cond_close(grn_ctx *ctx, snip_cond *cond) |
| 250 | { |
| 251 | if (!cond) { |
| 252 | return GRN_INVALID_ARGUMENT; |
| 253 | } |
| 254 | if (cond->keyword) { |
| 255 | grn_obj_close(ctx, cond->keyword); |
| 256 | } |
| 257 | return GRN_SUCCESS; |
| 258 | } |
| 259 | |
| 260 | grn_rc |
| 261 | grn_snip_cond_init(grn_ctx *ctx, snip_cond *sc, const char *keyword, unsigned int keyword_len, |
| 262 | grn_encoding enc, grn_obj *normalizer, int flags) |
| 263 | { |
| 264 | const char *norm; |
| 265 | unsigned int norm_blen; |
| 266 | int f = GRN_STR_REMOVEBLANK; |
| 267 | memset(sc, 0, sizeof(snip_cond)); |
| 268 | if (!(sc->keyword = grn_string_open(ctx, keyword, keyword_len, |
| 269 | normalizer, f))) { |
| 270 | GRN_LOG(ctx, GRN_LOG_ALERT, |
| 271 | "grn_string_open on snip_cond_init failed!" ); |
| 272 | return GRN_NO_MEMORY_AVAILABLE; |
| 273 | } |
| 274 | grn_string_get_normalized(ctx, sc->keyword, &norm, &norm_blen, NULL); |
| 275 | if (!norm_blen) { |
| 276 | grn_snip_cond_close(ctx, sc); |
| 277 | return GRN_INVALID_ARGUMENT; |
| 278 | } |
| 279 | if (norm_blen != 1) { |
| 280 | grn_bm_preBmBc((unsigned char *)norm, norm_blen, sc->bmBc); |
| 281 | sc->shift = sc->bmBc[(unsigned char)norm[norm_blen - 1]]; |
| 282 | sc->bmBc[(unsigned char)norm[norm_blen - 1]] = 0; |
| 283 | } |
| 284 | return GRN_SUCCESS; |
| 285 | } |
| 286 | |
| 287 | void |
| 288 | grn_snip_cond_reinit(snip_cond *cond) |
| 289 | { |
| 290 | cond->found = 0; |
| 291 | cond->last_found = 0; |
| 292 | cond->last_offset = 0; |
| 293 | cond->start_offset = 0; |
| 294 | cond->end_offset = 0; |
| 295 | |
| 296 | cond->count = 0; |
| 297 | cond->stopflag = SNIPCOND_NONSTOP; |
| 298 | } |
| 299 | |
| 300 | inline static char * |
| 301 | grn_snip_strndup(grn_ctx *ctx, const char *string, unsigned int string_len) |
| 302 | { |
| 303 | char *copied_string; |
| 304 | |
| 305 | copied_string = GRN_MALLOC(string_len + 1); |
| 306 | if (!copied_string) { |
| 307 | return NULL; |
| 308 | } |
| 309 | grn_memcpy(copied_string, string, string_len); |
| 310 | copied_string[string_len]= '\0'; /* not required, but for ql use */ |
| 311 | return copied_string; |
| 312 | } |
| 313 | |
| 314 | inline static grn_rc |
| 315 | grn_snip_cond_set_tag(grn_ctx *ctx, |
| 316 | const char **dest_tag, size_t *dest_tag_len, |
| 317 | const char *tag, unsigned int tag_len, |
| 318 | const char *default_tag, unsigned int default_tag_len, |
| 319 | int copy_tag) |
| 320 | { |
| 321 | if (tag) { |
| 322 | if (copy_tag) { |
| 323 | char *copied_tag; |
| 324 | copied_tag = grn_snip_strndup(ctx, tag, tag_len); |
| 325 | if (!copied_tag) { |
| 326 | return GRN_NO_MEMORY_AVAILABLE; |
| 327 | } |
| 328 | *dest_tag = copied_tag; |
| 329 | } else { |
| 330 | *dest_tag = tag; |
| 331 | } |
| 332 | *dest_tag_len = tag_len; |
| 333 | } else { |
| 334 | *dest_tag = default_tag; |
| 335 | *dest_tag_len = default_tag_len; |
| 336 | } |
| 337 | return GRN_SUCCESS; |
| 338 | } |
| 339 | |
| 340 | grn_rc |
| 341 | grn_snip_set_normalizer(grn_ctx *ctx, grn_obj *snip, |
| 342 | grn_obj *normalizer) |
| 343 | { |
| 344 | grn_snip *snip_; |
| 345 | if (!snip) { |
| 346 | return GRN_INVALID_ARGUMENT; |
| 347 | } |
| 348 | |
| 349 | snip_ = (grn_snip *)snip; |
| 350 | snip_->normalizer = normalizer; |
| 351 | return GRN_SUCCESS; |
| 352 | } |
| 353 | |
| 354 | grn_obj * |
| 355 | grn_snip_get_normalizer(grn_ctx *ctx, grn_obj *snip) |
| 356 | { |
| 357 | grn_snip *snip_; |
| 358 | |
| 359 | if (!snip) { |
| 360 | return NULL; |
| 361 | } |
| 362 | |
| 363 | snip_ = (grn_snip *)snip; |
| 364 | return snip_->normalizer; |
| 365 | } |
| 366 | |
| 367 | grn_rc |
| 368 | grn_snip_add_cond(grn_ctx *ctx, grn_obj *snip, |
| 369 | const char *keyword, unsigned int keyword_len, |
| 370 | const char *opentag, unsigned int opentag_len, |
| 371 | const char *closetag, unsigned int closetag_len) |
| 372 | { |
| 373 | grn_rc rc; |
| 374 | int copy_tag; |
| 375 | snip_cond *cond; |
| 376 | unsigned int norm_blen; |
| 377 | grn_snip *snip_; |
| 378 | |
| 379 | snip_ = (grn_snip *)snip; |
| 380 | if (!snip_ || !keyword || !keyword_len || snip_->cond_len >= MAX_SNIP_COND_COUNT) { |
| 381 | return GRN_INVALID_ARGUMENT; |
| 382 | } |
| 383 | |
| 384 | cond = snip_->cond + snip_->cond_len; |
| 385 | if ((rc = grn_snip_cond_init(ctx, cond, keyword, keyword_len, |
| 386 | snip_->encoding, snip_->normalizer, snip_->flags))) { |
| 387 | return rc; |
| 388 | } |
| 389 | grn_string_get_normalized(ctx, cond->keyword, NULL, &norm_blen, NULL); |
| 390 | if (norm_blen > snip_->width) { |
| 391 | grn_snip_cond_close(ctx, cond); |
| 392 | return GRN_INVALID_ARGUMENT; |
| 393 | } |
| 394 | |
| 395 | copy_tag = snip_->flags & GRN_SNIP_COPY_TAG; |
| 396 | rc = grn_snip_cond_set_tag(ctx, |
| 397 | &(cond->opentag), &(cond->opentag_len), |
| 398 | opentag, opentag_len, |
| 399 | snip_->defaultopentag, snip_->defaultopentag_len, |
| 400 | copy_tag); |
| 401 | if (rc) { |
| 402 | grn_snip_cond_close(ctx, cond); |
| 403 | return rc; |
| 404 | } |
| 405 | |
| 406 | rc = grn_snip_cond_set_tag(ctx, |
| 407 | &(cond->closetag), &(cond->closetag_len), |
| 408 | closetag, closetag_len, |
| 409 | snip_->defaultclosetag, snip_->defaultclosetag_len, |
| 410 | copy_tag); |
| 411 | if (rc) { |
| 412 | if (opentag && copy_tag) { |
| 413 | GRN_FREE((void *)cond->opentag); |
| 414 | } |
| 415 | grn_snip_cond_close(ctx, cond); |
| 416 | return rc; |
| 417 | } |
| 418 | |
| 419 | snip_->cond_len++; |
| 420 | return GRN_SUCCESS; |
| 421 | } |
| 422 | |
| 423 | static size_t |
| 424 | grn_snip_find_firstbyte(const char *string, grn_encoding encoding, size_t offset, |
| 425 | size_t doffset) |
| 426 | { |
| 427 | switch (encoding) { |
| 428 | case GRN_ENC_EUC_JP: |
| 429 | while (!(grn_bm_check_euc((unsigned char *) string, offset))) |
| 430 | offset += doffset; |
| 431 | break; |
| 432 | case GRN_ENC_SJIS: |
| 433 | if (!(grn_bm_check_sjis((unsigned char *) string, offset))) |
| 434 | offset += doffset; |
| 435 | break; |
| 436 | case GRN_ENC_UTF8: |
| 437 | while ((signed char)string[offset] <= (signed char)0xc0) |
| 438 | offset += doffset; |
| 439 | break; |
| 440 | default: |
| 441 | break; |
| 442 | } |
| 443 | return offset; |
| 444 | } |
| 445 | |
| 446 | inline static grn_rc |
| 447 | grn_snip_set_default_tag(grn_ctx *ctx, |
| 448 | const char **dest_tag, size_t *dest_tag_len, |
| 449 | const char *tag, unsigned int tag_len, |
| 450 | int copy_tag) |
| 451 | { |
| 452 | if (copy_tag && tag) { |
| 453 | char *copied_tag; |
| 454 | copied_tag = grn_snip_strndup(ctx, tag, tag_len); |
| 455 | if (!copied_tag) { |
| 456 | return GRN_NO_MEMORY_AVAILABLE; |
| 457 | } |
| 458 | *dest_tag = copied_tag; |
| 459 | } else { |
| 460 | *dest_tag = tag; |
| 461 | } |
| 462 | *dest_tag_len = tag_len; |
| 463 | return GRN_SUCCESS; |
| 464 | } |
| 465 | |
| 466 | grn_obj * |
| 467 | grn_snip_open(grn_ctx *ctx, int flags, unsigned int width, |
| 468 | unsigned int max_results, |
| 469 | const char *defaultopentag, unsigned int defaultopentag_len, |
| 470 | const char *defaultclosetag, unsigned int defaultclosetag_len, |
| 471 | grn_snip_mapping *mapping) |
| 472 | { |
| 473 | int copy_tag; |
| 474 | grn_snip *ret = NULL; |
| 475 | if (!(ret = GRN_MALLOC(sizeof(grn_snip)))) { |
| 476 | GRN_LOG(ctx, GRN_LOG_ALERT, "grn_snip allocation failed on grn_snip_open" ); |
| 477 | return NULL; |
| 478 | } |
| 479 | if (max_results > MAX_SNIP_RESULT_COUNT || max_results == 0) { |
| 480 | GRN_LOG(ctx, GRN_LOG_WARNING, "max_results is invalid on grn_snip_open" ); |
| 481 | GRN_FREE(ret); |
| 482 | return NULL; |
| 483 | } |
| 484 | GRN_API_ENTER; |
| 485 | ret->encoding = ctx->encoding; |
| 486 | ret->flags = flags; |
| 487 | ret->width = width; |
| 488 | ret->max_results = max_results; |
| 489 | ret->defaultopentag = NULL; |
| 490 | ret->defaultclosetag = NULL; |
| 491 | |
| 492 | copy_tag = flags & GRN_SNIP_COPY_TAG; |
| 493 | if (grn_snip_set_default_tag(ctx, |
| 494 | &(ret->defaultopentag), |
| 495 | &(ret->defaultopentag_len), |
| 496 | defaultopentag, defaultopentag_len, |
| 497 | copy_tag)) { |
| 498 | GRN_FREE(ret); |
| 499 | GRN_API_RETURN(NULL); |
| 500 | } |
| 501 | |
| 502 | if (grn_snip_set_default_tag(ctx, |
| 503 | &(ret->defaultclosetag), |
| 504 | &(ret->defaultclosetag_len), |
| 505 | defaultclosetag, defaultclosetag_len, |
| 506 | copy_tag)) { |
| 507 | if (copy_tag && ret->defaultopentag) { |
| 508 | GRN_FREE((void *)ret->defaultopentag); |
| 509 | } |
| 510 | GRN_FREE(ret); |
| 511 | GRN_API_RETURN(NULL); |
| 512 | } |
| 513 | |
| 514 | ret->cond_len = 0; |
| 515 | ret->mapping = mapping; |
| 516 | ret->nstr = NULL; |
| 517 | ret->tag_count = 0; |
| 518 | ret->snip_count = 0; |
| 519 | if (ret->flags & GRN_SNIP_NORMALIZE) { |
| 520 | ret->normalizer = GRN_NORMALIZER_AUTO; |
| 521 | } else { |
| 522 | ret->normalizer = NULL; |
| 523 | } |
| 524 | |
| 525 | GRN_DB_OBJ_SET_TYPE(ret, GRN_SNIP); |
| 526 | { |
| 527 | grn_obj *db; |
| 528 | grn_id id; |
| 529 | db = grn_ctx_db(ctx); |
| 530 | id = grn_obj_register(ctx, db, NULL, 0); |
| 531 | DB_OBJ(ret)->header.domain = GRN_ID_NIL; |
| 532 | DB_OBJ(ret)->range = GRN_ID_NIL; |
| 533 | grn_db_obj_init(ctx, db, id, DB_OBJ(ret)); |
| 534 | } |
| 535 | |
| 536 | GRN_API_RETURN((grn_obj *)ret); |
| 537 | } |
| 538 | |
| 539 | static grn_rc |
| 540 | exec_clean(grn_ctx *ctx, grn_snip *snip) |
| 541 | { |
| 542 | snip_cond *cond, *cond_end; |
| 543 | if (snip->nstr) { |
| 544 | grn_obj_close(ctx, snip->nstr); |
| 545 | snip->nstr = NULL; |
| 546 | } |
| 547 | snip->tag_count = 0; |
| 548 | snip->snip_count = 0; |
| 549 | for (cond = snip->cond, cond_end = cond + snip->cond_len; |
| 550 | cond < cond_end; cond++) { |
| 551 | grn_snip_cond_reinit(cond); |
| 552 | } |
| 553 | return GRN_SUCCESS; |
| 554 | } |
| 555 | |
| 556 | grn_rc |
| 557 | grn_snip_close(grn_ctx *ctx, grn_snip *snip) |
| 558 | { |
| 559 | snip_cond *cond, *cond_end; |
| 560 | if (!snip) { return GRN_INVALID_ARGUMENT; } |
| 561 | GRN_API_ENTER; |
| 562 | if (snip->flags & GRN_SNIP_COPY_TAG) { |
| 563 | int i; |
| 564 | snip_cond *sc; |
| 565 | const char *dot = snip->defaultopentag, *dct = snip->defaultclosetag; |
| 566 | for (i = snip->cond_len, sc = snip->cond; i; i--, sc++) { |
| 567 | if (sc->opentag != dot) { GRN_FREE((void *)sc->opentag); } |
| 568 | if (sc->closetag != dct) { GRN_FREE((void *)sc->closetag); } |
| 569 | } |
| 570 | if (dot) { GRN_FREE((void *)dot); } |
| 571 | if (dct) { GRN_FREE((void *)dct); } |
| 572 | } |
| 573 | if (snip->nstr) { |
| 574 | grn_obj_close(ctx, snip->nstr); |
| 575 | } |
| 576 | for (cond = snip->cond, cond_end = cond + snip->cond_len; |
| 577 | cond < cond_end; cond++) { |
| 578 | grn_snip_cond_close(ctx, cond); |
| 579 | } |
| 580 | GRN_FREE(snip); |
| 581 | GRN_API_RETURN(GRN_SUCCESS); |
| 582 | } |
| 583 | |
/*
 * Search `string` for every registered condition and plan the snippets.
 *
 * Previous results are discarded, the string is opened as a grn_string with
 * the snippet's normalizer, and the keyword hits are grouped into at most
 * snip_->max_results snippets. The tag and snippet descriptions are stored
 * in snip_->tag_result / snip_->snip_result for grn_snip_get_result().
 *
 * string / string_len: text to search. The pointer is stored in the snippet
 *                      object (snip_->string) and read again by
 *                      grn_snip_get_result(), so it must stay valid until
 *                      then.
 * nresults:            receives the number of snippets planned.
 * max_tagged_len:      receives the largest tagged snippet size computed
 *                      here, including one extra byte per snippet.
 *
 * Returns GRN_INVALID_ARGUMENT when any pointer argument is NULL, otherwise
 * ctx->rc.
 */
grn_rc
grn_snip_exec(grn_ctx *ctx, grn_obj *snip, const char *string, unsigned int string_len,
              unsigned int *nresults, unsigned int *max_tagged_len)
{
  size_t i;
  grn_snip *snip_;
  int f = GRN_STR_WITH_CHECKS|GRN_STR_REMOVEBLANK;
  if (!snip || !string || !nresults || !max_tagged_len) {
    return GRN_INVALID_ARGUMENT;
  }
  GRN_API_ENTER;
  snip_ = (grn_snip *)snip;
  exec_clean(ctx, snip_);
  *nresults = 0;
  snip_->nstr = grn_string_open(ctx, string, string_len, snip_->normalizer, f);
  if (!snip_->nstr) {
    exec_clean(ctx, snip_);
    GRN_LOG(ctx, GRN_LOG_ALERT, "grn_string_open on grn_snip_exec failed !" );
    GRN_API_RETURN(ctx->rc);
  }
  /* prime every condition with its first hit (or mark it stopped) */
  for (i = 0; i < snip_->cond_len; i++) {
    grn_bm_tunedbm(ctx, snip_->cond + i, snip_->nstr, snip_->flags);
  }

  {
    _snip_tag_result *tag_result = snip_->tag_result;
    _snip_result *snip_result = snip_->snip_result;
    size_t last_end_offset = 0, last_last_end_offset = 0;
    unsigned int unfound_cond_count = snip_->cond_len;

    *max_tagged_len = 0;
    /* outer loop: one iteration per snippet */
    while (1) {
      size_t tagged_len = 0, last_tag_end = 0;
      int_least8_t all_stop = 1, found_cond = 0;
      snip_result->tag_count = 0;

      /* inner loop: collect the tags that fall into the current snippet */
      while (1) {
        size_t min_start_offset = (size_t) -1;
        size_t max_end_offset = 0;
        snip_cond *cond = NULL;

        /* get condition which have minimum offset and is not stopped */
        /* (ties on the start offset go to the larger end offset) */
        for (i = 0; i < snip_->cond_len; i++) {
          if (snip_->cond[i].stopflag == SNIPCOND_NONSTOP &&
              (min_start_offset > snip_->cond[i].start_offset ||
               (min_start_offset == snip_->cond[i].start_offset &&
                max_end_offset < snip_->cond[i].end_offset))) {
            min_start_offset = snip_->cond[i].start_offset;
            max_end_offset = snip_->cond[i].end_offset;
            cond = &snip_->cond[i];
          }
        }
        if (!cond) {
          break;
        }
        /* check whether condition is the first condition in snippet */
        if (snip_result->tag_count == 0) {
          /* skip condition if the number of rest snippet field is smaller than */
          /* the number of unfound keywords. */
          if (snip_->max_results - *nresults <= unfound_cond_count && cond->count > 0) {
            int_least8_t exclude_other_cond = 1;
            for (i = 0; i < snip_->cond_len; i++) {
              if ((snip_->cond + i) != cond
                  && snip_->cond[i].end_offset <= cond->start_offset + snip_->width
                  && snip_->cond[i].count == 0) {
                exclude_other_cond = 0;
              }
            }
            if (exclude_other_cond) {
              grn_bm_tunedbm(ctx, cond, snip_->nstr, snip_->flags);
              continue;
            }
          }
          snip_result->start_offset = cond->start_offset;
          snip_result->first_tag_result_idx = snip_->tag_count;
        } else {
          /* the hit starts beyond this snippet: leave it for the next one */
          if (cond->start_offset >= snip_result->start_offset + snip_->width) {
            break;
          }
          /* check nesting to make valid HTML */
          /* ToDo: allow <test><te>te</te><st>st</st></test> */
          if (cond->start_offset < last_tag_end) {
            grn_bm_tunedbm(ctx, cond, snip_->nstr, snip_->flags);
            continue;
          }
        }
        if (cond->end_offset > snip_result->start_offset + snip_->width) {
          /* If a keyword gets across a snippet, */
          /* it was skipped and never to be tagged. */
          cond->stopflag = SNIPCOND_ACROSS;
          grn_bm_tunedbm(ctx, cond, snip_->nstr, snip_->flags);
        } else {
          found_cond = 1;
          if (cond->count == 0) {
            unfound_cond_count--;
          }
          cond->count++;
          last_end_offset = cond->end_offset;

          tag_result->cond = cond;
          tag_result->start_offset = cond->start_offset;
          tag_result->end_offset = last_tag_end = cond->end_offset;

          snip_result->tag_count++;
          tag_result++;
          tagged_len += cond->opentag_len + cond->closetag_len;
          if (++snip_->tag_count >= MAX_SNIP_TAG_COUNT) {
            break;
          }
          /* advance this condition to its next hit */
          grn_bm_tunedbm(ctx, cond, snip_->nstr, snip_->flags);
        }
      }
      if (!found_cond) {
        break;
      }
      /* position the snippet window around the collected tags */
      if (snip_result->start_offset + last_end_offset < snip_->width) {
        snip_result->start_offset = 0;
      } else {
        snip_result->start_offset =
          MAX(MIN
              ((snip_result->start_offset + last_end_offset - snip_->width) / 2,
               string_len - snip_->width), last_last_end_offset);
      }
      /* move the start forward according to the encoding */
      snip_result->start_offset =
        grn_snip_find_firstbyte(string, snip_->encoding, snip_result->start_offset, 1);

      snip_result->end_offset = snip_result->start_offset + snip_->width;
      if (snip_result->end_offset < string_len) {
        /* move the end backward according to the encoding */
        snip_result->end_offset =
          grn_snip_find_firstbyte(string, snip_->encoding, snip_result->end_offset, -1);
      } else {
        snip_result->end_offset = string_len;
      }
      last_last_end_offset = snip_result->end_offset;

      /* NOTE(review): grn_snip_get_result() tests the mapping against
         GRN_SNIP_MAPPING_HTML_ESCAPE; presumably that is this same
         (grn_snip_mapping *) -1 value -- confirm. */
      if (snip_->mapping == (grn_snip_mapping *) -1) {
        tagged_len +=
          count_mapped_chars(&string[snip_result->start_offset],
                             &string[snip_result->end_offset]) + 1;
      } else {
        tagged_len += snip_result->end_offset - snip_result->start_offset + 1;
      }

      *max_tagged_len = MAX(*max_tagged_len, tagged_len);

      snip_result->last_tag_result_idx = snip_->tag_count - 1;
      (*nresults)++;
      snip_result++;

      if (*nresults == snip_->max_results || snip_->tag_count == MAX_SNIP_TAG_COUNT) {
        break;
      }
      /* re-arm every condition that has not run out of hits */
      for (i = 0; i < snip_->cond_len; i++) {
        if (snip_->cond[i].stopflag != SNIPCOND_STOP) {
          all_stop = 0;
          snip_->cond[i].stopflag = SNIPCOND_NONSTOP;
        }
      }
      if (all_stop) {
        break;
      }
    }
  }
  snip_->snip_count = *nresults;
  snip_->string = string;

  snip_->max_tagged_len = *max_tagged_len;

  GRN_API_RETURN(ctx->rc);
}
| 754 | |
| 755 | grn_rc |
| 756 | grn_snip_get_result(grn_ctx *ctx, grn_obj *snip, const unsigned int index, char *result, unsigned int *result_len) |
| 757 | { |
| 758 | char *p; |
| 759 | size_t i, j, k; |
| 760 | _snip_result *sres; |
| 761 | grn_snip *snip_; |
| 762 | |
| 763 | snip_ = (grn_snip *)snip; |
| 764 | if (snip_->snip_count <= index || !snip_->nstr) { |
| 765 | return GRN_INVALID_ARGUMENT; |
| 766 | } |
| 767 | |
| 768 | GRN_ASSERT(snip_->snip_count != 0 && snip_->tag_count != 0); |
| 769 | |
| 770 | GRN_API_ENTER; |
| 771 | sres = &snip_->snip_result[index]; |
| 772 | j = sres->first_tag_result_idx; |
| 773 | for (p = result, i = sres->start_offset; i < sres->end_offset; i++) { |
| 774 | for (; j <= sres->last_tag_result_idx && snip_->tag_result[j].start_offset == i; j++) { |
| 775 | if (snip_->tag_result[j].end_offset > sres->end_offset) { |
| 776 | continue; |
| 777 | } |
| 778 | grn_memcpy(p, |
| 779 | snip_->tag_result[j].cond->opentag, |
| 780 | snip_->tag_result[j].cond->opentag_len); |
| 781 | p += snip_->tag_result[j].cond->opentag_len; |
| 782 | } |
| 783 | |
| 784 | if (snip_->mapping == GRN_SNIP_MAPPING_HTML_ESCAPE) { |
| 785 | switch (snip_->string[i]) { |
| 786 | case '<': |
| 787 | *p++ = '&'; |
| 788 | *p++ = 'l'; |
| 789 | *p++ = 't'; |
| 790 | *p++ = ';'; |
| 791 | break; |
| 792 | case '>': |
| 793 | *p++ = '&'; |
| 794 | *p++ = 'g'; |
| 795 | *p++ = 't'; |
| 796 | *p++ = ';'; |
| 797 | break; |
| 798 | case '&': |
| 799 | *p++ = '&'; |
| 800 | *p++ = 'a'; |
| 801 | *p++ = 'm'; |
| 802 | *p++ = 'p'; |
| 803 | *p++ = ';'; |
| 804 | break; |
| 805 | case '"': |
| 806 | *p++ = '&'; |
| 807 | *p++ = 'q'; |
| 808 | *p++ = 'u'; |
| 809 | *p++ = 'o'; |
| 810 | *p++ = 't'; |
| 811 | *p++ = ';'; |
| 812 | break; |
| 813 | default: |
| 814 | *p++ = snip_->string[i]; |
| 815 | break; |
| 816 | } |
| 817 | } else { |
| 818 | *p++ = snip_->string[i]; |
| 819 | } |
| 820 | |
| 821 | for (k = sres->last_tag_result_idx; |
| 822 | snip_->tag_result[k].end_offset <= sres->end_offset; k--) { |
| 823 | /* TODO: avoid all loop */ |
| 824 | if (snip_->tag_result[k].end_offset == i + 1) { |
| 825 | grn_memcpy(p, |
| 826 | snip_->tag_result[k].cond->closetag, |
| 827 | snip_->tag_result[k].cond->closetag_len); |
| 828 | p += snip_->tag_result[k].cond->closetag_len; |
| 829 | } |
| 830 | if (k <= sres->first_tag_result_idx) { |
| 831 | break; |
| 832 | } |
| 833 | }; |
| 834 | } |
| 835 | *p = '\0'; |
| 836 | |
| 837 | if(result_len) { *result_len = (unsigned int)(p - result); } |
| 838 | GRN_ASSERT((unsigned int)(p - result) <= snip_->max_tagged_len); |
| 839 | |
| 840 | GRN_API_RETURN(ctx->rc); |
| 841 | } |
| 842 | |