1 | /* -*- c-basic-offset: 2 -*- */ |
2 | /* Copyright(C) 2009-2014 Brazil |
3 | |
4 | This library is free software; you can redistribute it and/or |
5 | modify it under the terms of the GNU Lesser General Public |
6 | License version 2.1 as published by the Free Software Foundation. |
7 | |
8 | This library is distributed in the hope that it will be useful, |
9 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
11 | Lesser General Public License for more details. |
12 | |
13 | You should have received a copy of the GNU Lesser General Public |
14 | License along with this library; if not, write to the Free Software |
15 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
16 | */ |
17 | #include "grn.h" |
18 | #include <string.h> |
19 | #include <stddef.h> |
20 | #include "grn_snip.h" |
21 | #include "grn_ctx.h" |
22 | |
/* Larger of two values. Each argument may be evaluated twice, so do not pass
   expressions with side effects. Left alone if MAX is already defined. */
#if !defined MAX
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif

/* Smaller of two values. Each argument may be evaluated twice, so do not pass
   expressions with side effects. Left alone if MIN is already defined. */
#if !defined MIN
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
30 | |
/*
 * Counts the bytes with the high bit set (>= 0x80) that immediately precede
 * offset `y` in `x` and returns 1 when that count is even, 0 when it is odd.
 *
 * x - start of the byte string; x[0 .. y-1] must be readable.
 * y - offset to test; 0 is allowed and yields 1.
 *
 * The scan is index based so that no pointer before `x` is ever formed, even
 * when every byte in front of `y` has the high bit set or when y == 0.
 */
static int
grn_bm_check_euc(const unsigned char *x, const size_t y)
{
  size_t high_bytes = 0;

  while (high_bytes < y && x[y - 1 - high_bytes] >= 0x80U) {
    high_bytes++;
  }
  return (int) ((high_bytes + 1) & 1);
}
38 | |
/*
 * Counts the bytes in the ranges 0x81-0x9f / 0xe0-0xfc that immediately
 * precede offset `y` in `x` and returns 1 when that count is even, 0 when it
 * is odd.
 *
 * x - start of the byte string; x[0 .. y-1] must be readable.
 * y - offset to test; 0 is allowed and yields 1.
 *
 * The scan is index based so that no pointer before `x` is ever formed, even
 * when every byte in front of `y` is inside those ranges or when y == 0.
 */
static int
grn_bm_check_sjis(const unsigned char *x, const size_t y)
{
  size_t lead_bytes = 0;

  while (lead_bytes < y) {
    const unsigned char c = x[y - 1 - lead_bytes];
    if ((c < 0x81U) || (c > 0x9fU && c < 0xe0U) || (c > 0xfcU)) {
      break;
    }
    lead_bytes++;
  }
  return (int) ((lead_bytes + 1) & 1);
}
48 | |
49 | /* |
50 | static void |
51 | grn_bm_suffixes(const unsigned char *x, size_t m, size_t *suff) |
52 | { |
53 | size_t f, g; |
54 | intptr_t i; |
55 | f = 0; |
56 | suff[m - 1] = m; |
57 | g = m - 1; |
58 | for (i = m - 2; i >= 0; --i) { |
59 | if (i > (intptr_t) g && suff[i + m - 1 - f] < i - g) |
60 | suff[i] = suff[i + m - 1 - f]; |
61 | else { |
62 | if (i < (intptr_t) g) |
63 | g = i; |
64 | f = i; |
65 | while (g > 0 && x[g] == x[g + m - 1 - f]) |
66 | --g; |
67 | suff[i] = f - g; |
68 | } |
69 | } |
70 | } |
71 | */ |
72 | |
73 | static void |
74 | grn_bm_preBmBc(const unsigned char *x, size_t m, size_t *bmBc) |
75 | { |
76 | size_t i; |
77 | for (i = 0; i < ASIZE; ++i) { |
78 | bmBc[i] = m; |
79 | } |
80 | for (i = 0; i < m - 1; ++i) { |
81 | bmBc[(unsigned int) x[i]] = m - (i + 1); |
82 | } |
83 | } |
84 | |
/*
 * GRN_BM_COMPARE - accept a hit at normalized offset `found` and return from
 * the enclosing function.
 *
 * This is not a self-contained macro: it reads and writes these names from
 * the scope it is expanded in: string_checks, found, cond, i, flags,
 * string_original, string_original_length_in_bytes, string_encoding, m, shift.
 *
 * If string_checks[found] is 0 the macro does nothing and execution continues
 * after it. Otherwise it:
 *   - adds every positive string_checks[] entry in [cond->last_found, found)
 *     to cond->last_offset to obtain the offset used for cond->start_offset
 *     (presumably a positive entry is the number of original bytes behind the
 *     normalized character starting there - confirm against
 *     grn_string_get_checks);
 *   - when string_checks[found] is negative, backs the offset up to the last
 *     position that had a positive entry;
 *   - optionally advances cond->start_offset over leading spaces in the
 *     original string (GRN_SNIP_SKIP_LEADING_SPACES);
 *   - keeps summing positive entries up to found + m to get cond->end_offset;
 *   - stores found + shift in cond->found as the resume position;
 *   - executes `return;`, so it may only be used in a function returning void.
 */
#define GRN_BM_COMPARE do { \
  if (string_checks[found]) { \
    size_t offset = cond->last_offset, found_alpha_head = cond->found_alpha_head; \
    /* calc real offset: walk on from where the previous hit stopped */\
    for (i = cond->last_found; i < found; i++) { \
      if (string_checks[i] > 0) { \
        found_alpha_head = i; \
        offset += string_checks[i]; \
      } \
    } \
    /* if real offset is inside a character, move it to the head of the character */ \
    if (string_checks[found] < 0) { \
      offset -= string_checks[found_alpha_head]; \
      cond->last_found = found_alpha_head; \
    } else { \
      cond->last_found = found; \
    } \
    cond->start_offset = cond->last_offset = offset; \
    if (flags & GRN_SNIP_SKIP_LEADING_SPACES) { \
      /* grn_isspace's non-zero result is used as the number of bytes to skip */ \
      while (cond->start_offset < string_original_length_in_bytes && \
             (i = grn_isspace(string_original + cond->start_offset, \
                              string_encoding))) { cond->start_offset += i; } \
    } \
    /* extend the running offset over the m matched normalized bytes */ \
    for (i = cond->last_found; i < found + m; i++) { \
      if (string_checks[i] > 0) { \
        offset += string_checks[i]; \
      } \
    } \
    cond->end_offset = offset; \
    cond->found = found + shift; \
    cond->found_alpha_head = found_alpha_head; \
    /* printf("bm: cond:%p found:%zd last_found:%zd st_off:%zd ed_off:%zd\n", cond, cond->found,cond->last_found,cond->start_offset,cond->end_offset); */ \
    return; \
  } \
} while (0)
120 | |
/*
 * GRN_BM_BM_COMPARE - verify a candidate whose end is at `p`.
 *
 * Uses these names from the expanding scope: p, cp, ck, i, m, y, found.
 * p[-1] is not compared here; the callers only expand this macro after the
 * skip table returned 0 for p[-1]. The macro checks p[-2] against ck and then
 * the remaining bytes from the third-last one backwards against the pattern
 * (cp points one past the pattern's end). When all m bytes agree it sets
 * found = p - y - m and expands GRN_BM_COMPARE, which may return from the
 * enclosing function.
 *
 * Requires m >= 2 because p[-2] is always read.
 */
#define GRN_BM_BM_COMPARE do { \
  if (p[-2] == ck) { \
    for (i = 3; i <= m && p[-(intptr_t)i] == cp[-(intptr_t)i]; ++i) { \
    } \
    if (i > m) { \
      found = p - y - m; \
      GRN_BM_COMPARE; \
    } \
  } \
} while (0)
131 | |
/*
 * Finds the next occurrence of the normalized keyword of `cond` in the
 * normalized form of `string`, resuming at normalized offset cond->found.
 *
 * On an accepted hit, GRN_BM_COMPARE fills cond->start_offset,
 * cond->end_offset, cond->last_found, cond->last_offset and
 * cond->found_alpha_head, advances cond->found and returns from this
 * function. When the text is exhausted without an accepted hit,
 * cond->stopflag is set to SNIPCOND_STOP.
 *
 * ctx    - context handed to the grn_string_* accessors.
 * cond   - search state. For keywords longer than one byte cond->bmBc is the
 *          skip table and cond->shift the distance moved after a candidate;
 *          both are prepared by grn_snip_cond_init.
 * string - string object to search; its original bytes, checks array,
 *          encoding and normalized bytes are read.
 * flags  - only tested for GRN_SNIP_SKIP_LEADING_SPACES (in GRN_BM_COMPARE).
 */
void
grn_bm_tunedbm(grn_ctx *ctx, snip_cond *cond, grn_obj *string, int flags)
{
  register unsigned char *limit, ck;
  register const unsigned char *p, *cp;
  register size_t *bmBc, delta1, i;

  const unsigned char *x;
  unsigned char *y;
  size_t shift, found;

  const char *string_original;
  unsigned int string_original_length_in_bytes;
  const short *string_checks;
  grn_encoding string_encoding;
  const char *string_norm, *keyword_norm;
  unsigned int n, m;

  grn_string_get_original(ctx, string,
                          &string_original, &string_original_length_in_bytes);
  string_checks = grn_string_get_checks(ctx, string);
  string_encoding = grn_string_get_encoding(ctx, string);
  /* n: normalized text length in bytes, m: normalized keyword length in bytes */
  grn_string_get_normalized(ctx, string, &string_norm, &n, NULL);
  grn_string_get_normalized(ctx, cond->keyword, &keyword_norm, &m, NULL);

  y = (unsigned char *)string_norm;
  if (m == 1) {
    /* Single-byte keyword: no skip table is built for it, so use memchr. */
    if (n > cond->found) {
      shift = 1;
      p = memchr(y + cond->found, keyword_norm[0], n - cond->found);
      if (p != NULL) {
        found = p - y;
        GRN_BM_COMPARE;
      }
    }
    /* Reached when the byte was not found, or when GRN_BM_COMPARE did not
       accept the hit (string_checks[found] == 0). */
    cond->stopflag = SNIPCOND_STOP;
    return;
  }

  x = (unsigned char *)keyword_norm;
  bmBc = cond->bmBc;
  shift = cond->shift;

  /* Restart */
  /* p points one past the end of the current alignment of the keyword,
     cp one past the end of the keyword, ck is its second-last byte. */
  p = y + m + cond->found;
  cp = x + m;
  ck = cp[-2];

  /* 12 means 1(initial offset) + 10 (in loop) + 1 (shift) */
  if (n - cond->found > 12 * m) {
    /* Unrolled skip loop. One round performs at most ten skips of at most m
       bytes each, plus one shift after a candidate, so stopping 11 * m bytes
       before the end keeps every p[-1] read inside the text. */
    limit = y + n - 11 * m;
    while (p <= limit) {
      p += bmBc[p[-1]];
      /* a zero table entry means p[-1] is the keyword's last byte */
      if(!(delta1 = bmBc[p[-1]])) {
        goto check;
      }
      p += delta1;
      p += bmBc[p[-1]];
      p += bmBc[p[-1]];
      if(!(delta1 = bmBc[p[-1]])) {
        goto check;
      }
      p += delta1;
      p += bmBc[p[-1]];
      p += bmBc[p[-1]];
      if(!(delta1 = bmBc[p[-1]])) {
        goto check;
      }
      p += delta1;
      p += bmBc[p[-1]];
      p += bmBc[p[-1]];
      continue;
    check:
      /* verify the rest of the keyword; returns from the function on a hit */
      GRN_BM_BM_COMPARE;
      p += shift;
    }
  }
  /* limit check + search */
  /* Tail of the text: one skip per iteration, bounded by the real end. */
  limit = y + n;
  while(p <= limit) {
    if (!(delta1 = bmBc[p[-1]])) {
      GRN_BM_BM_COMPARE;
      p += shift;
    }
    /* delta1 is 0 on the candidate path, so this only moves p otherwise */
    p += delta1;
  }
  cond->stopflag = SNIPCOND_STOP;
}
220 | |
/*
 * Returns how many bytes the range [str, end) occupies once it has been
 * HTML-escaped: '<' and '>' count 4, '&' counts 5, '"' counts 6 and every
 * other byte counts 1. `end` must be reachable from `str` by incrementing.
 */
static size_t
count_mapped_chars(const char *str, const char *end)
{
  size_t mapped_len = 0;
  const char *cursor = str;

  while (cursor != end) {
    const char c = *cursor++;

    if (c == '<' || c == '>') {
      mapped_len += 4; /* "&lt;" or "&gt;" */
    } else if (c == '&') {
      mapped_len += 5; /* "&amp;" */
    } else if (c == '"') {
      mapped_len += 6; /* "&quot;" */
    } else {
      mapped_len += 1;
    }
  }
  return mapped_len;
}
247 | |
248 | grn_rc |
249 | grn_snip_cond_close(grn_ctx *ctx, snip_cond *cond) |
250 | { |
251 | if (!cond) { |
252 | return GRN_INVALID_ARGUMENT; |
253 | } |
254 | if (cond->keyword) { |
255 | grn_obj_close(ctx, cond->keyword); |
256 | } |
257 | return GRN_SUCCESS; |
258 | } |
259 | |
260 | grn_rc |
261 | grn_snip_cond_init(grn_ctx *ctx, snip_cond *sc, const char *keyword, unsigned int keyword_len, |
262 | grn_encoding enc, grn_obj *normalizer, int flags) |
263 | { |
264 | const char *norm; |
265 | unsigned int norm_blen; |
266 | int f = GRN_STR_REMOVEBLANK; |
267 | memset(sc, 0, sizeof(snip_cond)); |
268 | if (!(sc->keyword = grn_string_open(ctx, keyword, keyword_len, |
269 | normalizer, f))) { |
270 | GRN_LOG(ctx, GRN_LOG_ALERT, |
271 | "grn_string_open on snip_cond_init failed!" ); |
272 | return GRN_NO_MEMORY_AVAILABLE; |
273 | } |
274 | grn_string_get_normalized(ctx, sc->keyword, &norm, &norm_blen, NULL); |
275 | if (!norm_blen) { |
276 | grn_snip_cond_close(ctx, sc); |
277 | return GRN_INVALID_ARGUMENT; |
278 | } |
279 | if (norm_blen != 1) { |
280 | grn_bm_preBmBc((unsigned char *)norm, norm_blen, sc->bmBc); |
281 | sc->shift = sc->bmBc[(unsigned char)norm[norm_blen - 1]]; |
282 | sc->bmBc[(unsigned char)norm[norm_blen - 1]] = 0; |
283 | } |
284 | return GRN_SUCCESS; |
285 | } |
286 | |
287 | void |
288 | grn_snip_cond_reinit(snip_cond *cond) |
289 | { |
290 | cond->found = 0; |
291 | cond->last_found = 0; |
292 | cond->last_offset = 0; |
293 | cond->start_offset = 0; |
294 | cond->end_offset = 0; |
295 | |
296 | cond->count = 0; |
297 | cond->stopflag = SNIPCOND_NONSTOP; |
298 | } |
299 | |
300 | inline static char * |
301 | grn_snip_strndup(grn_ctx *ctx, const char *string, unsigned int string_len) |
302 | { |
303 | char *copied_string; |
304 | |
305 | copied_string = GRN_MALLOC(string_len + 1); |
306 | if (!copied_string) { |
307 | return NULL; |
308 | } |
309 | grn_memcpy(copied_string, string, string_len); |
310 | copied_string[string_len]= '\0'; /* not required, but for ql use */ |
311 | return copied_string; |
312 | } |
313 | |
314 | inline static grn_rc |
315 | grn_snip_cond_set_tag(grn_ctx *ctx, |
316 | const char **dest_tag, size_t *dest_tag_len, |
317 | const char *tag, unsigned int tag_len, |
318 | const char *default_tag, unsigned int default_tag_len, |
319 | int copy_tag) |
320 | { |
321 | if (tag) { |
322 | if (copy_tag) { |
323 | char *copied_tag; |
324 | copied_tag = grn_snip_strndup(ctx, tag, tag_len); |
325 | if (!copied_tag) { |
326 | return GRN_NO_MEMORY_AVAILABLE; |
327 | } |
328 | *dest_tag = copied_tag; |
329 | } else { |
330 | *dest_tag = tag; |
331 | } |
332 | *dest_tag_len = tag_len; |
333 | } else { |
334 | *dest_tag = default_tag; |
335 | *dest_tag_len = default_tag_len; |
336 | } |
337 | return GRN_SUCCESS; |
338 | } |
339 | |
340 | grn_rc |
341 | grn_snip_set_normalizer(grn_ctx *ctx, grn_obj *snip, |
342 | grn_obj *normalizer) |
343 | { |
344 | grn_snip *snip_; |
345 | if (!snip) { |
346 | return GRN_INVALID_ARGUMENT; |
347 | } |
348 | |
349 | snip_ = (grn_snip *)snip; |
350 | snip_->normalizer = normalizer; |
351 | return GRN_SUCCESS; |
352 | } |
353 | |
354 | grn_obj * |
355 | grn_snip_get_normalizer(grn_ctx *ctx, grn_obj *snip) |
356 | { |
357 | grn_snip *snip_; |
358 | |
359 | if (!snip) { |
360 | return NULL; |
361 | } |
362 | |
363 | snip_ = (grn_snip *)snip; |
364 | return snip_->normalizer; |
365 | } |
366 | |
367 | grn_rc |
368 | grn_snip_add_cond(grn_ctx *ctx, grn_obj *snip, |
369 | const char *keyword, unsigned int keyword_len, |
370 | const char *opentag, unsigned int opentag_len, |
371 | const char *closetag, unsigned int closetag_len) |
372 | { |
373 | grn_rc rc; |
374 | int copy_tag; |
375 | snip_cond *cond; |
376 | unsigned int norm_blen; |
377 | grn_snip *snip_; |
378 | |
379 | snip_ = (grn_snip *)snip; |
380 | if (!snip_ || !keyword || !keyword_len || snip_->cond_len >= MAX_SNIP_COND_COUNT) { |
381 | return GRN_INVALID_ARGUMENT; |
382 | } |
383 | |
384 | cond = snip_->cond + snip_->cond_len; |
385 | if ((rc = grn_snip_cond_init(ctx, cond, keyword, keyword_len, |
386 | snip_->encoding, snip_->normalizer, snip_->flags))) { |
387 | return rc; |
388 | } |
389 | grn_string_get_normalized(ctx, cond->keyword, NULL, &norm_blen, NULL); |
390 | if (norm_blen > snip_->width) { |
391 | grn_snip_cond_close(ctx, cond); |
392 | return GRN_INVALID_ARGUMENT; |
393 | } |
394 | |
395 | copy_tag = snip_->flags & GRN_SNIP_COPY_TAG; |
396 | rc = grn_snip_cond_set_tag(ctx, |
397 | &(cond->opentag), &(cond->opentag_len), |
398 | opentag, opentag_len, |
399 | snip_->defaultopentag, snip_->defaultopentag_len, |
400 | copy_tag); |
401 | if (rc) { |
402 | grn_snip_cond_close(ctx, cond); |
403 | return rc; |
404 | } |
405 | |
406 | rc = grn_snip_cond_set_tag(ctx, |
407 | &(cond->closetag), &(cond->closetag_len), |
408 | closetag, closetag_len, |
409 | snip_->defaultclosetag, snip_->defaultclosetag_len, |
410 | copy_tag); |
411 | if (rc) { |
412 | if (opentag && copy_tag) { |
413 | GRN_FREE((void *)cond->opentag); |
414 | } |
415 | grn_snip_cond_close(ctx, cond); |
416 | return rc; |
417 | } |
418 | |
419 | snip_->cond_len++; |
420 | return GRN_SUCCESS; |
421 | } |
422 | |
423 | static size_t |
424 | grn_snip_find_firstbyte(const char *string, grn_encoding encoding, size_t offset, |
425 | size_t doffset) |
426 | { |
427 | switch (encoding) { |
428 | case GRN_ENC_EUC_JP: |
429 | while (!(grn_bm_check_euc((unsigned char *) string, offset))) |
430 | offset += doffset; |
431 | break; |
432 | case GRN_ENC_SJIS: |
433 | if (!(grn_bm_check_sjis((unsigned char *) string, offset))) |
434 | offset += doffset; |
435 | break; |
436 | case GRN_ENC_UTF8: |
437 | while ((signed char)string[offset] <= (signed char)0xc0) |
438 | offset += doffset; |
439 | break; |
440 | default: |
441 | break; |
442 | } |
443 | return offset; |
444 | } |
445 | |
446 | inline static grn_rc |
447 | grn_snip_set_default_tag(grn_ctx *ctx, |
448 | const char **dest_tag, size_t *dest_tag_len, |
449 | const char *tag, unsigned int tag_len, |
450 | int copy_tag) |
451 | { |
452 | if (copy_tag && tag) { |
453 | char *copied_tag; |
454 | copied_tag = grn_snip_strndup(ctx, tag, tag_len); |
455 | if (!copied_tag) { |
456 | return GRN_NO_MEMORY_AVAILABLE; |
457 | } |
458 | *dest_tag = copied_tag; |
459 | } else { |
460 | *dest_tag = tag; |
461 | } |
462 | *dest_tag_len = tag_len; |
463 | return GRN_SUCCESS; |
464 | } |
465 | |
466 | grn_obj * |
467 | grn_snip_open(grn_ctx *ctx, int flags, unsigned int width, |
468 | unsigned int max_results, |
469 | const char *defaultopentag, unsigned int defaultopentag_len, |
470 | const char *defaultclosetag, unsigned int defaultclosetag_len, |
471 | grn_snip_mapping *mapping) |
472 | { |
473 | int copy_tag; |
474 | grn_snip *ret = NULL; |
475 | if (!(ret = GRN_MALLOC(sizeof(grn_snip)))) { |
476 | GRN_LOG(ctx, GRN_LOG_ALERT, "grn_snip allocation failed on grn_snip_open" ); |
477 | return NULL; |
478 | } |
479 | if (max_results > MAX_SNIP_RESULT_COUNT || max_results == 0) { |
480 | GRN_LOG(ctx, GRN_LOG_WARNING, "max_results is invalid on grn_snip_open" ); |
481 | GRN_FREE(ret); |
482 | return NULL; |
483 | } |
484 | GRN_API_ENTER; |
485 | ret->encoding = ctx->encoding; |
486 | ret->flags = flags; |
487 | ret->width = width; |
488 | ret->max_results = max_results; |
489 | ret->defaultopentag = NULL; |
490 | ret->defaultclosetag = NULL; |
491 | |
492 | copy_tag = flags & GRN_SNIP_COPY_TAG; |
493 | if (grn_snip_set_default_tag(ctx, |
494 | &(ret->defaultopentag), |
495 | &(ret->defaultopentag_len), |
496 | defaultopentag, defaultopentag_len, |
497 | copy_tag)) { |
498 | GRN_FREE(ret); |
499 | GRN_API_RETURN(NULL); |
500 | } |
501 | |
502 | if (grn_snip_set_default_tag(ctx, |
503 | &(ret->defaultclosetag), |
504 | &(ret->defaultclosetag_len), |
505 | defaultclosetag, defaultclosetag_len, |
506 | copy_tag)) { |
507 | if (copy_tag && ret->defaultopentag) { |
508 | GRN_FREE((void *)ret->defaultopentag); |
509 | } |
510 | GRN_FREE(ret); |
511 | GRN_API_RETURN(NULL); |
512 | } |
513 | |
514 | ret->cond_len = 0; |
515 | ret->mapping = mapping; |
516 | ret->nstr = NULL; |
517 | ret->tag_count = 0; |
518 | ret->snip_count = 0; |
519 | if (ret->flags & GRN_SNIP_NORMALIZE) { |
520 | ret->normalizer = GRN_NORMALIZER_AUTO; |
521 | } else { |
522 | ret->normalizer = NULL; |
523 | } |
524 | |
525 | GRN_DB_OBJ_SET_TYPE(ret, GRN_SNIP); |
526 | { |
527 | grn_obj *db; |
528 | grn_id id; |
529 | db = grn_ctx_db(ctx); |
530 | id = grn_obj_register(ctx, db, NULL, 0); |
531 | DB_OBJ(ret)->header.domain = GRN_ID_NIL; |
532 | DB_OBJ(ret)->range = GRN_ID_NIL; |
533 | grn_db_obj_init(ctx, db, id, DB_OBJ(ret)); |
534 | } |
535 | |
536 | GRN_API_RETURN((grn_obj *)ret); |
537 | } |
538 | |
539 | static grn_rc |
540 | exec_clean(grn_ctx *ctx, grn_snip *snip) |
541 | { |
542 | snip_cond *cond, *cond_end; |
543 | if (snip->nstr) { |
544 | grn_obj_close(ctx, snip->nstr); |
545 | snip->nstr = NULL; |
546 | } |
547 | snip->tag_count = 0; |
548 | snip->snip_count = 0; |
549 | for (cond = snip->cond, cond_end = cond + snip->cond_len; |
550 | cond < cond_end; cond++) { |
551 | grn_snip_cond_reinit(cond); |
552 | } |
553 | return GRN_SUCCESS; |
554 | } |
555 | |
556 | grn_rc |
557 | grn_snip_close(grn_ctx *ctx, grn_snip *snip) |
558 | { |
559 | snip_cond *cond, *cond_end; |
560 | if (!snip) { return GRN_INVALID_ARGUMENT; } |
561 | GRN_API_ENTER; |
562 | if (snip->flags & GRN_SNIP_COPY_TAG) { |
563 | int i; |
564 | snip_cond *sc; |
565 | const char *dot = snip->defaultopentag, *dct = snip->defaultclosetag; |
566 | for (i = snip->cond_len, sc = snip->cond; i; i--, sc++) { |
567 | if (sc->opentag != dot) { GRN_FREE((void *)sc->opentag); } |
568 | if (sc->closetag != dct) { GRN_FREE((void *)sc->closetag); } |
569 | } |
570 | if (dot) { GRN_FREE((void *)dot); } |
571 | if (dct) { GRN_FREE((void *)dct); } |
572 | } |
573 | if (snip->nstr) { |
574 | grn_obj_close(ctx, snip->nstr); |
575 | } |
576 | for (cond = snip->cond, cond_end = cond + snip->cond_len; |
577 | cond < cond_end; cond++) { |
578 | grn_snip_cond_close(ctx, cond); |
579 | } |
580 | GRN_FREE(snip); |
581 | GRN_API_RETURN(GRN_SUCCESS); |
582 | } |
583 | |
/*
 * Searches `string` for every condition of the snippet object and lays out
 * up to snip->max_results snippets of at most snip->width bytes.
 *
 * snip           - snippet object created by grn_snip_open.
 * string         - text to search. Only the pointer is stored, so it must
 *                  stay valid until the results have been fetched.
 * string_len     - length of `string` in bytes.
 * nresults       - out: number of snippets found.
 * max_tagged_len - out: largest size computed for any snippet, counting
 *                  opening/closing tags, escaped text and one extra byte
 *                  (presumably for the terminating NUL written by
 *                  grn_snip_get_result).
 *
 * Returns GRN_INVALID_ARGUMENT when any pointer argument is NULL, otherwise
 * ctx->rc.
 *
 * NOTE(review): `string_len - snip_->width` below is unsigned and wraps when
 * the text is shorter than the width; the surrounding MIN() then picks the
 * other operand - confirm this is intended.
 */
grn_rc
grn_snip_exec(grn_ctx *ctx, grn_obj *snip, const char *string, unsigned int string_len,
              unsigned int *nresults, unsigned int *max_tagged_len)
{
  size_t i;
  grn_snip *snip_;
  int f = GRN_STR_WITH_CHECKS|GRN_STR_REMOVEBLANK;
  if (!snip || !string || !nresults || !max_tagged_len) {
    return GRN_INVALID_ARGUMENT;
  }
  GRN_API_ENTER;
  snip_ = (grn_snip *)snip;
  /* drop everything left over from a previous execution */
  exec_clean(ctx, snip_);
  *nresults = 0;
  snip_->nstr = grn_string_open(ctx, string, string_len, snip_->normalizer, f);
  if (!snip_->nstr) {
    exec_clean(ctx, snip_);
    GRN_LOG(ctx, GRN_LOG_ALERT, "grn_string_open on grn_snip_exec failed !" );
    GRN_API_RETURN(ctx->rc);
  }
  /* position every condition on its first hit (or mark it stopped) */
  for (i = 0; i < snip_->cond_len; i++) {
    grn_bm_tunedbm(ctx, snip_->cond + i, snip_->nstr, snip_->flags);
  }

  {
    /* write cursors into the tag and snippet result arrays */
    _snip_tag_result *tag_result = snip_->tag_result;
    _snip_result *snip_result = snip_->snip_result;
    size_t last_end_offset = 0, last_last_end_offset = 0;
    /* number of conditions that have not been tagged in any snippet yet */
    unsigned int unfound_cond_count = snip_->cond_len;

    *max_tagged_len = 0;
    /* one iteration per snippet */
    while (1) {
      size_t tagged_len = 0, last_tag_end = 0;
      int_least8_t all_stop = 1, found_cond = 0;
      snip_result->tag_count = 0;

      /* one iteration per hit considered for the current snippet */
      while (1) {
        size_t min_start_offset = (size_t) -1;
        size_t max_end_offset = 0;
        snip_cond *cond = NULL;

        /* get the condition which has the minimum offset and is not stopped;
           on equal start offsets the one with the larger end offset wins */
        for (i = 0; i < snip_->cond_len; i++) {
          if (snip_->cond[i].stopflag == SNIPCOND_NONSTOP &&
              (min_start_offset > snip_->cond[i].start_offset ||
               (min_start_offset == snip_->cond[i].start_offset &&
                max_end_offset < snip_->cond[i].end_offset))) {
            min_start_offset = snip_->cond[i].start_offset;
            max_end_offset = snip_->cond[i].end_offset;
            cond = &snip_->cond[i];
          }
        }
        if (!cond) {
          break;
        }
        /* check whether the condition is the first condition in the snippet */
        if (snip_result->tag_count == 0) {
          /* skip condition if the number of rest snippet field is smaller than */
          /* the number of unfound keywords. */
          if (snip_->max_results - *nresults <= unfound_cond_count && cond->count > 0) {
            int_least8_t exclude_other_cond = 1;
            /* keep this hit anyway if a not-yet-tagged keyword ends inside
               the window that would start here */
            for (i = 0; i < snip_->cond_len; i++) {
              if ((snip_->cond + i) != cond
                  && snip_->cond[i].end_offset <= cond->start_offset + snip_->width
                  && snip_->cond[i].count == 0) {
                exclude_other_cond = 0;
              }
            }
            if (exclude_other_cond) {
              grn_bm_tunedbm(ctx, cond, snip_->nstr, snip_->flags);
              continue;
            }
          }
          snip_result->start_offset = cond->start_offset;
          snip_result->first_tag_result_idx = snip_->tag_count;
        } else {
          /* the hit starts beyond the current window: the snippet is complete */
          if (cond->start_offset >= snip_result->start_offset + snip_->width) {
            break;
          }
          /* check nesting to make valid HTML */
          /* ToDo: allow <test><te>te</te><st>st</st></test> */
          if (cond->start_offset < last_tag_end) {
            grn_bm_tunedbm(ctx, cond, snip_->nstr, snip_->flags);
            continue;
          }
        }
        if (cond->end_offset > snip_result->start_offset + snip_->width) {
          /* If a keyword gets across a snippet, */
          /* it was skipped and never to be tagged. */
          cond->stopflag = SNIPCOND_ACROSS;
          grn_bm_tunedbm(ctx, cond, snip_->nstr, snip_->flags);
        } else {
          found_cond = 1;
          if (cond->count == 0) {
            unfound_cond_count--;
          }
          cond->count++;
          last_end_offset = cond->end_offset;

          /* record the tag and advance the tag cursor */
          tag_result->cond = cond;
          tag_result->start_offset = cond->start_offset;
          tag_result->end_offset = last_tag_end = cond->end_offset;

          snip_result->tag_count++;
          tag_result++;
          tagged_len += cond->opentag_len + cond->closetag_len;
          if (++snip_->tag_count >= MAX_SNIP_TAG_COUNT) {
            break;
          }
          grn_bm_tunedbm(ctx, cond, snip_->nstr, snip_->flags);
        }
      }
      if (!found_cond) {
        break;
      }
      /* place the window around the tagged range: start at 0 when the range
         fits from the beginning, otherwise at half of
         (first start + last end - width), capped by string_len - width and
         never before the end of the previous snippet */
      if (snip_result->start_offset + last_end_offset < snip_->width) {
        snip_result->start_offset = 0;
      } else {
        snip_result->start_offset =
          MAX(MIN
              ((snip_result->start_offset + last_end_offset - snip_->width) / 2,
               string_len - snip_->width), last_last_end_offset);
      }
      /* move the start forward to a character boundary */
      snip_result->start_offset =
        grn_snip_find_firstbyte(string, snip_->encoding, snip_result->start_offset, 1);

      snip_result->end_offset = snip_result->start_offset + snip_->width;
      if (snip_result->end_offset < string_len) {
        /* move the end backward to a character boundary (-1 wraps as size_t) */
        snip_result->end_offset =
          grn_snip_find_firstbyte(string, snip_->encoding, snip_result->end_offset, -1);
      } else {
        snip_result->end_offset = string_len;
      }
      last_last_end_offset = snip_result->end_offset;

      /* (grn_snip_mapping *) -1 selects the escaped length computed by
         count_mapped_chars; the + 1 adds one byte on top of the text */
      if (snip_->mapping == (grn_snip_mapping *) -1) {
        tagged_len +=
          count_mapped_chars(&string[snip_result->start_offset],
                             &string[snip_result->end_offset]) + 1;
      } else {
        tagged_len += snip_result->end_offset - snip_result->start_offset + 1;
      }

      *max_tagged_len = MAX(*max_tagged_len, tagged_len);

      snip_result->last_tag_result_idx = snip_->tag_count - 1;
      (*nresults)++;
      snip_result++;

      if (*nresults == snip_->max_results || snip_->tag_count == MAX_SNIP_TAG_COUNT) {
        break;
      }
      /* conditions that are not exhausted (including SNIPCOND_ACROSS ones)
         take part again in the next snippet */
      for (i = 0; i < snip_->cond_len; i++) {
        if (snip_->cond[i].stopflag != SNIPCOND_STOP) {
          all_stop = 0;
          snip_->cond[i].stopflag = SNIPCOND_NONSTOP;
        }
      }
      if (all_stop) {
        break;
      }
    }
  }
  snip_->snip_count = *nresults;
  /* the caller's buffer is referenced, not copied */
  snip_->string = string;

  snip_->max_tagged_len = *max_tagged_len;

  GRN_API_RETURN(ctx->rc);
}
754 | |
755 | grn_rc |
756 | grn_snip_get_result(grn_ctx *ctx, grn_obj *snip, const unsigned int index, char *result, unsigned int *result_len) |
757 | { |
758 | char *p; |
759 | size_t i, j, k; |
760 | _snip_result *sres; |
761 | grn_snip *snip_; |
762 | |
763 | snip_ = (grn_snip *)snip; |
764 | if (snip_->snip_count <= index || !snip_->nstr) { |
765 | return GRN_INVALID_ARGUMENT; |
766 | } |
767 | |
768 | GRN_ASSERT(snip_->snip_count != 0 && snip_->tag_count != 0); |
769 | |
770 | GRN_API_ENTER; |
771 | sres = &snip_->snip_result[index]; |
772 | j = sres->first_tag_result_idx; |
773 | for (p = result, i = sres->start_offset; i < sres->end_offset; i++) { |
774 | for (; j <= sres->last_tag_result_idx && snip_->tag_result[j].start_offset == i; j++) { |
775 | if (snip_->tag_result[j].end_offset > sres->end_offset) { |
776 | continue; |
777 | } |
778 | grn_memcpy(p, |
779 | snip_->tag_result[j].cond->opentag, |
780 | snip_->tag_result[j].cond->opentag_len); |
781 | p += snip_->tag_result[j].cond->opentag_len; |
782 | } |
783 | |
784 | if (snip_->mapping == GRN_SNIP_MAPPING_HTML_ESCAPE) { |
785 | switch (snip_->string[i]) { |
786 | case '<': |
787 | *p++ = '&'; |
788 | *p++ = 'l'; |
789 | *p++ = 't'; |
790 | *p++ = ';'; |
791 | break; |
792 | case '>': |
793 | *p++ = '&'; |
794 | *p++ = 'g'; |
795 | *p++ = 't'; |
796 | *p++ = ';'; |
797 | break; |
798 | case '&': |
799 | *p++ = '&'; |
800 | *p++ = 'a'; |
801 | *p++ = 'm'; |
802 | *p++ = 'p'; |
803 | *p++ = ';'; |
804 | break; |
805 | case '"': |
806 | *p++ = '&'; |
807 | *p++ = 'q'; |
808 | *p++ = 'u'; |
809 | *p++ = 'o'; |
810 | *p++ = 't'; |
811 | *p++ = ';'; |
812 | break; |
813 | default: |
814 | *p++ = snip_->string[i]; |
815 | break; |
816 | } |
817 | } else { |
818 | *p++ = snip_->string[i]; |
819 | } |
820 | |
821 | for (k = sres->last_tag_result_idx; |
822 | snip_->tag_result[k].end_offset <= sres->end_offset; k--) { |
823 | /* TODO: avoid all loop */ |
824 | if (snip_->tag_result[k].end_offset == i + 1) { |
825 | grn_memcpy(p, |
826 | snip_->tag_result[k].cond->closetag, |
827 | snip_->tag_result[k].cond->closetag_len); |
828 | p += snip_->tag_result[k].cond->closetag_len; |
829 | } |
830 | if (k <= sres->first_tag_result_idx) { |
831 | break; |
832 | } |
833 | }; |
834 | } |
835 | *p = '\0'; |
836 | |
837 | if(result_len) { *result_len = (unsigned int)(p - result); } |
838 | GRN_ASSERT((unsigned int)(p - result) <= snip_->max_tagged_len); |
839 | |
840 | GRN_API_RETURN(ctx->rc); |
841 | } |
842 | |