1 | /* Copyright (c) 2001, 2016, Oracle and/or its affiliates. All rights reserved. |
2 | |
3 | This program is free software; you can redistribute it and/or modify |
4 | it under the terms of the GNU General Public License as published by |
5 | the Free Software Foundation; version 2 of the License. |
6 | |
7 | This program is distributed in the hope that it will be useful, |
8 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
10 | GNU General Public License for more details. |
11 | |
12 | You should have received a copy of the GNU General Public License |
13 | along with this program; if not, write to the Free Software |
14 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ |
15 | |
16 | /* Written by Sergei A. Golubchik, who has a shared copyright to this code */ |
17 | |
18 | /* TODO: add caching - pre-read several index entries at once */ |
19 | |
20 | /* |
21 | Added optimization for full-text queries with plus-words. It was |
22 | implemented by sharing maximal document id (max_docid) variable |
23 | inside plus subtree. max_docid could be used by any word in plus |
24 | subtree, but it could be updated by plus-word only. |
25 | |
26 | Fulltext "smarter index merge" optimization assumes that rows |
27 | it gets are ordered by doc_id. That is not the case when we |
28 | search for a word with truncation operator. It may return |
29 | rows in random order. Thus we may not use "smarter index merge" |
30 | optimization with "trunc-words". |
31 | |
32 | The idea is: there is no need to search for docid smaller than |
33 | biggest docid inside current plus subtree or any upper plus subtree. |
34 | |
35 | Examples: |
36 | +word1 word2 |
37 | share same max_docid |
38 | max_docid updated by word1 |
39 | +word1 +(word2 word3) |
40 | share same max_docid |
41 | max_docid updated by word1 |
42 | +(word1 -word2) +(+word3 word4) |
43 | share same max_docid |
44 | max_docid updated by word3 |
45 | +word1 word2 (+word3 word4 (+word5 word6)) |
46 | three subexpressions (including the top-level one), |
47 | every one has its own max_docid, updated by its plus word. |
48 | but for the search word6 uses |
49 | max(word1.max_docid, word3.max_docid, word5.max_docid), |
50 | while word4 uses, accordingly, |
51 | max(word1.max_docid, word3.max_docid). |
52 | */ |
53 | |
54 | #define FT_CORE |
55 | #include "ftdefs.h" |
56 | |
57 | /* search with boolean queries */ |
58 | |
59 | static double _wghts[11]= |
60 | { |
61 | 0.131687242798354, |
62 | 0.197530864197531, |
63 | 0.296296296296296, |
64 | 0.444444444444444, |
65 | 0.666666666666667, |
66 | 1.000000000000000, |
67 | 1.500000000000000, |
68 | 2.250000000000000, |
69 | 3.375000000000000, |
70 | 5.062500000000000, |
71 | 7.593750000000000}; |
72 | static double *wghts=_wghts+5; /* wghts[i] = 1.5**i */ |
73 | |
74 | static double _nwghts[11]= |
75 | { |
76 | -0.065843621399177, |
77 | -0.098765432098766, |
78 | -0.148148148148148, |
79 | -0.222222222222222, |
80 | -0.333333333333334, |
81 | -0.500000000000000, |
82 | -0.750000000000000, |
83 | -1.125000000000000, |
84 | -1.687500000000000, |
85 | -2.531250000000000, |
86 | -3.796875000000000}; |
87 | static double *nwghts=_nwghts+5; /* nwghts[i] = -0.5*1.5**i */ |
88 | |
89 | #define FTB_FLAG_TRUNC 1 |
90 | /* At most one of the following flags can be set */ |
91 | #define FTB_FLAG_YES 2 |
92 | #define FTB_FLAG_NO 4 |
93 | #define FTB_FLAG_WONLY 8 |
94 | |
95 | typedef struct st_ftb_expr FTB_EXPR; |
96 | struct st_ftb_expr |
97 | { |
98 | FTB_EXPR *up; |
99 | uint flags; |
100 | /* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */ |
101 | my_off_t docid[2]; |
102 | my_off_t max_docid; |
103 | float weight; |
104 | float cur_weight; |
105 | LIST *phrase; /* phrase words */ |
106 | LIST *document; /* for phrase search */ |
107 | uint yesses; /* number of "yes" words matched */ |
108 | uint nos; /* number of "no" words matched */ |
109 | uint ythresh; /* number of "yes" words in expr */ |
110 | uint yweaks; /* number of "yes" words for scan only */ |
111 | }; |
112 | |
113 | typedef struct st_ftb_word |
114 | { |
115 | FTB_EXPR *up; |
116 | uint flags; |
117 | /* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */ |
118 | my_off_t docid[2]; /* for index search and for scan */ |
119 | my_off_t key_root; |
120 | FTB_EXPR *max_docid_expr; |
121 | MI_KEYDEF *keyinfo; |
122 | struct st_ftb_word *prev; |
123 | float weight; |
124 | uint ndepth; |
125 | uint len; |
126 | uchar off; |
127 | uchar word[1]; |
128 | } FTB_WORD; |
129 | |
130 | typedef struct st_ft_info |
131 | { |
132 | struct _ft_vft *please; |
133 | MI_INFO *info; |
134 | CHARSET_INFO *charset; |
135 | FTB_EXPR *root; |
136 | FTB_WORD **list; |
137 | FTB_WORD *last_word; |
138 | MEM_ROOT mem_root; |
139 | QUEUE queue; |
140 | TREE no_dupes; |
141 | my_off_t lastpos; |
142 | uint keynr; |
143 | uchar with_scan; |
144 | enum { UNINITIALIZED, READY, INDEX_SEARCH, INDEX_DONE } state; |
145 | } FTB; |
146 | |
147 | static int FTB_WORD_cmp(my_off_t *v, FTB_WORD *a, FTB_WORD *b) |
148 | { |
149 | int i; |
150 | |
151 | /* if a==curdoc, take it as a < b */ |
152 | if (v && a->docid[0] == *v) |
153 | return -1; |
154 | |
155 | /* ORDER BY docid, ndepth DESC */ |
156 | i=CMP_NUM(a->docid[0], b->docid[0]); |
157 | if (!i) |
158 | i=CMP_NUM(b->ndepth,a->ndepth); |
159 | return i; |
160 | } |
161 | |
162 | static int FTB_WORD_cmp_list(CHARSET_INFO *cs, FTB_WORD **a, FTB_WORD **b) |
163 | { |
164 | /* ORDER BY word, ndepth */ |
165 | int i= ha_compare_text(cs, (uchar*) (*a)->word + 1, (*a)->len - 1, |
166 | (uchar*) (*b)->word + 1, (*b)->len - 1, 0); |
167 | if (!i) |
168 | i= CMP_NUM((*a)->ndepth, (*b)->ndepth); |
169 | return i; |
170 | } |
171 | |
172 | |
173 | typedef struct st_my_ftb_param |
174 | { |
175 | FTB *ftb; |
176 | FTB_EXPR *ftbe; |
177 | uchar *up_quot; |
178 | uint depth; |
179 | } MY_FTB_PARAM; |
180 | |
181 | |
182 | static int ftb_query_add_word(MYSQL_FTPARSER_PARAM *param, |
183 | const char *word, int word_len, |
184 | MYSQL_FTPARSER_BOOLEAN_INFO *info) |
185 | { |
186 | MY_FTB_PARAM *ftb_param= param->mysql_ftparam; |
187 | FTB_WORD *ftbw; |
188 | FTB_EXPR *ftbe, *tmp_expr; |
189 | FT_WORD *phrase_word; |
190 | LIST *tmp_element; |
191 | int r= info->weight_adjust; |
192 | float weight= (float) |
193 | (info->wasign ? nwghts : wghts)[(r>5)?5:((r<-5)?-5:r)]; |
194 | |
195 | switch (info->type) { |
196 | case FT_TOKEN_WORD: |
197 | ftbw= (FTB_WORD *)alloc_root(&ftb_param->ftb->mem_root, |
198 | sizeof(FTB_WORD) + HA_MAX_KEY_BUFF); |
199 | ftbw->len= word_len + 1; |
200 | ftbw->flags= 0; |
201 | ftbw->off= 0; |
202 | if (info->yesno > 0) ftbw->flags|= FTB_FLAG_YES; |
203 | if (info->yesno < 0) ftbw->flags|= FTB_FLAG_NO; |
204 | if (info->trunc) ftbw->flags|= FTB_FLAG_TRUNC; |
205 | ftbw->weight= weight; |
206 | ftbw->up= ftb_param->ftbe; |
207 | ftbw->docid[0]= ftbw->docid[1]= HA_OFFSET_ERROR; |
208 | ftbw->ndepth= (info->yesno < 0) + ftb_param->depth; |
209 | ftbw->key_root= HA_OFFSET_ERROR; |
210 | memcpy(ftbw->word + 1, word, word_len); |
211 | ftbw->word[0]= word_len; |
212 | if (info->yesno > 0) ftbw->up->ythresh++; |
213 | ftb_param->ftb->queue.max_elements++; |
214 | ftbw->prev= ftb_param->ftb->last_word; |
215 | ftb_param->ftb->last_word= ftbw; |
216 | ftb_param->ftb->with_scan|= (info->trunc & FTB_FLAG_TRUNC); |
217 | for (tmp_expr= ftb_param->ftbe; tmp_expr->up; tmp_expr= tmp_expr->up) |
218 | if (! (tmp_expr->flags & FTB_FLAG_YES)) |
219 | break; |
220 | ftbw->max_docid_expr= tmp_expr; |
221 | /* fall through */ |
222 | case FT_TOKEN_STOPWORD: |
223 | if (! ftb_param->up_quot) break; |
224 | phrase_word= (FT_WORD *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD)); |
225 | tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST)); |
226 | phrase_word->pos= (uchar*) word; |
227 | phrase_word->len= word_len; |
228 | tmp_element->data= (void *)phrase_word; |
229 | ftb_param->ftbe->phrase= list_add(ftb_param->ftbe->phrase, tmp_element); |
230 | /* Allocate document list at this point. |
231 | It allows to avoid huge amount of allocs/frees for each row.*/ |
232 | tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST)); |
233 | tmp_element->data= alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD)); |
234 | ftb_param->ftbe->document= |
235 | list_add(ftb_param->ftbe->document, tmp_element); |
236 | break; |
237 | case FT_TOKEN_LEFT_PAREN: |
238 | ftbe=(FTB_EXPR *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FTB_EXPR)); |
239 | ftbe->flags= 0; |
240 | if (info->yesno > 0) ftbe->flags|= FTB_FLAG_YES; |
241 | if (info->yesno < 0) ftbe->flags|= FTB_FLAG_NO; |
242 | ftbe->weight= weight; |
243 | ftbe->up= ftb_param->ftbe; |
244 | ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0; |
245 | ftbe->docid[0]= ftbe->docid[1]= HA_OFFSET_ERROR; |
246 | ftbe->phrase= NULL; |
247 | ftbe->document= 0; |
248 | if (info->quot) ftb_param->ftb->with_scan|= 2; |
249 | if (info->yesno > 0) ftbe->up->ythresh++; |
250 | ftb_param->ftbe= ftbe; |
251 | ftb_param->depth++; |
252 | ftb_param->up_quot= (uchar*) info->quot; |
253 | break; |
254 | case FT_TOKEN_RIGHT_PAREN: |
255 | if (ftb_param->ftbe->document) |
256 | { |
257 | /* Circuit document list */ |
258 | for (tmp_element= ftb_param->ftbe->document; |
259 | tmp_element->next; tmp_element= tmp_element->next) /* no-op */; |
260 | tmp_element->next= ftb_param->ftbe->document; |
261 | ftb_param->ftbe->document->prev= tmp_element; |
262 | } |
263 | info->quot= 0; |
264 | if (ftb_param->ftbe->up) |
265 | { |
266 | DBUG_ASSERT(ftb_param->depth); |
267 | ftb_param->ftbe= ftb_param->ftbe->up; |
268 | ftb_param->depth--; |
269 | ftb_param->up_quot= 0; |
270 | } |
271 | break; |
272 | case FT_TOKEN_EOF: |
273 | default: |
274 | break; |
275 | } |
276 | return(0); |
277 | } |
278 | |
279 | |
280 | static int ftb_parse_query_internal(MYSQL_FTPARSER_PARAM *param, |
281 | const char *query, int len) |
282 | { |
283 | MY_FTB_PARAM *ftb_param= param->mysql_ftparam; |
284 | MYSQL_FTPARSER_BOOLEAN_INFO info; |
285 | CHARSET_INFO *cs= ftb_param->ftb->charset; |
286 | const uchar **start= (const uchar**) &query; |
287 | uchar *end= (uchar*) query + len; |
288 | FT_WORD w; |
289 | |
290 | info.prev= ' '; |
291 | info.quot= 0; |
292 | while (ft_get_word(cs, start, end, &w, &info)) |
293 | param->mysql_add_word(param, (char*) w.pos, (int)w.len, &info); |
294 | return(0); |
295 | } |
296 | |
297 | |
298 | static int _ftb_parse_query(FTB *ftb, uchar *query, uint len, |
299 | struct st_mysql_ftparser *parser) |
300 | { |
301 | MYSQL_FTPARSER_PARAM *param; |
302 | MY_FTB_PARAM ftb_param; |
303 | DBUG_ENTER("_ftb_parse_query" ); |
304 | DBUG_ASSERT(parser); |
305 | |
306 | if (ftb->state != UNINITIALIZED) |
307 | DBUG_RETURN(0); |
308 | if (! (param= ftparser_call_initializer(ftb->info, ftb->keynr, 0))) |
309 | DBUG_RETURN(1); |
310 | |
311 | ftb_param.ftb= ftb; |
312 | ftb_param.depth= 0; |
313 | ftb_param.ftbe= ftb->root; |
314 | ftb_param.up_quot= 0; |
315 | |
316 | param->mysql_parse= ftb_parse_query_internal; |
317 | param->mysql_add_word= ftb_query_add_word; |
318 | param->mysql_ftparam= (void *)&ftb_param; |
319 | param->cs= ftb->charset; |
320 | param->doc= (char*) query; |
321 | param->length= len; |
322 | param->flags= 0; |
323 | param->mode= MYSQL_FTPARSER_FULL_BOOLEAN_INFO; |
324 | DBUG_RETURN(parser->parse(param)); |
325 | } |
326 | |
327 | |
328 | static int _ftb_no_dupes_cmp(void* not_used __attribute__((unused)), |
329 | const void *a,const void *b) |
330 | { |
331 | return CMP_NUM((*((my_off_t*)a)), (*((my_off_t*)b))); |
332 | } |
333 | |
334 | /* |
335 | When performing prefix search (a word with truncation operator), we |
336 | must preserve original prefix to ensure that characters which may be |
337 | expanded/contracted do not break the prefix. This is done by storing |
338 | newly found key immediatly after the original word in ftbw->word |
339 | buffer. |
340 | |
341 | ftbw->word= LENGTH WORD [ LENGTH1 WORD1 ] WEIGHT REFERENCE |
342 | LENGTH - 1 byte, length of the WORD |
343 | WORD - LENGTH bytes, the word itself |
344 | LENGTH1 - 1 byte, length of the WORD1, present in case of prefix search |
345 | WORD1 - LENGTH bytes, the word itself, present in case of prefix search |
346 | WEIGHT - 4 bytes (HA_FT_WLEN), either weight or number of subkeys |
347 | REFERENCE - rec_reflength bytes, pointer to the record |
348 | |
349 | returns 1 if the search was finished (must-word wasn't found) |
350 | */ |
351 | static int _ft2_search_no_lock(FTB *ftb, FTB_WORD *ftbw, my_bool init_search) |
352 | { |
353 | int r; |
354 | int subkeys=1; |
355 | my_bool can_go_down; |
356 | MI_INFO *info=ftb->info; |
357 | uint UNINIT_VAR(off), = HA_FT_WLEN + info->s->rec_reflength; |
358 | uchar *lastkey_buf=ftbw->word+ftbw->off; |
359 | |
360 | if (ftbw->flags & FTB_FLAG_TRUNC) |
361 | lastkey_buf+=ftbw->len; |
362 | |
363 | if (init_search) |
364 | { |
365 | ftbw->key_root=info->s->state.key_root[ftb->keynr]; |
366 | ftbw->keyinfo=info->s->keyinfo+ftb->keynr; |
367 | |
368 | r=_mi_search(info, ftbw->keyinfo, (uchar*) ftbw->word, ftbw->len, |
369 | SEARCH_FIND | SEARCH_BIGGER, ftbw->key_root); |
370 | } |
371 | else |
372 | { |
373 | uint sflag= SEARCH_BIGGER; |
374 | my_off_t max_docid=0; |
375 | FTB_EXPR *tmp; |
376 | |
377 | for (tmp= ftbw->max_docid_expr; tmp; tmp= tmp->up) |
378 | set_if_bigger(max_docid, tmp->max_docid); |
379 | |
380 | if (ftbw->docid[0] < max_docid) |
381 | { |
382 | sflag|= SEARCH_SAME; |
383 | _mi_dpointer(info, (uchar*) (lastkey_buf + HA_FT_WLEN + |
384 | (ftbw->off ? 0 : lastkey_buf[0] + 1)), |
385 | max_docid); |
386 | } |
387 | r=_mi_search(info, ftbw->keyinfo, (uchar*) lastkey_buf, |
388 | USE_WHOLE_KEY, sflag, ftbw->key_root); |
389 | } |
390 | |
391 | can_go_down=(!ftbw->off && (init_search || (ftbw->flags & FTB_FLAG_TRUNC))); |
392 | /* Skip rows inserted by concurrent insert */ |
393 | while (!r) |
394 | { |
395 | if (can_go_down) |
396 | { |
397 | /* going down ? */ |
398 | off=info->lastkey_length-extra; |
399 | subkeys=ft_sintXkorr(info->lastkey+off); |
400 | } |
401 | if (subkeys<0 || info->lastpos < info->state->data_file_length) |
402 | break; |
403 | r= _mi_search_next(info, ftbw->keyinfo, info->lastkey, |
404 | info->lastkey_length, |
405 | SEARCH_BIGGER, ftbw->key_root); |
406 | } |
407 | |
408 | if (!r && !ftbw->off) |
409 | { |
410 | r= ha_compare_text(ftb->charset, |
411 | info->lastkey+1, |
412 | info->lastkey_length-extra-1, |
413 | (uchar*) ftbw->word+1, |
414 | ftbw->len-1, |
415 | (my_bool) (ftbw->flags & FTB_FLAG_TRUNC)); |
416 | } |
417 | |
418 | if (r) /* not found */ |
419 | { |
420 | if (!ftbw->off || !(ftbw->flags & FTB_FLAG_TRUNC)) |
421 | { |
422 | ftbw->docid[0]=HA_OFFSET_ERROR; |
423 | if ((ftbw->flags & FTB_FLAG_YES) && ftbw->up->up==0) |
424 | { |
425 | /* |
426 | This word MUST BE present in every document returned, |
427 | so we can stop the search right now |
428 | */ |
429 | ftb->state=INDEX_DONE; |
430 | return 1; /* search is done */ |
431 | } |
432 | else |
433 | return 0; |
434 | } |
435 | |
436 | /* |
437 | Going up to the first-level tree to continue search there. |
438 | Only done when performing prefix search. |
439 | |
440 | Key buffer data pointer as well as docid[0] may be smaller |
441 | than values we got while searching first-level tree. Thus |
442 | they must be restored to original values to avoid dead-loop, |
443 | when subsequent search for a bigger value eventually ends up |
444 | in this same second-level tree. |
445 | */ |
446 | _mi_dpointer(info, (uchar*) (lastkey_buf+HA_FT_WLEN), ftbw->key_root); |
447 | ftbw->docid[0]= ftbw->key_root; |
448 | ftbw->key_root=info->s->state.key_root[ftb->keynr]; |
449 | ftbw->keyinfo=info->s->keyinfo+ftb->keynr; |
450 | ftbw->off=0; |
451 | return _ft2_search_no_lock(ftb, ftbw, 0); |
452 | } |
453 | |
454 | /* matching key found */ |
455 | memcpy(lastkey_buf, info->lastkey, info->lastkey_length); |
456 | if (lastkey_buf == ftbw->word) |
457 | ftbw->len=info->lastkey_length-extra; |
458 | |
459 | /* going down ? */ |
460 | if (subkeys<0) |
461 | { |
462 | /* |
463 | yep, going down, to the second-level tree |
464 | TODO here: subkey-based optimization |
465 | */ |
466 | ftbw->off=off; |
467 | ftbw->key_root=info->lastpos; |
468 | ftbw->keyinfo=& info->s->ft2_keyinfo; |
469 | r=_mi_search_first(info, ftbw->keyinfo, ftbw->key_root); |
470 | DBUG_ASSERT(r==0); /* found something */ |
471 | memcpy(lastkey_buf+off, info->lastkey, info->lastkey_length); |
472 | } |
473 | ftbw->docid[0]=info->lastpos; |
474 | if (ftbw->flags & FTB_FLAG_YES && !(ftbw->flags & FTB_FLAG_TRUNC)) |
475 | ftbw->max_docid_expr->max_docid= info->lastpos; |
476 | return 0; |
477 | } |
478 | |
479 | static int _ft2_search(FTB *ftb, FTB_WORD *ftbw, my_bool init_search) |
480 | { |
481 | int r; |
482 | MYISAM_SHARE *share= ftb->info->s; |
483 | if (share->concurrent_insert) |
484 | mysql_rwlock_rdlock(&share->key_root_lock[ftb->keynr]); |
485 | r= _ft2_search_no_lock(ftb, ftbw, init_search); |
486 | if (share->concurrent_insert) |
487 | mysql_rwlock_unlock(&share->key_root_lock[ftb->keynr]); |
488 | return r; |
489 | } |
490 | |
491 | static void _ftb_init_index_search(FT_INFO *ftb) |
492 | { |
493 | int i; |
494 | FTB_WORD *ftbw; |
495 | |
496 | if (ftb->state == UNINITIALIZED || ftb->keynr == NO_SUCH_KEY) |
497 | return; |
498 | ftb->state=INDEX_SEARCH; |
499 | |
500 | for (i= queue_last_element(&ftb->queue); |
501 | i >= (int) queue_first_element(&ftb->queue); |
502 | i--) |
503 | { |
504 | ftbw=(FTB_WORD *)(queue_element(&ftb->queue, i)); |
505 | |
506 | if (ftbw->flags & FTB_FLAG_TRUNC) |
507 | { |
508 | /* |
509 | special treatment for truncation operator |
510 | 1. there are some (besides this) +words |
511 | | no need to search in the index, it can never ADD new rows |
512 | | to the result, and to remove half-matched rows we do scan anyway |
513 | 2. -trunc* |
514 | | same as 1. |
515 | 3. in 1 and 2, +/- need not be on the same expr. level, |
516 | but can be on any upper level, as in +word +(trunc1* trunc2*) |
517 | 4. otherwise |
518 | | We have to index-search for this prefix. |
519 | | It may cause duplicates, as in the index (sorted by <word,docid>) |
520 | | <aaaa,row1> |
521 | | <aabb,row2> |
522 | | <aacc,row1> |
523 | | Searching for "aa*" will find row1 twice... |
524 | */ |
525 | FTB_EXPR *ftbe; |
526 | for (ftbe=(FTB_EXPR*)ftbw; |
527 | ftbe->up && !(ftbe->up->flags & FTB_FLAG_TRUNC); |
528 | ftbe->up->flags|= FTB_FLAG_TRUNC, ftbe=ftbe->up) |
529 | { |
530 | if (ftbe->flags & FTB_FLAG_NO || /* 2 */ |
531 | ftbe->up->ythresh - ftbe->up->yweaks > |
532 | (uint) MY_TEST(ftbe->flags & FTB_FLAG_YES)) /* 1 */ |
533 | { |
534 | FTB_EXPR *top_ftbe=ftbe->up; |
535 | ftbw->docid[0]=HA_OFFSET_ERROR; |
536 | for (ftbe=(FTB_EXPR *)ftbw; |
537 | ftbe != top_ftbe && !(ftbe->flags & FTB_FLAG_NO); |
538 | ftbe=ftbe->up) |
539 | ftbe->up->yweaks++; |
540 | ftbe=0; |
541 | break; |
542 | } |
543 | } |
544 | if (!ftbe) |
545 | continue; |
546 | /* 4 */ |
547 | if (!is_tree_inited(& ftb->no_dupes)) |
548 | init_tree(& ftb->no_dupes,0,0,sizeof(my_off_t), |
549 | _ftb_no_dupes_cmp,0,0,MYF(0)); |
550 | else |
551 | reset_tree(& ftb->no_dupes); |
552 | } |
553 | |
554 | ftbw->off=0; /* in case of reinit */ |
555 | if (_ft2_search(ftb, ftbw, 1)) |
556 | return; |
557 | } |
558 | queue_fix(& ftb->queue); |
559 | } |
560 | |
561 | |
562 | FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, uchar *query, |
563 | uint query_len, CHARSET_INFO *cs) |
564 | { |
565 | FTB *ftb; |
566 | FTB_EXPR *ftbe; |
567 | FTB_WORD *ftbw; |
568 | |
569 | if (!(ftb=(FTB *)my_malloc(sizeof(FTB), MYF(MY_WME)))) |
570 | return 0; |
571 | ftb->please= (struct _ft_vft *) & _ft_vft_boolean; |
572 | ftb->state=UNINITIALIZED; |
573 | ftb->info=info; |
574 | ftb->keynr=keynr; |
575 | ftb->charset=cs; |
576 | DBUG_ASSERT(keynr==NO_SUCH_KEY || cs == info->s->keyinfo[keynr].seg->charset); |
577 | ftb->with_scan=0; |
578 | ftb->lastpos=HA_OFFSET_ERROR; |
579 | bzero(& ftb->no_dupes, sizeof(TREE)); |
580 | ftb->last_word= 0; |
581 | |
582 | init_alloc_root(&ftb->mem_root, "fulltext" , 1024, 1024, MYF(0)); |
583 | ftb->queue.max_elements= 0; |
584 | if (!(ftbe=(FTB_EXPR *)alloc_root(&ftb->mem_root, sizeof(FTB_EXPR)))) |
585 | goto err; |
586 | ftbe->weight=1; |
587 | ftbe->flags=FTB_FLAG_YES; |
588 | ftbe->nos=1; |
589 | ftbe->up=0; |
590 | ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0; |
591 | ftbe->docid[0]=ftbe->docid[1]=HA_OFFSET_ERROR; |
592 | ftbe->phrase= NULL; |
593 | ftbe->document= 0; |
594 | ftb->root=ftbe; |
595 | if (unlikely(_ftb_parse_query(ftb, query, query_len, |
596 | keynr == NO_SUCH_KEY ? &ft_default_parser : |
597 | info->s->keyinfo[keynr].parser))) |
598 | goto err; |
599 | /* |
600 | Hack: instead of init_queue, we'll use reinit queue to be able |
601 | to alloc queue with alloc_root() |
602 | */ |
603 | if (! (ftb->queue.root= (uchar **)alloc_root(&ftb->mem_root, |
604 | (ftb->queue.max_elements + 1) * |
605 | sizeof(void *)))) |
606 | goto err; |
607 | reinit_queue(&ftb->queue, ftb->queue.max_elements, 0, 0, |
608 | (int (*)(void*, uchar*, uchar*))FTB_WORD_cmp, 0, 0, 0); |
609 | for (ftbw= ftb->last_word; ftbw; ftbw= ftbw->prev) |
610 | queue_insert(&ftb->queue, (uchar *)ftbw); |
611 | ftb->list=(FTB_WORD **)alloc_root(&ftb->mem_root, |
612 | sizeof(FTB_WORD *)*ftb->queue.elements); |
613 | memcpy(ftb->list, &queue_top(&ftb->queue), sizeof(FTB_WORD *)*ftb->queue.elements); |
614 | my_qsort2(ftb->list, ftb->queue.elements, sizeof(FTB_WORD *), |
615 | (qsort2_cmp)FTB_WORD_cmp_list, (void*)ftb->charset); |
616 | if (ftb->queue.elements<2) ftb->with_scan &= ~FTB_FLAG_TRUNC; |
617 | ftb->state=READY; |
618 | return ftb; |
619 | err: |
620 | free_root(& ftb->mem_root, MYF(0)); |
621 | my_free(ftb); |
622 | return 0; |
623 | } |
624 | |
625 | |
626 | typedef struct st_my_ftb_phrase_param |
627 | { |
628 | LIST *phrase; |
629 | LIST *document; |
630 | CHARSET_INFO *cs; |
631 | uint phrase_length; |
632 | uint document_length; |
633 | uint match; |
634 | } MY_FTB_PHRASE_PARAM; |
635 | |
636 | |
637 | static int ftb_phrase_add_word(MYSQL_FTPARSER_PARAM *param, |
638 | const char *word, int word_len, |
639 | MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info __attribute__((unused))) |
640 | { |
641 | MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam; |
642 | FT_WORD *w= (FT_WORD *)phrase_param->document->data; |
643 | LIST *phrase, *document; |
644 | w->pos= (uchar*) word; |
645 | w->len= word_len; |
646 | phrase_param->document= phrase_param->document->prev; |
647 | if (phrase_param->phrase_length > phrase_param->document_length) |
648 | { |
649 | phrase_param->document_length++; |
650 | return 0; |
651 | } |
652 | /* TODO: rewrite phrase search to avoid |
653 | comparing the same word twice. */ |
654 | for (phrase= phrase_param->phrase, document= phrase_param->document->next; |
655 | phrase; phrase= phrase->next, document= document->next) |
656 | { |
657 | FT_WORD *phrase_word= (FT_WORD *)phrase->data; |
658 | FT_WORD *document_word= (FT_WORD *)document->data; |
659 | if (my_strnncoll(phrase_param->cs, (uchar*) phrase_word->pos, |
660 | phrase_word->len, |
661 | (uchar*) document_word->pos, document_word->len)) |
662 | return 0; |
663 | } |
664 | phrase_param->match++; |
665 | return 0; |
666 | } |
667 | |
668 | |
669 | static int ftb_check_phrase_internal(MYSQL_FTPARSER_PARAM *param, |
670 | const char *document, int len) |
671 | { |
672 | FT_WORD word; |
673 | MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam; |
674 | const uchar *docend= (uchar*) document + len; |
675 | while (ft_simple_get_word(phrase_param->cs, (uchar**) &document, docend, |
676 | &word, FALSE)) |
677 | { |
678 | param->mysql_add_word(param, (char*) word.pos, (int)word.len, 0); |
679 | if (phrase_param->match) |
680 | break; |
681 | } |
682 | return 0; |
683 | } |
684 | |
685 | |
686 | /* |
687 | Checks if given buffer matches phrase list. |
688 | |
689 | SYNOPSIS |
690 | _ftb_check_phrase() |
691 | s0 start of buffer |
692 | e0 end of buffer |
693 | phrase broken into list phrase |
694 | cs charset info |
695 | |
696 | RETURN VALUE |
697 | 1 is returned if phrase found, 0 else. |
698 | -1 is returned if error occurs. |
699 | */ |
700 | |
701 | static int _ftb_check_phrase(FTB *ftb, const uchar *document, uint len, |
702 | FTB_EXPR *ftbe, struct st_mysql_ftparser *parser) |
703 | { |
704 | MY_FTB_PHRASE_PARAM ftb_param; |
705 | MYSQL_FTPARSER_PARAM *param; |
706 | DBUG_ENTER("_ftb_check_phrase" ); |
707 | DBUG_ASSERT(parser); |
708 | |
709 | if (! (param= ftparser_call_initializer(ftb->info, ftb->keynr, 1))) |
710 | DBUG_RETURN(0); |
711 | |
712 | ftb_param.phrase= ftbe->phrase; |
713 | ftb_param.document= ftbe->document; |
714 | ftb_param.cs= ftb->charset; |
715 | ftb_param.phrase_length= list_length(ftbe->phrase); |
716 | ftb_param.document_length= 1; |
717 | ftb_param.match= 0; |
718 | |
719 | param->mysql_parse= ftb_check_phrase_internal; |
720 | param->mysql_add_word= ftb_phrase_add_word; |
721 | param->mysql_ftparam= (void *)&ftb_param; |
722 | param->cs= ftb->charset; |
723 | param->doc= (char *) document; |
724 | param->length= len; |
725 | param->flags= 0; |
726 | param->mode= MYSQL_FTPARSER_WITH_STOPWORDS; |
727 | if (unlikely(parser->parse(param))) |
728 | return -1; |
729 | DBUG_RETURN(ftb_param.match ? 1 : 0); |
730 | } |
731 | |
732 | |
733 | static int _ftb_climb_the_tree(FTB *ftb, FTB_WORD *ftbw, FT_SEG_ITERATOR *ftsi_orig) |
734 | { |
735 | FT_SEG_ITERATOR ftsi; |
736 | FTB_EXPR *ftbe; |
737 | float weight=ftbw->weight; |
738 | int yn_flag= ftbw->flags, ythresh, mode=(ftsi_orig != 0); |
739 | my_off_t curdoc=ftbw->docid[mode]; |
740 | struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ? |
741 | &ft_default_parser : |
742 | ftb->info->s->keyinfo[ftb->keynr].parser; |
743 | |
744 | for (ftbe=ftbw->up; ftbe; ftbe=ftbe->up) |
745 | { |
746 | ythresh = ftbe->ythresh - (mode ? 0 : ftbe->yweaks); |
747 | if (ftbe->docid[mode] != curdoc) |
748 | { |
749 | ftbe->cur_weight=0; |
750 | ftbe->yesses=ftbe->nos=0; |
751 | ftbe->docid[mode]=curdoc; |
752 | } |
753 | if (ftbe->nos) |
754 | break; |
755 | if (yn_flag & FTB_FLAG_YES) |
756 | { |
757 | weight /= ftbe->ythresh; |
758 | ftbe->cur_weight += weight; |
759 | if ((int) ++ftbe->yesses == ythresh) |
760 | { |
761 | yn_flag=ftbe->flags; |
762 | weight=ftbe->cur_weight*ftbe->weight; |
763 | if (mode && ftbe->phrase) |
764 | { |
765 | int found= 0; |
766 | |
767 | memcpy(&ftsi, ftsi_orig, sizeof(ftsi)); |
768 | while (_mi_ft_segiterator(&ftsi) && !found) |
769 | { |
770 | if (!ftsi.pos) |
771 | continue; |
772 | found= _ftb_check_phrase(ftb, ftsi.pos, ftsi.len, ftbe, parser); |
773 | if (unlikely(found < 0)) |
774 | return 1; |
775 | } |
776 | if (!found) |
777 | break; |
778 | } /* ftbe->quot */ |
779 | } |
780 | else |
781 | break; |
782 | } |
783 | else |
784 | if (yn_flag & FTB_FLAG_NO) |
785 | { |
786 | /* |
787 | NOTE: special sort function of queue assures that all |
788 | (yn_flag & FTB_FLAG_NO) != 0 |
789 | events for every particular subexpression will |
790 | "auto-magically" happen BEFORE all the |
791 | (yn_flag & FTB_FLAG_YES) != 0 events. So no |
792 | already matched expression can become not-matched again. |
793 | */ |
794 | ++ftbe->nos; |
795 | break; |
796 | } |
797 | else |
798 | { |
799 | if (ftbe->ythresh) |
800 | weight/=3; |
801 | ftbe->cur_weight += weight; |
802 | if ((int) ftbe->yesses < ythresh) |
803 | break; |
804 | if (!(yn_flag & FTB_FLAG_WONLY)) |
805 | yn_flag= ((int) ftbe->yesses++ == ythresh) ? ftbe->flags : FTB_FLAG_WONLY ; |
806 | weight*= ftbe->weight; |
807 | } |
808 | } |
809 | return 0; |
810 | } |
811 | |
812 | |
813 | int ft_boolean_read_next(FT_INFO *ftb, char *record) |
814 | { |
815 | FTB_EXPR *ftbe; |
816 | FTB_WORD *ftbw; |
817 | MI_INFO *info=ftb->info; |
818 | my_off_t curdoc; |
819 | |
820 | if (ftb->state != INDEX_SEARCH && ftb->state != INDEX_DONE) |
821 | return -1; |
822 | |
823 | /* black magic ON */ |
824 | if ((int) _mi_check_index(info, ftb->keynr) < 0) |
825 | return my_errno; |
826 | if (_mi_readinfo(info, F_RDLCK, 1)) |
827 | return my_errno; |
828 | /* black magic OFF */ |
829 | |
830 | if (!ftb->queue.elements) |
831 | return my_errno=HA_ERR_END_OF_FILE; |
832 | |
833 | /* Attention!!! Address of a local variable is used here! See err: label */ |
834 | ftb->queue.first_cmp_arg=(void *)&curdoc; |
835 | |
836 | while (ftb->state == INDEX_SEARCH && |
837 | (curdoc=((FTB_WORD *)queue_top(& ftb->queue))->docid[0]) != |
838 | HA_OFFSET_ERROR) |
839 | { |
840 | while (curdoc == (ftbw=(FTB_WORD *)queue_top(& ftb->queue))->docid[0]) |
841 | { |
842 | if (unlikely(_ftb_climb_the_tree(ftb, ftbw, 0))) |
843 | { |
844 | my_errno= HA_ERR_OUT_OF_MEM; |
845 | goto err; |
846 | } |
847 | |
848 | /* update queue */ |
849 | _ft2_search(ftb, ftbw, 0); |
850 | queue_replace_top(&ftb->queue); |
851 | } |
852 | |
853 | ftbe=ftb->root; |
854 | if (ftbe->docid[0]==curdoc && ftbe->cur_weight>0 && |
855 | ftbe->yesses>=(ftbe->ythresh-ftbe->yweaks) && !ftbe->nos) |
856 | { |
857 | /* curdoc matched ! */ |
858 | if (is_tree_inited(&ftb->no_dupes) && |
859 | tree_insert(&ftb->no_dupes, &curdoc, 0, |
860 | ftb->no_dupes.custom_arg)->count >1) |
861 | /* but it managed already to get past this line once */ |
862 | continue; |
863 | |
864 | info->lastpos=curdoc; |
865 | /* Clear all states, except that the table was updated */ |
866 | info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); |
867 | |
868 | if (!(*info->read_record)(info,curdoc, (uchar*) record)) |
869 | { |
870 | info->update|= HA_STATE_AKTIV; /* Record is read */ |
871 | if (ftb->with_scan && |
872 | ft_boolean_find_relevance(ftb,(uchar*) record,0)==0) |
873 | continue; /* no match */ |
874 | my_errno=0; |
875 | goto err; |
876 | } |
877 | goto err; |
878 | } |
879 | } |
880 | ftb->state=INDEX_DONE; |
881 | my_errno=HA_ERR_END_OF_FILE; |
882 | err: |
883 | ftb->queue.first_cmp_arg=(void *)0; |
884 | return my_errno; |
885 | } |
886 | |
887 | |
888 | typedef struct st_my_ftb_find_param |
889 | { |
890 | FT_INFO *ftb; |
891 | FT_SEG_ITERATOR *ftsi; |
892 | } MY_FTB_FIND_PARAM; |
893 | |
894 | |
895 | static int ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM *param, |
896 | const char *word, int len, |
897 | MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info __attribute__((unused))) |
898 | { |
899 | MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam; |
900 | FT_INFO *ftb= ftb_param->ftb; |
901 | FTB_WORD *ftbw; |
902 | int a, b, c; |
903 | /* |
904 | Find right-most element in the array of query words matching this |
905 | word from a document. |
906 | */ |
907 | for (a= 0, b= ftb->queue.elements, c= (a+b)/2; b-a>1; c= (a+b)/2) |
908 | { |
909 | ftbw= ftb->list[c]; |
910 | if (ha_compare_text(ftb->charset, (uchar*)word, len, |
911 | (uchar*)ftbw->word+1, ftbw->len-1, |
912 | (my_bool) (ftbw->flags & FTB_FLAG_TRUNC)) < 0) |
913 | b= c; |
914 | else |
915 | a= c; |
916 | } |
917 | /* |
918 | If there were no words with truncation operator, we iterate to the |
919 | beginning of an array until array element is equal to the word from |
920 | a document. This is done mainly because the same word may be |
921 | mentioned twice (or more) in the query. |
922 | |
923 | In case query has words with truncation operator we must iterate |
924 | to the beginning of the array. There may be non-matching query words |
925 | between matching word with truncation operator and the right-most |
926 | matching element. E.g., if we're looking for 'aaa15' in an array of |
927 | 'aaa1* aaa14 aaa15 aaa16'. |
928 | |
929 | Worse of that there still may be match even if the binary search |
930 | above didn't find matching element. E.g., if we're looking for |
931 | 'aaa15' in an array of 'aaa1* aaa14 aaa16'. The binary search will |
932 | stop at 'aaa16'. |
933 | */ |
934 | for (; c >= 0; c--) |
935 | { |
936 | ftbw= ftb->list[c]; |
937 | if (ha_compare_text(ftb->charset, (uchar*)word, len, |
938 | (uchar*)ftbw->word + 1,ftbw->len - 1, |
939 | (my_bool)(ftbw->flags & FTB_FLAG_TRUNC))) |
940 | { |
941 | if (ftb->with_scan & FTB_FLAG_TRUNC) |
942 | continue; |
943 | else |
944 | break; |
945 | } |
946 | if (ftbw->docid[1] == ftb->info->lastpos) |
947 | continue; |
948 | ftbw->docid[1]= ftb->info->lastpos; |
949 | if (unlikely(_ftb_climb_the_tree(ftb, ftbw, ftb_param->ftsi))) |
950 | return 1; |
951 | } |
952 | return(0); |
953 | } |
954 | |
955 | |
956 | static int ftb_find_relevance_parse(MYSQL_FTPARSER_PARAM *param, |
957 | const char *doc, int len) |
958 | { |
959 | MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam; |
960 | FT_INFO *ftb= ftb_param->ftb; |
961 | uchar *end= (uchar*) doc + len; |
962 | FT_WORD w; |
963 | while (ft_simple_get_word(ftb->charset, (uchar**) &doc, end, &w, TRUE)) |
964 | param->mysql_add_word(param, (char*) w.pos, (int)w.len, 0); |
965 | return(0); |
966 | } |
967 | |
968 | |
969 | float ft_boolean_find_relevance(FT_INFO *ftb, uchar *record, uint length) |
970 | { |
971 | FTB_EXPR *ftbe; |
972 | FT_SEG_ITERATOR ftsi, ftsi2; |
973 | my_off_t docid=ftb->info->lastpos; |
974 | MY_FTB_FIND_PARAM ftb_param; |
975 | MYSQL_FTPARSER_PARAM *param; |
976 | struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ? |
977 | &ft_default_parser : |
978 | ftb->info->s->keyinfo[ftb->keynr].parser; |
979 | |
980 | if (docid == HA_OFFSET_ERROR) |
981 | return -2.0; |
982 | if (!ftb->queue.elements) |
983 | return 0; |
984 | if (! (param= ftparser_call_initializer(ftb->info, ftb->keynr, 0))) |
985 | return 0; |
986 | |
987 | if (ftb->state != INDEX_SEARCH && docid <= ftb->lastpos) |
988 | { |
989 | FTB_EXPR *x; |
990 | uint i; |
991 | |
992 | for (i=0; i < ftb->queue.elements; i++) |
993 | { |
994 | ftb->list[i]->docid[1]=HA_OFFSET_ERROR; |
995 | for (x=ftb->list[i]->up; x; x=x->up) |
996 | x->docid[1]=HA_OFFSET_ERROR; |
997 | } |
998 | } |
999 | |
1000 | ftb->lastpos=docid; |
1001 | |
1002 | if (ftb->keynr==NO_SUCH_KEY) |
1003 | _mi_ft_segiterator_dummy_init(record, length, &ftsi); |
1004 | else |
1005 | _mi_ft_segiterator_init(ftb->info, ftb->keynr, record, &ftsi); |
1006 | memcpy(&ftsi2, &ftsi, sizeof(ftsi)); |
1007 | |
1008 | ftb_param.ftb= ftb; |
1009 | ftb_param.ftsi= &ftsi2; |
1010 | param->mysql_parse= ftb_find_relevance_parse; |
1011 | param->mysql_add_word= ftb_find_relevance_add_word; |
1012 | param->mysql_ftparam= (void *)&ftb_param; |
1013 | param->flags= 0; |
1014 | param->cs= ftb->charset; |
1015 | param->mode= MYSQL_FTPARSER_SIMPLE_MODE; |
1016 | while (_mi_ft_segiterator(&ftsi)) |
1017 | { |
1018 | if (!ftsi.pos) |
1019 | continue; |
1020 | param->doc= (char *)ftsi.pos; |
1021 | param->length= ftsi.len; |
1022 | if (unlikely(parser->parse(param))) |
1023 | return 0; |
1024 | } |
1025 | ftbe=ftb->root; |
1026 | if (ftbe->docid[1]==docid && ftbe->cur_weight>0 && |
1027 | ftbe->yesses>=ftbe->ythresh && !ftbe->nos) |
1028 | { /* row matched ! */ |
1029 | return ftbe->cur_weight; |
1030 | } |
1031 | else |
1032 | { /* match failed ! */ |
1033 | return 0.0; |
1034 | } |
1035 | } |
1036 | |
1037 | |
1038 | void ft_boolean_close_search(FT_INFO *ftb) |
1039 | { |
1040 | if (is_tree_inited(& ftb->no_dupes)) |
1041 | { |
1042 | delete_tree(&ftb->no_dupes, 0); |
1043 | } |
1044 | free_root(& ftb->mem_root, MYF(0)); |
1045 | my_free(ftb); |
1046 | } |
1047 | |
1048 | |
1049 | float ft_boolean_get_relevance(FT_INFO *ftb) |
1050 | { |
1051 | return ftb->root->cur_weight; |
1052 | } |
1053 | |
1054 | |
1055 | void ft_boolean_reinit_search(FT_INFO *ftb) |
1056 | { |
1057 | _ftb_init_index_search(ftb); |
1058 | } |
1059 | |
1060 | |