1 | /* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB |
2 | |
3 | This program is free software; you can redistribute it and/or modify |
4 | it under the terms of the GNU General Public License as published by |
5 | the Free Software Foundation; version 2 of the License. |
6 | |
7 | This program is distributed in the hope that it will be useful, |
8 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
10 | GNU General Public License for more details. |
11 | |
12 | You should have received a copy of the GNU General Public License |
13 | along with this program; if not, write to the Free Software |
14 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */ |
15 | |
16 | /* Written by Sergei A. Golubchik, who has a shared copyright to this code */ |
17 | |
18 | /* TODO: add caching - pre-read several index entries at once */ |
19 | |
20 | /* |
21 | Added optimization for full-text queries with plus-words. It was |
22 | implemented by sharing maximal document id (max_docid) variable |
23 | inside plus subtree. max_docid could be used by any word in plus |
24 | subtree, but it could be updated by plus-word only. |
25 | |
26 | Fulltext "smarter index merge" optimization assumes that rows |
27 | it gets are ordered by doc_id. That is not the case when we |
28 | search for a word with truncation operator. It may return |
29 | rows in random order. Thus we may not use "smarter index merge" |
30 | optimization with "trunc-words". |
31 | |
32 | The idea is: there is no need to search for docid smaller than |
33 | biggest docid inside current plus subtree or any upper plus subtree. |
34 | |
35 | Examples: |
36 | +word1 word2 |
37 | share same max_docid |
38 | max_docid updated by word1 |
39 | +word1 +(word2 word3) |
40 | share same max_docid |
41 | max_docid updated by word1 |
42 | +(word1 -word2) +(+word3 word4) |
43 | share same max_docid |
44 | max_docid updated by word3 |
45 | +word1 word2 (+word3 word4 (+word5 word6)) |
46 | three subexpressions (including the top-level one), |
47 | every one has its own max_docid, updated by its plus word. |
48 | but for the search word6 uses |
49 | MY_MAX(word1.max_docid, word3.max_docid, word5.max_docid), |
50 | while word4 uses, accordingly, |
51 | MY_MAX(word1.max_docid, word3.max_docid). |
52 | */ |
53 | |
54 | #define FT_CORE |
55 | #include "ma_ftdefs.h" |
56 | |
57 | /* search with boolean queries */ |
58 | |
59 | static double _wghts[11]= |
60 | { |
61 | 0.131687242798354, |
62 | 0.197530864197531, |
63 | 0.296296296296296, |
64 | 0.444444444444444, |
65 | 0.666666666666667, |
66 | 1.000000000000000, |
67 | 1.500000000000000, |
68 | 2.250000000000000, |
69 | 3.375000000000000, |
70 | 5.062500000000000, |
71 | 7.593750000000000}; |
72 | static double *wghts=_wghts+5; /* wghts[i] = 1.5**i */ |
73 | |
74 | static double _nwghts[11]= |
75 | { |
76 | -0.065843621399177, |
77 | -0.098765432098766, |
78 | -0.148148148148148, |
79 | -0.222222222222222, |
80 | -0.333333333333334, |
81 | -0.500000000000000, |
82 | -0.750000000000000, |
83 | -1.125000000000000, |
84 | -1.687500000000000, |
85 | -2.531250000000000, |
86 | -3.796875000000000}; |
87 | static double *nwghts=_nwghts+5; /* nwghts[i] = -0.5*1.5**i */ |
88 | |
89 | #define FTB_FLAG_TRUNC 1 |
90 | /* At most one of the following flags can be set */ |
91 | #define FTB_FLAG_YES 2 |
92 | #define FTB_FLAG_NO 4 |
93 | #define FTB_FLAG_WONLY 8 |
94 | |
95 | typedef struct st_ftb_expr FTB_EXPR; |
96 | struct st_ftb_expr |
97 | { |
98 | FTB_EXPR *up; |
99 | uint flags; |
100 | /* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */ |
101 | my_off_t docid[2]; |
102 | my_off_t max_docid; |
103 | float weight; |
104 | float cur_weight; |
105 | LIST *phrase; /* phrase words */ |
106 | LIST *document; /* for phrase search */ |
107 | uint yesses; /* number of "yes" words matched */ |
108 | uint nos; /* number of "no" words matched */ |
109 | uint ythresh; /* number of "yes" words in expr */ |
110 | uint yweaks; /* number of "yes" words for scan only */ |
111 | }; |
112 | |
113 | typedef struct st_ftb_word |
114 | { |
115 | FTB_EXPR *up; |
116 | uint flags; |
117 | /* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */ |
118 | my_off_t docid[2]; /* for index search and for scan */ |
119 | my_off_t key_root; |
120 | FTB_EXPR *max_docid_expr; |
121 | MARIA_KEYDEF *keyinfo; |
122 | struct st_ftb_word *prev; |
123 | float weight; |
124 | uint ndepth; |
125 | uint len; |
126 | uchar off; |
127 | uchar word[1]; |
128 | } FTB_WORD; |
129 | |
130 | typedef struct st_ft_info |
131 | { |
132 | struct _ft_vft *please; |
133 | MARIA_HA *info; |
134 | CHARSET_INFO *charset; |
135 | FTB_EXPR *root; |
136 | FTB_WORD **list; |
137 | FTB_WORD *last_word; |
138 | MEM_ROOT mem_root; |
139 | QUEUE queue; |
140 | TREE no_dupes; |
141 | my_off_t lastpos; |
142 | uint keynr; |
143 | uchar with_scan; |
144 | enum { UNINITIALIZED, READY, INDEX_SEARCH, INDEX_DONE } state; |
145 | } FTB; |
146 | |
147 | static int FTB_WORD_cmp(my_off_t *v, FTB_WORD *a, FTB_WORD *b) |
148 | { |
149 | int i; |
150 | |
151 | /* if a==curdoc, take it as a < b */ |
152 | if (v && a->docid[0] == *v) |
153 | return -1; |
154 | |
155 | /* ORDER BY docid, ndepth DESC */ |
156 | i=CMP_NUM(a->docid[0], b->docid[0]); |
157 | if (!i) |
158 | i=CMP_NUM(b->ndepth,a->ndepth); |
159 | return i; |
160 | } |
161 | |
162 | static int FTB_WORD_cmp_list(CHARSET_INFO *cs, FTB_WORD **a, FTB_WORD **b) |
163 | { |
164 | /* ORDER BY word, ndepth */ |
165 | int i= ha_compare_text(cs, (uchar*) (*a)->word + 1,(*a)->len - 1, |
166 | (uchar*) (*b)->word + 1,(*b)->len - 1, 0); |
167 | if (!i) |
168 | i=CMP_NUM((*a)->ndepth, (*b)->ndepth); |
169 | return i; |
170 | } |
171 | |
172 | |
173 | typedef struct st_my_ftb_param |
174 | { |
175 | FTB *ftb; |
176 | FTB_EXPR *ftbe; |
177 | uchar *up_quot; |
178 | uint depth; |
179 | } MY_FTB_PARAM; |
180 | |
181 | |
182 | static int ftb_query_add_word(MYSQL_FTPARSER_PARAM *param, |
183 | const char *word, int word_len, |
184 | MYSQL_FTPARSER_BOOLEAN_INFO *info) |
185 | { |
186 | MY_FTB_PARAM *ftb_param= param->mysql_ftparam; |
187 | FTB_WORD *ftbw; |
188 | FTB_EXPR *ftbe, *tmp_expr; |
189 | FT_WORD *phrase_word; |
190 | LIST *tmp_element; |
191 | int r= info->weight_adjust; |
192 | float weight= (float) |
193 | (info->wasign ? nwghts : wghts)[(r>5)?5:((r<-5)?-5:r)]; |
194 | |
195 | switch (info->type) { |
196 | case FT_TOKEN_WORD: |
197 | ftbw= (FTB_WORD *)alloc_root(&ftb_param->ftb->mem_root, |
198 | sizeof(FTB_WORD) + |
199 | (info->trunc ? MARIA_MAX_KEY_BUFF : |
200 | word_len * ftb_param->ftb->charset->mbmaxlen + |
201 | HA_FT_WLEN + |
202 | ftb_param->ftb->info->s->rec_reflength)); |
203 | ftbw->len= word_len + 1; |
204 | ftbw->flags= 0; |
205 | ftbw->off= 0; |
206 | if (info->yesno > 0) ftbw->flags|= FTB_FLAG_YES; |
207 | if (info->yesno < 0) ftbw->flags|= FTB_FLAG_NO; |
208 | if (info->trunc) ftbw->flags|= FTB_FLAG_TRUNC; |
209 | ftbw->weight= weight; |
210 | ftbw->up= ftb_param->ftbe; |
211 | ftbw->docid[0]= ftbw->docid[1]= HA_OFFSET_ERROR; |
212 | ftbw->ndepth= (info->yesno < 0) + ftb_param->depth; |
213 | ftbw->key_root= HA_OFFSET_ERROR; |
214 | memcpy(ftbw->word + 1, word, word_len); |
215 | ftbw->word[0]= word_len; |
216 | if (info->yesno > 0) ftbw->up->ythresh++; |
217 | ftb_param->ftb->queue.max_elements++; |
218 | ftbw->prev= ftb_param->ftb->last_word; |
219 | ftb_param->ftb->last_word= ftbw; |
220 | ftb_param->ftb->with_scan|= (info->trunc & FTB_FLAG_TRUNC); |
221 | for (tmp_expr= ftb_param->ftbe; tmp_expr->up; tmp_expr= tmp_expr->up) |
222 | if (! (tmp_expr->flags & FTB_FLAG_YES)) |
223 | break; |
224 | ftbw->max_docid_expr= tmp_expr; |
225 | /* fall through */ |
226 | case FT_TOKEN_STOPWORD: |
227 | if (! ftb_param->up_quot) break; |
228 | phrase_word= (FT_WORD *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD)); |
229 | tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST)); |
230 | phrase_word->pos= (uchar*)word; |
231 | phrase_word->len= word_len; |
232 | tmp_element->data= (void *)phrase_word; |
233 | ftb_param->ftbe->phrase= list_add(ftb_param->ftbe->phrase, tmp_element); |
234 | /* Allocate document list at this point. |
235 | It allows to avoid huge amount of allocs/frees for each row.*/ |
236 | tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST)); |
237 | tmp_element->data= alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD)); |
238 | ftb_param->ftbe->document= |
239 | list_add(ftb_param->ftbe->document, tmp_element); |
240 | break; |
241 | case FT_TOKEN_LEFT_PAREN: |
242 | ftbe=(FTB_EXPR *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FTB_EXPR)); |
243 | ftbe->flags= 0; |
244 | if (info->yesno > 0) ftbe->flags|= FTB_FLAG_YES; |
245 | if (info->yesno < 0) ftbe->flags|= FTB_FLAG_NO; |
246 | ftbe->weight= weight; |
247 | ftbe->up= ftb_param->ftbe; |
248 | ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0; |
249 | ftbe->docid[0]= ftbe->docid[1]= HA_OFFSET_ERROR; |
250 | ftbe->phrase= NULL; |
251 | ftbe->document= 0; |
252 | if (info->quot) ftb_param->ftb->with_scan|= 2; |
253 | if (info->yesno > 0) ftbe->up->ythresh++; |
254 | ftb_param->ftbe= ftbe; |
255 | ftb_param->depth++; |
256 | ftb_param->up_quot= (uchar*)info->quot; |
257 | break; |
258 | case FT_TOKEN_RIGHT_PAREN: |
259 | if (ftb_param->ftbe->document) |
260 | { |
261 | /* Circuit document list */ |
262 | for (tmp_element= ftb_param->ftbe->document; |
263 | tmp_element->next; tmp_element= tmp_element->next) /* no-op */; |
264 | tmp_element->next= ftb_param->ftbe->document; |
265 | ftb_param->ftbe->document->prev= tmp_element; |
266 | } |
267 | info->quot= 0; |
268 | if (ftb_param->ftbe->up) |
269 | { |
270 | DBUG_ASSERT(ftb_param->depth); |
271 | ftb_param->ftbe= ftb_param->ftbe->up; |
272 | ftb_param->depth--; |
273 | ftb_param->up_quot= 0; |
274 | } |
275 | break; |
276 | case FT_TOKEN_EOF: |
277 | default: |
278 | break; |
279 | } |
280 | return(0); |
281 | } |
282 | |
283 | |
284 | static int ftb_parse_query_internal(MYSQL_FTPARSER_PARAM *param, |
285 | const char *query, int len) |
286 | { |
287 | MY_FTB_PARAM *ftb_param= param->mysql_ftparam; |
288 | MYSQL_FTPARSER_BOOLEAN_INFO info; |
289 | CHARSET_INFO *cs= ftb_param->ftb->charset; |
290 | const uchar **start= (const uchar**) &query; |
291 | uchar *end= (uchar*) query + len; |
292 | FT_WORD w; |
293 | |
294 | info.prev= ' '; |
295 | info.quot= 0; |
296 | while (maria_ft_get_word(cs, start, end, &w, &info)) |
297 | param->mysql_add_word(param, (char*)w.pos, w.len, &info); |
298 | return(0); |
299 | } |
300 | |
301 | |
302 | static int _ftb_parse_query(FTB *ftb, uchar *query, uint len, |
303 | struct st_mysql_ftparser *parser) |
304 | { |
305 | MYSQL_FTPARSER_PARAM *param; |
306 | MY_FTB_PARAM ftb_param; |
307 | DBUG_ENTER("_ftb_parse_query" ); |
308 | DBUG_ASSERT(parser); |
309 | |
310 | if (ftb->state != UNINITIALIZED) |
311 | DBUG_RETURN(0); |
312 | if (! (param= maria_ftparser_call_initializer(ftb->info, ftb->keynr, 0))) |
313 | DBUG_RETURN(1); |
314 | |
315 | ftb_param.ftb= ftb; |
316 | ftb_param.depth= 0; |
317 | ftb_param.ftbe= ftb->root; |
318 | ftb_param.up_quot= 0; |
319 | |
320 | param->mysql_parse= ftb_parse_query_internal; |
321 | param->mysql_add_word= ftb_query_add_word; |
322 | param->mysql_ftparam= (void *)&ftb_param; |
323 | param->cs= ftb->charset; |
324 | param->doc= (char*)query; |
325 | param->length= len; |
326 | param->flags= 0; |
327 | param->mode= MYSQL_FTPARSER_FULL_BOOLEAN_INFO; |
328 | DBUG_RETURN(parser->parse(param)); |
329 | } |
330 | |
331 | |
332 | static int _ftb_no_dupes_cmp(void* not_used __attribute__((unused)), |
333 | const void *a,const void *b) |
334 | { |
335 | return CMP_NUM((*((my_off_t*)a)), (*((my_off_t*)b))); |
336 | } |
337 | |
338 | |
339 | /* returns 1 if the search was finished (must-word wasn't found) */ |
340 | |
341 | static int _ft2_search_no_lock(FTB *ftb, FTB_WORD *ftbw, my_bool init_search) |
342 | { |
343 | int r; |
344 | int subkeys=1; |
345 | my_bool can_go_down; |
346 | MARIA_HA *info=ftb->info; |
347 | uint UNINIT_VAR(off), =HA_FT_WLEN+info->s->base.rec_reflength; |
348 | uchar *lastkey_buf= ftbw->word+ftbw->off; |
349 | MARIA_KEY key; |
350 | |
351 | if (ftbw->flags & FTB_FLAG_TRUNC) |
352 | lastkey_buf+=ftbw->len; |
353 | |
354 | if (init_search) |
355 | { |
356 | ftbw->key_root=info->s->state.key_root[ftb->keynr]; |
357 | ftbw->keyinfo=info->s->keyinfo+ftb->keynr; |
358 | info->last_key.keyinfo= key.keyinfo= ftbw->keyinfo; |
359 | info->lastinx= ~0; /* Safety */ |
360 | key.data= ftbw->word; |
361 | key.data_length= ftbw->len; |
362 | key.ref_length= 0; |
363 | key.flag= 0; |
364 | |
365 | r= _ma_search(info, &key, SEARCH_FIND | SEARCH_BIGGER, ftbw->key_root); |
366 | } |
367 | else |
368 | { |
369 | uint sflag= SEARCH_BIGGER; |
370 | my_off_t max_docid=0; |
371 | FTB_EXPR *tmp; |
372 | |
373 | for (tmp= ftbw->max_docid_expr; tmp; tmp= tmp->up) |
374 | set_if_bigger(max_docid, tmp->max_docid); |
375 | |
376 | if (ftbw->docid[0] < max_docid) |
377 | { |
378 | sflag|= SEARCH_SAME; |
379 | _ma_dpointer(info->s, (uchar*) (ftbw->word + ftbw->len + HA_FT_WLEN), |
380 | max_docid); |
381 | } |
382 | |
383 | info->last_key.keyinfo= key.keyinfo= ftbw->keyinfo; |
384 | info->lastinx= ~0; /* Safety */ |
385 | key.data= lastkey_buf; |
386 | key.data_length= USE_WHOLE_KEY; |
387 | key.ref_length= 0; |
388 | key.flag= 0; |
389 | |
390 | r= _ma_search(info, &key, sflag, ftbw->key_root); |
391 | } |
392 | |
393 | can_go_down=(!ftbw->off && (init_search || (ftbw->flags & FTB_FLAG_TRUNC))); |
394 | /* Skip rows inserted by concurrent insert */ |
395 | while (!r) |
396 | { |
397 | if (can_go_down) |
398 | { |
399 | /* going down ? */ |
400 | off= info->last_key.data_length + info->last_key.ref_length - extra; |
401 | subkeys=ft_sintXkorr(info->last_key.data + off); |
402 | } |
403 | if (subkeys<0 || info->cur_row.lastpos < info->state->data_file_length) |
404 | break; |
405 | r= _ma_search_next(info, &info->last_key, SEARCH_BIGGER, ftbw->key_root); |
406 | } |
407 | |
408 | if (!r && !ftbw->off) |
409 | { |
410 | r= ha_compare_text(ftb->charset, |
411 | info->last_key.data+1, |
412 | info->last_key.data_length + info->last_key.ref_length- |
413 | extra-1, |
414 | (uchar*) ftbw->word+1, |
415 | ftbw->len-1, |
416 | (my_bool) (ftbw->flags & FTB_FLAG_TRUNC)); |
417 | } |
418 | |
419 | if (r) /* not found */ |
420 | { |
421 | if (!ftbw->off || !(ftbw->flags & FTB_FLAG_TRUNC)) |
422 | { |
423 | ftbw->docid[0]=HA_OFFSET_ERROR; |
424 | if ((ftbw->flags & FTB_FLAG_YES) && ftbw->up->up==0) |
425 | { |
426 | /* |
427 | This word MUST BE present in every document returned, |
428 | so we can stop the search right now |
429 | */ |
430 | ftb->state=INDEX_DONE; |
431 | return 1; /* search is done */ |
432 | } |
433 | else |
434 | return 0; |
435 | } |
436 | |
437 | /* going up to the first-level tree to continue search there */ |
438 | _ma_dpointer(info->s, (lastkey_buf+HA_FT_WLEN), ftbw->key_root); |
439 | ftbw->key_root=info->s->state.key_root[ftb->keynr]; |
440 | ftbw->keyinfo=info->s->keyinfo+ftb->keynr; |
441 | ftbw->off=0; |
442 | return _ft2_search_no_lock(ftb, ftbw, 0); |
443 | } |
444 | |
445 | /* matching key found */ |
446 | memcpy(lastkey_buf, info->last_key.data, |
447 | info->last_key.data_length + info->last_key.ref_length); |
448 | if (lastkey_buf == ftbw->word) |
449 | ftbw->len= info->last_key.data_length + info->last_key.ref_length - extra; |
450 | |
451 | /* going down ? */ |
452 | if (subkeys<0) |
453 | { |
454 | /* |
455 | yep, going down, to the second-level tree |
456 | TODO here: subkey-based optimization |
457 | */ |
458 | ftbw->off=off; |
459 | ftbw->key_root= info->cur_row.lastpos; |
460 | ftbw->keyinfo=& info->s->ft2_keyinfo; |
461 | r= _ma_search_first(info, ftbw->keyinfo, ftbw->key_root); |
462 | DBUG_ASSERT(r==0); /* found something */ |
463 | memcpy(lastkey_buf+off, info->last_key.data, |
464 | info->last_key.data_length + info->last_key.ref_length); |
465 | } |
466 | ftbw->docid[0]= info->cur_row.lastpos; |
467 | if (ftbw->flags & FTB_FLAG_YES && !(ftbw->flags & FTB_FLAG_TRUNC)) |
468 | ftbw->max_docid_expr->max_docid= info->cur_row.lastpos; |
469 | return 0; |
470 | } |
471 | |
472 | static int _ft2_search(FTB *ftb, FTB_WORD *ftbw, my_bool init_search) |
473 | { |
474 | int r; |
475 | MARIA_SHARE *share= ftb->info->s; |
476 | if (share->lock_key_trees) |
477 | mysql_rwlock_rdlock(&share->keyinfo[ftb->keynr].root_lock); |
478 | r= _ft2_search_no_lock(ftb, ftbw, init_search); |
479 | if (share->lock_key_trees) |
480 | mysql_rwlock_unlock(&share->keyinfo[ftb->keynr].root_lock); |
481 | return r; |
482 | } |
483 | |
484 | |
485 | static void _ftb_init_index_search(FT_INFO *ftb) |
486 | { |
487 | int i; |
488 | FTB_WORD *ftbw; |
489 | |
490 | if (ftb->state == UNINITIALIZED || ftb->keynr == NO_SUCH_KEY) |
491 | return; |
492 | ftb->state=INDEX_SEARCH; |
493 | |
494 | for (i= queue_last_element(&ftb->queue); |
495 | (int) i >= (int) queue_first_element(&ftb->queue); |
496 | i--) |
497 | { |
498 | ftbw=(FTB_WORD *)(queue_element(&ftb->queue, i)); |
499 | |
500 | if (ftbw->flags & FTB_FLAG_TRUNC) |
501 | { |
502 | /* |
503 | special treatment for truncation operator |
504 | 1. there are some (besides this) +words |
505 | | no need to search in the index, it can never ADD new rows |
506 | | to the result, and to remove half-matched rows we do scan anyway |
507 | 2. -trunc* |
508 | | same as 1. |
509 | 3. in 1 and 2, +/- need not be on the same expr. level, |
510 | but can be on any upper level, as in +word +(trunc1* trunc2*) |
511 | 4. otherwise |
512 | | We have to index-search for this prefix. |
513 | | It may cause duplicates, as in the index (sorted by <word,docid>) |
514 | | <aaaa,row1> |
515 | | <aabb,row2> |
516 | | <aacc,row1> |
517 | | Searching for "aa*" will find row1 twice... |
518 | */ |
519 | FTB_EXPR *ftbe; |
520 | for (ftbe=(FTB_EXPR*)ftbw; |
521 | ftbe->up && !(ftbe->up->flags & FTB_FLAG_TRUNC); |
522 | ftbe->up->flags|= FTB_FLAG_TRUNC, ftbe=ftbe->up) |
523 | { |
524 | if (ftbe->flags & FTB_FLAG_NO || /* 2 */ |
525 | ftbe->up->ythresh - ftbe->up->yweaks > |
526 | (uint) MY_TEST(ftbe->flags & FTB_FLAG_YES)) /* 1 */ |
527 | { |
528 | FTB_EXPR *top_ftbe=ftbe->up; |
529 | ftbw->docid[0]=HA_OFFSET_ERROR; |
530 | for (ftbe=(FTB_EXPR *)ftbw; |
531 | ftbe != top_ftbe && !(ftbe->flags & FTB_FLAG_NO); |
532 | ftbe=ftbe->up) |
533 | ftbe->up->yweaks++; |
534 | ftbe=0; |
535 | break; |
536 | } |
537 | } |
538 | if (!ftbe) |
539 | continue; |
540 | /* 4 */ |
541 | if (!is_tree_inited(& ftb->no_dupes)) |
542 | init_tree(& ftb->no_dupes,0,0,sizeof(my_off_t), |
543 | _ftb_no_dupes_cmp,0,0,0); |
544 | else |
545 | reset_tree(& ftb->no_dupes); |
546 | } |
547 | |
548 | ftbw->off=0; /* in case of reinit */ |
549 | if (_ft2_search(ftb, ftbw, 1)) |
550 | return; |
551 | } |
552 | queue_fix(& ftb->queue); |
553 | } |
554 | |
555 | |
556 | FT_INFO * maria_ft_init_boolean_search(MARIA_HA *info, uint keynr, |
557 | uchar *query, uint query_len, |
558 | CHARSET_INFO *cs) |
559 | { |
560 | FTB *ftb; |
561 | FTB_EXPR *ftbe; |
562 | FTB_WORD *ftbw; |
563 | |
564 | if (!(ftb=(FTB *)my_malloc(sizeof(FTB), MYF(MY_WME)))) |
565 | return 0; |
566 | ftb->please= (struct _ft_vft *) & _ma_ft_vft_boolean; |
567 | ftb->state=UNINITIALIZED; |
568 | ftb->info=info; |
569 | ftb->keynr=keynr; |
570 | ftb->charset=cs; |
571 | DBUG_ASSERT(keynr==NO_SUCH_KEY || cs == info->s->keyinfo[keynr].seg->charset); |
572 | ftb->with_scan=0; |
573 | ftb->lastpos=HA_OFFSET_ERROR; |
574 | bzero(& ftb->no_dupes, sizeof(TREE)); |
575 | ftb->last_word= 0; |
576 | |
577 | init_alloc_root(&ftb->mem_root, "fulltext" , 1024, 1024, 0); |
578 | ftb->queue.max_elements= 0; |
579 | if (!(ftbe=(FTB_EXPR *)alloc_root(&ftb->mem_root, sizeof(FTB_EXPR)))) |
580 | goto err; |
581 | ftbe->weight=1; |
582 | ftbe->flags=FTB_FLAG_YES; |
583 | ftbe->nos=1; |
584 | ftbe->up=0; |
585 | ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0; |
586 | ftbe->docid[0]=ftbe->docid[1]=HA_OFFSET_ERROR; |
587 | ftbe->phrase= NULL; |
588 | ftbe->document= 0; |
589 | ftb->root=ftbe; |
590 | if (unlikely(_ftb_parse_query(ftb, query, query_len, |
591 | keynr == NO_SUCH_KEY ? &ft_default_parser : |
592 | info->s->keyinfo[keynr].parser))) |
593 | goto err; |
594 | /* |
595 | Hack: instead of init_queue, we'll use reinit queue to be able |
596 | to alloc queue with alloc_root() |
597 | */ |
598 | if (! (ftb->queue.root= (uchar **)alloc_root(&ftb->mem_root, |
599 | (ftb->queue.max_elements + 1) * |
600 | sizeof(void *)))) |
601 | goto err; |
602 | reinit_queue(&ftb->queue, ftb->queue.max_elements, 0, 0, |
603 | (int (*)(void*, uchar*, uchar*))FTB_WORD_cmp, 0, 0, 0); |
604 | for (ftbw= ftb->last_word; ftbw; ftbw= ftbw->prev) |
605 | queue_insert(&ftb->queue, (uchar *)ftbw); |
606 | ftb->list=(FTB_WORD **)alloc_root(&ftb->mem_root, |
607 | sizeof(FTB_WORD *)*ftb->queue.elements); |
608 | memcpy(ftb->list, ftb->queue.root+1, sizeof(FTB_WORD *)*ftb->queue.elements); |
609 | my_qsort2(ftb->list, ftb->queue.elements, sizeof(FTB_WORD *), |
610 | (qsort2_cmp)FTB_WORD_cmp_list, (void*) ftb->charset); |
611 | if (ftb->queue.elements<2) ftb->with_scan &= ~FTB_FLAG_TRUNC; |
612 | ftb->state=READY; |
613 | return ftb; |
614 | err: |
615 | free_root(& ftb->mem_root, MYF(0)); |
616 | my_free(ftb); |
617 | return 0; |
618 | } |
619 | |
620 | |
621 | typedef struct st_my_ftb_phrase_param |
622 | { |
623 | LIST *phrase; |
624 | LIST *document; |
625 | CHARSET_INFO *cs; |
626 | uint phrase_length; |
627 | uint document_length; |
628 | uint match; |
629 | } MY_FTB_PHRASE_PARAM; |
630 | |
631 | |
632 | static int ftb_phrase_add_word(MYSQL_FTPARSER_PARAM *param, |
633 | const char *word, int word_len, |
634 | MYSQL_FTPARSER_BOOLEAN_INFO |
635 | *boolean_info __attribute__((unused))) |
636 | { |
637 | MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam; |
638 | FT_WORD *w= (FT_WORD *)phrase_param->document->data; |
639 | LIST *phrase, *document; |
640 | w->pos= (uchar*)word; |
641 | w->len= word_len; |
642 | phrase_param->document= phrase_param->document->prev; |
643 | if (phrase_param->phrase_length > phrase_param->document_length) |
644 | { |
645 | phrase_param->document_length++; |
646 | return 0; |
647 | } |
648 | /* TODO: rewrite phrase search to avoid |
649 | comparing the same word twice. */ |
650 | for (phrase= phrase_param->phrase, document= phrase_param->document->next; |
651 | phrase; phrase= phrase->next, document= document->next) |
652 | { |
653 | FT_WORD *phrase_word= (FT_WORD *)phrase->data; |
654 | FT_WORD *document_word= (FT_WORD *)document->data; |
655 | if (my_strnncoll(phrase_param->cs, (uchar*) phrase_word->pos, |
656 | phrase_word->len, |
657 | (uchar*) document_word->pos, document_word->len)) |
658 | return 0; |
659 | } |
660 | phrase_param->match++; |
661 | return 0; |
662 | } |
663 | |
664 | |
665 | static int ftb_check_phrase_internal(MYSQL_FTPARSER_PARAM *param, |
666 | const char *document, int len) |
667 | { |
668 | FT_WORD word; |
669 | MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam; |
670 | const uchar *docend= (uchar*)document + len; |
671 | while (maria_ft_simple_get_word(phrase_param->cs, (uchar**)&document, |
672 | docend, &word, FALSE)) |
673 | { |
674 | param->mysql_add_word(param, (char*)word.pos, word.len, 0); |
675 | if (phrase_param->match) |
676 | break; |
677 | } |
678 | return 0; |
679 | } |
680 | |
681 | |
682 | /* |
683 | Checks if given buffer matches phrase list. |
684 | |
685 | SYNOPSIS |
686 | _ftb_check_phrase() |
687 | s0 start of buffer |
688 | e0 end of buffer |
689 | phrase broken into list phrase |
690 | cs charset info |
691 | |
692 | RETURN VALUE |
693 | 1 is returned if phrase found, 0 else. |
694 | -1 is returned if error occurs. |
695 | */ |
696 | |
697 | static int _ftb_check_phrase(FTB *ftb, const uchar *document, uint len, |
698 | FTB_EXPR *ftbe, struct st_mysql_ftparser *parser) |
699 | { |
700 | MY_FTB_PHRASE_PARAM ftb_param; |
701 | MYSQL_FTPARSER_PARAM *param; |
702 | DBUG_ENTER("_ftb_check_phrase" ); |
703 | DBUG_ASSERT(parser); |
704 | |
705 | if (! (param= maria_ftparser_call_initializer(ftb->info, ftb->keynr, 1))) |
706 | DBUG_RETURN(0); |
707 | ftb_param.phrase= ftbe->phrase; |
708 | ftb_param.document= ftbe->document; |
709 | ftb_param.cs= ftb->charset; |
710 | ftb_param.phrase_length= list_length(ftbe->phrase); |
711 | ftb_param.document_length= 1; |
712 | ftb_param.match= 0; |
713 | |
714 | param->mysql_parse= ftb_check_phrase_internal; |
715 | param->mysql_add_word= ftb_phrase_add_word; |
716 | param->mysql_ftparam= (void *)&ftb_param; |
717 | param->cs= ftb->charset; |
718 | param->doc= (char *)document; |
719 | param->length= len; |
720 | param->flags= 0; |
721 | param->mode= MYSQL_FTPARSER_WITH_STOPWORDS; |
722 | if (unlikely(parser->parse(param))) |
723 | return -1; |
724 | DBUG_RETURN(ftb_param.match ? 1 : 0); |
725 | } |
726 | |
727 | |
728 | static int _ftb_climb_the_tree(FTB *ftb, FTB_WORD *ftbw, FT_SEG_ITERATOR *ftsi_orig) |
729 | { |
730 | FT_SEG_ITERATOR ftsi; |
731 | FTB_EXPR *ftbe; |
732 | float weight=ftbw->weight; |
733 | int yn_flag= ftbw->flags, ythresh, mode=(ftsi_orig != 0); |
734 | my_off_t curdoc=ftbw->docid[mode]; |
735 | struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ? |
736 | &ft_default_parser : |
737 | ftb->info->s->keyinfo[ftb->keynr].parser; |
738 | |
739 | for (ftbe=ftbw->up; ftbe; ftbe=ftbe->up) |
740 | { |
741 | ythresh = ftbe->ythresh - (mode ? 0 : ftbe->yweaks); |
742 | if (ftbe->docid[mode] != curdoc) |
743 | { |
744 | ftbe->cur_weight=0; |
745 | ftbe->yesses=ftbe->nos=0; |
746 | ftbe->docid[mode]=curdoc; |
747 | } |
748 | if (ftbe->nos) |
749 | break; |
750 | if (yn_flag & FTB_FLAG_YES) |
751 | { |
752 | weight /= ftbe->ythresh; |
753 | ftbe->cur_weight += weight; |
754 | if ((int) ++ftbe->yesses == ythresh) |
755 | { |
756 | yn_flag=ftbe->flags; |
757 | weight=ftbe->cur_weight*ftbe->weight; |
758 | if (mode && ftbe->phrase) |
759 | { |
760 | int found= 0; |
761 | |
762 | memcpy(&ftsi, ftsi_orig, sizeof(ftsi)); |
763 | while (_ma_ft_segiterator(&ftsi) && !found) |
764 | { |
765 | if (!ftsi.pos) |
766 | continue; |
767 | found= _ftb_check_phrase(ftb, ftsi.pos, ftsi.len, ftbe, parser); |
768 | if (unlikely(found < 0)) |
769 | return 1; |
770 | } |
771 | if (!found) |
772 | break; |
773 | } /* ftbe->quot */ |
774 | } |
775 | else |
776 | break; |
777 | } |
778 | else |
779 | if (yn_flag & FTB_FLAG_NO) |
780 | { |
781 | /* |
782 | NOTE: special sort function of queue assures that all |
783 | (yn_flag & FTB_FLAG_NO) != 0 |
784 | events for every particular subexpression will |
785 | "auto-magically" happen BEFORE all the |
786 | (yn_flag & FTB_FLAG_YES) != 0 events. So no |
787 | already matched expression can become not-matched again. |
788 | */ |
789 | ++ftbe->nos; |
790 | break; |
791 | } |
792 | else |
793 | { |
794 | if (ftbe->ythresh) |
795 | weight/=3; |
796 | ftbe->cur_weight += weight; |
797 | if ((int) ftbe->yesses < ythresh) |
798 | break; |
799 | if (!(yn_flag & FTB_FLAG_WONLY)) |
800 | yn_flag= ((int) ftbe->yesses++ == ythresh) ? ftbe->flags : FTB_FLAG_WONLY ; |
801 | weight*= ftbe->weight; |
802 | } |
803 | } |
804 | return 0; |
805 | } |
806 | |
807 | |
808 | int maria_ft_boolean_read_next(FT_INFO *ftb, char *record) |
809 | { |
810 | FTB_EXPR *ftbe; |
811 | FTB_WORD *ftbw; |
812 | MARIA_HA *info=ftb->info; |
813 | my_off_t curdoc; |
814 | |
815 | if (ftb->state != INDEX_SEARCH && ftb->state != INDEX_DONE) |
816 | return -1; |
817 | |
818 | /* black magic ON */ |
819 | if ((int) _ma_check_index(info, ftb->keynr) < 0) |
820 | return my_errno; |
821 | if (_ma_readinfo(info, F_RDLCK, 1)) |
822 | return my_errno; |
823 | /* black magic OFF */ |
824 | |
825 | if (!ftb->queue.elements) |
826 | return my_errno=HA_ERR_END_OF_FILE; |
827 | |
828 | /* Attention!!! Address of a local variable is used here! See err: label */ |
829 | ftb->queue.first_cmp_arg=(void *)&curdoc; |
830 | |
831 | while (ftb->state == INDEX_SEARCH && |
832 | (curdoc=((FTB_WORD *)queue_top(& ftb->queue))->docid[0]) != |
833 | HA_OFFSET_ERROR) |
834 | { |
835 | while (curdoc == (ftbw=(FTB_WORD *)queue_top(& ftb->queue))->docid[0]) |
836 | { |
837 | if (unlikely(_ftb_climb_the_tree(ftb, ftbw, 0))) |
838 | { |
839 | my_errno= HA_ERR_OUT_OF_MEM; |
840 | goto err; |
841 | } |
842 | |
843 | /* update queue */ |
844 | _ft2_search(ftb, ftbw, 0); |
845 | queue_replace_top(&ftb->queue); |
846 | } |
847 | |
848 | ftbe=ftb->root; |
849 | if (ftbe->docid[0]==curdoc && ftbe->cur_weight>0 && |
850 | ftbe->yesses>=(ftbe->ythresh-ftbe->yweaks) && !ftbe->nos) |
851 | { |
852 | /* curdoc matched ! */ |
853 | if (is_tree_inited(&ftb->no_dupes) && |
854 | tree_insert(&ftb->no_dupes, &curdoc, 0, |
855 | ftb->no_dupes.custom_arg)->count >1) |
856 | /* but it managed already to get past this line once */ |
857 | continue; |
858 | |
859 | info->cur_row.lastpos= curdoc; |
860 | /* Clear all states, except that the table was updated */ |
861 | info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); |
862 | |
863 | if (!(*info->read_record)(info, (uchar *) record, curdoc)) |
864 | { |
865 | info->update|= HA_STATE_AKTIV; /* Record is read */ |
866 | if (ftb->with_scan && |
867 | maria_ft_boolean_find_relevance(ftb, (uchar*)record, 0)==0) |
868 | continue; /* no match */ |
869 | my_errno=0; |
870 | goto err; |
871 | } |
872 | goto err; |
873 | } |
874 | } |
875 | ftb->state=INDEX_DONE; |
876 | my_errno=HA_ERR_END_OF_FILE; |
877 | err: |
878 | ftb->queue.first_cmp_arg=(void *)0; |
879 | return my_errno; |
880 | } |
881 | |
882 | |
883 | typedef struct st_my_ftb_find_param |
884 | { |
885 | FT_INFO *ftb; |
886 | FT_SEG_ITERATOR *ftsi; |
887 | } MY_FTB_FIND_PARAM; |
888 | |
889 | |
890 | static int ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM *param, |
891 | const char *word, int len, |
892 | MYSQL_FTPARSER_BOOLEAN_INFO |
893 | *boolean_info __attribute__((unused))) |
894 | { |
895 | MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam; |
896 | FT_INFO *ftb= ftb_param->ftb; |
897 | FTB_WORD *ftbw; |
898 | int a, b, c; |
899 | /* |
900 | Find right-most element in the array of query words matching this |
901 | word from a document. |
902 | */ |
903 | for (a= 0, b= ftb->queue.elements, c= (a+b)/2; b-a>1; c= (a+b)/2) |
904 | { |
905 | ftbw= ftb->list[c]; |
906 | if (ha_compare_text(ftb->charset, (uchar*)word, len, |
907 | (uchar*)ftbw->word+1, ftbw->len-1, |
908 | (my_bool)(ftbw->flags&FTB_FLAG_TRUNC)) < 0) |
909 | b= c; |
910 | else |
911 | a= c; |
912 | } |
913 | /* |
914 | If there were no words with truncation operator, we iterate to the |
915 | beginning of an array until array element is equal to the word from |
916 | a document. This is done mainly because the same word may be |
917 | mentioned twice (or more) in the query. |
918 | |
919 | In case query has words with truncation operator we must iterate |
920 | to the beginning of the array. There may be non-matching query words |
921 | between matching word with truncation operator and the right-most |
922 | matching element. E.g., if we're looking for 'aaa15' in an array of |
923 | 'aaa1* aaa14 aaa15 aaa16'. |
924 | |
925 | Worse of that there still may be match even if the binary search |
926 | above didn't find matching element. E.g., if we're looking for |
927 | 'aaa15' in an array of 'aaa1* aaa14 aaa16'. The binary search will |
928 | stop at 'aaa16'. |
929 | */ |
930 | for (; c >= 0; c--) |
931 | { |
932 | ftbw= ftb->list[c]; |
933 | if (ha_compare_text(ftb->charset, (uchar*)word, len, |
934 | (uchar*)ftbw->word + 1,ftbw->len - 1, |
935 | (my_bool)(ftbw->flags & FTB_FLAG_TRUNC))) |
936 | { |
937 | if (ftb->with_scan & FTB_FLAG_TRUNC) |
938 | continue; |
939 | else |
940 | break; |
941 | } |
942 | if (ftbw->docid[1] == ftb->info->cur_row.lastpos) |
943 | continue; |
944 | ftbw->docid[1]= ftb->info->cur_row.lastpos; |
945 | if (unlikely(_ftb_climb_the_tree(ftb, ftbw, ftb_param->ftsi))) |
946 | return 1; |
947 | } |
948 | return(0); |
949 | } |
950 | |
951 | |
952 | static int ftb_find_relevance_parse(MYSQL_FTPARSER_PARAM *param, |
953 | const char *doc, int len) |
954 | { |
955 | MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam; |
956 | FT_INFO *ftb= ftb_param->ftb; |
957 | uchar *end= (uchar*) doc + len; |
958 | FT_WORD w; |
959 | while (maria_ft_simple_get_word(ftb->charset, (uchar**)&doc, end, &w, TRUE)) |
960 | param->mysql_add_word(param, (char*)w.pos, w.len, 0); |
961 | return(0); |
962 | } |
963 | |
964 | |
965 | float maria_ft_boolean_find_relevance(FT_INFO *ftb, uchar *record, uint length) |
966 | { |
967 | FTB_EXPR *ftbe; |
968 | FT_SEG_ITERATOR ftsi, ftsi2; |
969 | MARIA_RECORD_POS docid= ftb->info->cur_row.lastpos; |
970 | MY_FTB_FIND_PARAM ftb_param; |
971 | MYSQL_FTPARSER_PARAM *param; |
972 | struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ? |
973 | &ft_default_parser : |
974 | ftb->info->s->keyinfo[ftb->keynr].parser; |
975 | |
976 | if (docid == HA_OFFSET_ERROR) |
977 | return -2.0; |
978 | if (!ftb->queue.elements) |
979 | return 0; |
980 | if (! (param= maria_ftparser_call_initializer(ftb->info, ftb->keynr, 0))) |
981 | return 0; |
982 | |
983 | if (ftb->state != INDEX_SEARCH && docid <= ftb->lastpos) |
984 | { |
985 | FTB_EXPR *x; |
986 | uint i; |
987 | |
988 | for (i=0; i < ftb->queue.elements; i++) |
989 | { |
990 | ftb->list[i]->docid[1]=HA_OFFSET_ERROR; |
991 | for (x=ftb->list[i]->up; x; x=x->up) |
992 | x->docid[1]=HA_OFFSET_ERROR; |
993 | } |
994 | } |
995 | |
996 | ftb->lastpos=docid; |
997 | |
998 | if (ftb->keynr==NO_SUCH_KEY) |
999 | _ma_ft_segiterator_dummy_init(record, length, &ftsi); |
1000 | else |
1001 | _ma_ft_segiterator_init(ftb->info, ftb->keynr, record, &ftsi); |
1002 | memcpy(&ftsi2, &ftsi, sizeof(ftsi)); |
1003 | |
1004 | ftb_param.ftb= ftb; |
1005 | ftb_param.ftsi= &ftsi2; |
1006 | param->mysql_parse= ftb_find_relevance_parse; |
1007 | param->mysql_add_word= ftb_find_relevance_add_word; |
1008 | param->mysql_ftparam= (void *)&ftb_param; |
1009 | param->flags= 0; |
1010 | param->cs= ftb->charset; |
1011 | param->mode= MYSQL_FTPARSER_SIMPLE_MODE; |
1012 | |
1013 | while (_ma_ft_segiterator(&ftsi)) |
1014 | { |
1015 | if (!ftsi.pos) |
1016 | continue; |
1017 | param->doc= (char *)ftsi.pos; |
1018 | param->length= ftsi.len; |
1019 | if (unlikely(parser->parse(param))) |
1020 | return 0; |
1021 | } |
1022 | ftbe=ftb->root; |
1023 | if (ftbe->docid[1]==docid && ftbe->cur_weight>0 && |
1024 | ftbe->yesses>=ftbe->ythresh && !ftbe->nos) |
1025 | { /* row matched ! */ |
1026 | return ftbe->cur_weight; |
1027 | } |
1028 | else |
1029 | { /* match failed ! */ |
1030 | return 0.0; |
1031 | } |
1032 | } |
1033 | |
1034 | |
1035 | void maria_ft_boolean_close_search(FT_INFO *ftb) |
1036 | { |
1037 | if (is_tree_inited(& ftb->no_dupes)) |
1038 | { |
1039 | delete_tree(&ftb->no_dupes, 0); |
1040 | } |
1041 | free_root(& ftb->mem_root, MYF(0)); |
1042 | my_free(ftb); |
1043 | } |
1044 | |
1045 | |
1046 | float maria_ft_boolean_get_relevance(FT_INFO *ftb) |
1047 | { |
1048 | return ftb->root->cur_weight; |
1049 | } |
1050 | |
1051 | |
1052 | void maria_ft_boolean_reinit_search(FT_INFO *ftb) |
1053 | { |
1054 | _ftb_init_index_search(ftb); |
1055 | } |
1056 | |