1 | /* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. |
2 | |
3 | This program is free software; you can redistribute it and/or modify |
4 | it under the terms of the GNU General Public License as published by |
5 | the Free Software Foundation; version 2 of the License. |
6 | |
7 | This program is distributed in the hope that it will be useful, |
8 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
10 | GNU General Public License for more details. |
11 | |
12 | You should have received a copy of the GNU General Public License |
13 | along with this program; if not, write to the Free Software |
14 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ |
15 | |
16 | /* Written by Sergei A. Golubchik, who has a shared copyright to this code */ |
17 | |
18 | #include "ftdefs.h" |
19 | |
20 | typedef struct st_ft_docstat { |
21 | FT_WORD *list; |
22 | uint uniq; |
23 | double sum; |
24 | } FT_DOCSTAT; |
25 | |
26 | typedef struct st_my_ft_parser_param |
27 | { |
28 | TREE *wtree; |
29 | MEM_ROOT *mem_root; |
30 | } MY_FT_PARSER_PARAM; |
31 | |
32 | static int FT_WORD_cmp(CHARSET_INFO* cs, FT_WORD *w1, FT_WORD *w2) |
33 | { |
34 | return ha_compare_text(cs, (uchar*) w1->pos, w1->len, |
35 | (uchar*) w2->pos, w2->len, 0); |
36 | } |
37 | |
38 | static int walk_and_copy(FT_WORD *word,uint32 count,FT_DOCSTAT *docstat) |
39 | { |
40 | word->weight=LWS_IN_USE; |
41 | docstat->sum+=word->weight; |
42 | memcpy((docstat->list)++, word, sizeof(FT_WORD)); |
43 | return 0; |
44 | } |
45 | |
46 | /* transforms tree of words into the array, applying normalization */ |
47 | |
48 | FT_WORD * ft_linearize(TREE *wtree, MEM_ROOT *mem_root) |
49 | { |
50 | FT_WORD *wlist,*p; |
51 | FT_DOCSTAT docstat; |
52 | DBUG_ENTER("ft_linearize" ); |
53 | |
54 | if ((wlist=(FT_WORD *) alloc_root(mem_root, sizeof(FT_WORD)* |
55 | (1+wtree->elements_in_tree)))) |
56 | { |
57 | docstat.list=wlist; |
58 | docstat.uniq=wtree->elements_in_tree; |
59 | docstat.sum=0; |
60 | tree_walk(wtree,(tree_walk_action)&walk_and_copy,&docstat,left_root_right); |
61 | } |
62 | delete_tree(wtree, 0); |
63 | if (!wlist) |
64 | DBUG_RETURN(NULL); |
65 | |
66 | docstat.list->pos=NULL; |
67 | |
68 | for (p=wlist;p->pos;p++) |
69 | { |
70 | p->weight=PRENORM_IN_USE; |
71 | } |
72 | |
73 | for (p=wlist;p->pos;p++) |
74 | { |
75 | p->weight/=NORM_IN_USE; |
76 | } |
77 | |
78 | DBUG_RETURN(wlist); |
79 | } |
80 | |
81 | my_bool ft_boolean_check_syntax_string(const uchar *str) |
82 | { |
83 | uint i, j; |
84 | |
85 | if (!str || |
86 | (strlen((char*) str)+1 != sizeof(DEFAULT_FTB_SYNTAX)) || |
87 | (str[0] != ' ' && str[1] != ' ')) |
88 | return 1; |
89 | for (i=0; i<sizeof(DEFAULT_FTB_SYNTAX); i++) |
90 | { |
91 | /* limiting to 7-bit ascii only */ |
92 | if ((unsigned char)(str[i]) > 127 || my_isalnum(default_charset_info, str[i])) |
93 | return 1; |
94 | for (j=0; j<i; j++) |
95 | if (str[i] == str[j] && (i != 11 || j != 10)) |
96 | return 1; |
97 | } |
98 | return 0; |
99 | } |
100 | |
101 | /* |
102 | RETURN VALUE |
103 | 0 - eof |
104 | 1 - word found |
105 | 2 - left bracket |
106 | 3 - right bracket |
107 | 4 - stopword found |
108 | */ |
109 | uchar ft_get_word(CHARSET_INFO *cs, const uchar **start, const uchar *end, |
110 | FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param) |
111 | { |
112 | const uchar *doc=*start; |
113 | int ctype; |
114 | uint mwc, length; |
115 | int mbl; |
116 | |
117 | param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); |
118 | param->weight_adjust= param->wasign= 0; |
119 | param->type= FT_TOKEN_EOF; |
120 | |
121 | while (doc<end) |
122 | { |
123 | for (; doc < end; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) |
124 | { |
125 | mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end); |
126 | if (true_word_char(ctype, *doc)) |
127 | break; |
128 | if (*doc == FTB_RQUOT && param->quot) |
129 | { |
130 | *start=doc+1; |
131 | param->type= FT_TOKEN_RIGHT_PAREN; |
132 | goto ret; |
133 | } |
134 | if (!param->quot) |
135 | { |
136 | if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT) |
137 | { |
138 | /* param->prev=' '; */ |
139 | *start=doc+1; |
140 | if (*doc == FTB_LQUOT) |
141 | param->quot= (char*) 1; |
142 | param->type= (*doc == FTB_RBR ? FT_TOKEN_RIGHT_PAREN : FT_TOKEN_LEFT_PAREN); |
143 | goto ret; |
144 | } |
145 | if (param->prev == ' ') |
146 | { |
147 | if (*doc == FTB_YES ) { param->yesno=+1; continue; } else |
148 | if (*doc == FTB_EGAL) { param->yesno= 0; continue; } else |
149 | if (*doc == FTB_NO ) { param->yesno=-1; continue; } else |
150 | if (*doc == FTB_INC ) { param->weight_adjust++; continue; } else |
151 | if (*doc == FTB_DEC ) { param->weight_adjust--; continue; } else |
152 | if (*doc == FTB_NEG ) { param->wasign= !param->wasign; continue; } |
153 | } |
154 | } |
155 | param->prev=*doc; |
156 | param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); |
157 | param->weight_adjust= param->wasign= 0; |
158 | } |
159 | |
160 | mwc=length=0; |
161 | for (word->pos= doc; doc < end; length++, |
162 | doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) |
163 | { |
164 | mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end); |
165 | if (true_word_char(ctype, *doc)) |
166 | mwc=0; |
167 | else if (!misc_word_char(*doc) || mwc) |
168 | break; |
169 | else |
170 | mwc++; |
171 | } |
172 | param->prev='A'; /* be sure *prev is true_word_char */ |
173 | word->len= (uint)(doc-word->pos) - mwc; |
174 | if ((param->trunc=(doc<end && *doc == FTB_TRUNC))) |
175 | doc++; |
176 | |
177 | if (((length >= ft_min_word_len && !is_stopword((char*) word->pos, |
178 | word->len)) |
179 | || param->trunc) && length < ft_max_word_len) |
180 | { |
181 | *start=doc; |
182 | param->type= FT_TOKEN_WORD; |
183 | goto ret; |
184 | } |
185 | else if (length) /* make sure length > 0 (if start contains spaces only) */ |
186 | { |
187 | *start= doc; |
188 | param->type= FT_TOKEN_STOPWORD; |
189 | goto ret; |
190 | } |
191 | } |
192 | if (param->quot) |
193 | { |
194 | *start= doc; |
195 | param->type= 3; /* FT_RBR */ |
196 | goto ret; |
197 | } |
198 | ret: |
199 | return param->type; |
200 | } |
201 | |
202 | uchar ft_simple_get_word(CHARSET_INFO *cs, uchar **start, const uchar *end, |
203 | FT_WORD *word, my_bool skip_stopwords) |
204 | { |
205 | uchar *doc= *start; |
206 | uint mwc, length; |
207 | int mbl; |
208 | int ctype; |
209 | DBUG_ENTER("ft_simple_get_word" ); |
210 | |
211 | do |
212 | { |
213 | for (;; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) |
214 | { |
215 | if (doc >= end) |
216 | DBUG_RETURN(0); |
217 | mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end); |
218 | if (true_word_char(ctype, *doc)) |
219 | break; |
220 | } |
221 | |
222 | mwc= length= 0; |
223 | for (word->pos= doc; doc < end; length++, |
224 | doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) |
225 | { |
226 | mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end); |
227 | if (true_word_char(ctype, *doc)) |
228 | mwc= 0; |
229 | else if (!misc_word_char(*doc) || mwc) |
230 | break; |
231 | else |
232 | mwc++; |
233 | } |
234 | |
235 | word->len= (uint)(doc-word->pos) - mwc; |
236 | |
237 | if (skip_stopwords == FALSE || |
238 | (length >= ft_min_word_len && length < ft_max_word_len && |
239 | !is_stopword((char*) word->pos, word->len))) |
240 | { |
241 | *start= doc; |
242 | DBUG_RETURN(1); |
243 | } |
244 | } while (doc < end); |
245 | DBUG_RETURN(0); |
246 | } |
247 | |
248 | void ft_parse_init(TREE *wtree, CHARSET_INFO *cs) |
249 | { |
250 | DBUG_ENTER("ft_parse_init" ); |
251 | if (!is_tree_inited(wtree)) |
252 | init_tree(wtree, 0, 0, sizeof(FT_WORD), (qsort_cmp2)&FT_WORD_cmp, 0, |
253 | (void*)cs, MYF(0)); |
254 | DBUG_VOID_RETURN; |
255 | } |
256 | |
257 | |
258 | static int ft_add_word(MYSQL_FTPARSER_PARAM *param, |
259 | const char *word, int word_len, |
260 | MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info __attribute__((unused))) |
261 | { |
262 | TREE *wtree; |
263 | FT_WORD w; |
264 | MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam; |
265 | DBUG_ENTER("ft_add_word" ); |
266 | wtree= ft_param->wtree; |
267 | if (param->flags & MYSQL_FTFLAGS_NEED_COPY) |
268 | { |
269 | uchar *ptr; |
270 | DBUG_ASSERT(wtree->with_delete == 0); |
271 | ptr= (uchar *)alloc_root(ft_param->mem_root, word_len); |
272 | memcpy(ptr, word, word_len); |
273 | w.pos= ptr; |
274 | } |
275 | else |
276 | w.pos= (uchar*) word; |
277 | w.len= word_len; |
278 | if (!tree_insert(wtree, &w, 0, wtree->custom_arg)) |
279 | { |
280 | delete_tree(wtree, 0); |
281 | DBUG_RETURN(1); |
282 | } |
283 | DBUG_RETURN(0); |
284 | } |
285 | |
286 | |
287 | static int ft_parse_internal(MYSQL_FTPARSER_PARAM *param, |
288 | const char *doc_arg, int doc_len) |
289 | { |
290 | uchar *doc= (uchar*) doc_arg; |
291 | uchar *end= doc + doc_len; |
292 | MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam; |
293 | TREE *wtree= ft_param->wtree; |
294 | FT_WORD w; |
295 | DBUG_ENTER("ft_parse_internal" ); |
296 | |
297 | while (ft_simple_get_word(wtree->custom_arg, &doc, end, &w, TRUE)) |
298 | if (param->mysql_add_word(param, (char*) w.pos, (int)w.len, 0)) |
299 | DBUG_RETURN(1); |
300 | DBUG_RETURN(0); |
301 | } |
302 | |
303 | |
304 | int ft_parse(TREE *wtree, uchar *doc, int doclen, |
305 | struct st_mysql_ftparser *parser, |
306 | MYSQL_FTPARSER_PARAM *param, MEM_ROOT *mem_root) |
307 | { |
308 | MY_FT_PARSER_PARAM my_param; |
309 | DBUG_ENTER("ft_parse" ); |
310 | DBUG_ASSERT(parser); |
311 | |
312 | my_param.wtree= wtree; |
313 | my_param.mem_root= mem_root; |
314 | |
315 | param->mysql_parse= ft_parse_internal; |
316 | param->mysql_add_word= ft_add_word; |
317 | param->mysql_ftparam= &my_param; |
318 | param->cs= wtree->custom_arg; |
319 | param->doc= (char*) doc; |
320 | param->length= doclen; |
321 | param->mode= MYSQL_FTPARSER_SIMPLE_MODE; |
322 | DBUG_RETURN(parser->parse(param)); |
323 | } |
324 | |
325 | |
326 | #define MAX_PARAM_NR 2 |
327 | |
328 | MYSQL_FTPARSER_PARAM* ftparser_alloc_param(MI_INFO *info) |
329 | { |
330 | if (!info->ftparser_param) |
331 | { |
332 | /* |
333 | . info->ftparser_param can not be zero after the initialization, |
334 | because it always includes built-in fulltext parser. And built-in |
335 | parser can be called even if the table has no fulltext indexes and |
336 | no varchar/text fields. |
337 | |
338 | ftb_find_relevance... parser (ftb_find_relevance_parse, |
339 | ftb_find_relevance_add_word) calls ftb_check_phrase... parser |
340 | (ftb_check_phrase_internal, ftb_phrase_add_word). Thus MAX_PARAM_NR=2. |
341 | */ |
342 | info->ftparser_param= (MYSQL_FTPARSER_PARAM *) |
343 | my_malloc(MAX_PARAM_NR * sizeof(MYSQL_FTPARSER_PARAM) * |
344 | info->s->ftkeys, MYF(MY_WME | MY_ZEROFILL)); |
345 | init_alloc_root(&info->ft_memroot, "fulltext_parser" , |
346 | FTPARSER_MEMROOT_ALLOC_SIZE, 0, MYF(0)); |
347 | } |
348 | return info->ftparser_param; |
349 | } |
350 | |
351 | |
352 | MYSQL_FTPARSER_PARAM *ftparser_call_initializer(MI_INFO *info, |
353 | uint keynr, uint paramnr) |
354 | { |
355 | uint32 ftparser_nr; |
356 | struct st_mysql_ftparser *parser; |
357 | |
358 | if (!ftparser_alloc_param(info)) |
359 | return 0; |
360 | |
361 | if (keynr == NO_SUCH_KEY) |
362 | { |
363 | ftparser_nr= 0; |
364 | parser= &ft_default_parser; |
365 | } |
366 | else |
367 | { |
368 | ftparser_nr= info->s->keyinfo[keynr].ftkey_nr; |
369 | parser= info->s->keyinfo[keynr].parser; |
370 | } |
371 | DBUG_ASSERT(paramnr < MAX_PARAM_NR); |
372 | ftparser_nr= ftparser_nr*MAX_PARAM_NR + paramnr; |
373 | if (! info->ftparser_param[ftparser_nr].mysql_add_word) |
374 | { |
375 | /* Note, that mysql_add_word is used here as a flag: |
376 | mysql_add_word == 0 - parser is not initialized |
377 | mysql_add_word != 0 - parser is initialized, or no |
378 | initialization needed. */ |
379 | info->ftparser_param[ftparser_nr].mysql_add_word= |
380 | (int (*)(struct st_mysql_ftparser_param *, const char *, int, |
381 | MYSQL_FTPARSER_BOOLEAN_INFO *)) 1; |
382 | if (parser->init && parser->init(&info->ftparser_param[ftparser_nr])) |
383 | return 0; |
384 | } |
385 | return &info->ftparser_param[ftparser_nr]; |
386 | } |
387 | |
388 | void ftparser_call_deinitializer(MI_INFO *info) |
389 | { |
390 | uint i, j, keys= info->s->state.header.keys; |
391 | free_root(&info->ft_memroot, MYF(0)); |
392 | if (! info->ftparser_param) |
393 | return; |
394 | for (i= 0; i < keys; i++) |
395 | { |
396 | MI_KEYDEF *keyinfo= &info->s->keyinfo[i]; |
397 | for (j=0; j < MAX_PARAM_NR; j++) |
398 | { |
399 | MYSQL_FTPARSER_PARAM *ftparser_param= |
400 | &info->ftparser_param[keyinfo->ftkey_nr * MAX_PARAM_NR + j]; |
401 | if (keyinfo->flag & HA_FULLTEXT && ftparser_param->mysql_add_word) |
402 | { |
403 | if (keyinfo->parser->deinit) |
404 | keyinfo->parser->deinit(ftparser_param); |
405 | ftparser_param->mysql_add_word= 0; |
406 | } |
407 | else |
408 | break; |
409 | } |
410 | } |
411 | } |
412 | |
413 | |