1 | /* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB |
2 | |
3 | This program is free software; you can redistribute it and/or modify |
4 | it under the terms of the GNU General Public License as published by |
5 | the Free Software Foundation; version 2 of the License. |
6 | |
7 | This program is distributed in the hope that it will be useful, |
8 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
10 | GNU General Public License for more details. |
11 | |
12 | You should have received a copy of the GNU General Public License |
13 | along with this program; if not, write to the Free Software |
14 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */ |
15 | |
16 | /* Written by Sergei A. Golubchik, who has a shared copyright to this code */ |
17 | |
18 | #include "ma_ftdefs.h" |
19 | |
20 | typedef struct st_maria_ft_docstat { |
21 | FT_WORD *list; |
22 | uint uniq; |
23 | double sum; |
24 | } FT_DOCSTAT; |
25 | |
26 | |
27 | typedef struct st_my_maria_ft_parser_param |
28 | { |
29 | TREE *wtree; |
30 | MEM_ROOT *mem_root; |
31 | } MY_FT_PARSER_PARAM; |
32 | |
33 | |
34 | static int FT_WORD_cmp(CHARSET_INFO* cs, FT_WORD *w1, FT_WORD *w2) |
35 | { |
36 | return ha_compare_text(cs, (uchar*) w1->pos, w1->len, |
37 | (uchar*) w2->pos, w2->len, 0); |
38 | } |
39 | |
40 | static int walk_and_copy(FT_WORD *word,uint32 count,FT_DOCSTAT *docstat) |
41 | { |
42 | word->weight=LWS_IN_USE; |
43 | docstat->sum+=word->weight; |
44 | memcpy((docstat->list)++, word, sizeof(FT_WORD)); |
45 | return 0; |
46 | } |
47 | |
48 | /* transforms tree of words into the array, applying normalization */ |
49 | |
50 | FT_WORD * maria_ft_linearize(TREE *wtree, MEM_ROOT *mem_root) |
51 | { |
52 | FT_WORD *wlist,*p; |
53 | FT_DOCSTAT docstat; |
54 | DBUG_ENTER("maria_ft_linearize" ); |
55 | |
56 | if ((wlist=(FT_WORD *) alloc_root(mem_root, sizeof(FT_WORD)* |
57 | (1+wtree->elements_in_tree)))) |
58 | { |
59 | docstat.list=wlist; |
60 | docstat.uniq=wtree->elements_in_tree; |
61 | docstat.sum=0; |
62 | tree_walk(wtree,(tree_walk_action)&walk_and_copy,&docstat,left_root_right); |
63 | } |
64 | delete_tree(wtree, 0); |
65 | if (!wlist) |
66 | DBUG_RETURN(NULL); |
67 | |
68 | docstat.list->pos=NULL; |
69 | |
70 | for (p=wlist;p->pos;p++) |
71 | { |
72 | p->weight=PRENORM_IN_USE; |
73 | } |
74 | |
75 | for (p=wlist;p->pos;p++) |
76 | { |
77 | p->weight/=NORM_IN_USE; |
78 | } |
79 | |
80 | DBUG_RETURN(wlist); |
81 | } |
82 | |
83 | my_bool maria_ft_boolean_check_syntax_string(const uchar *str) |
84 | { |
85 | uint i, j; |
86 | |
87 | if (!str || |
88 | (strlen((const char *) str) + 1 != sizeof(ft_boolean_syntax)) || |
89 | (str[0] != ' ' && str[1] != ' ')) |
90 | return 1; |
91 | for (i=0; i<sizeof(ft_boolean_syntax); i++) |
92 | { |
93 | /* limiting to 7-bit ascii only */ |
94 | if ((unsigned char)(str[i]) > 127 || |
95 | my_isalnum(default_charset_info, str[i])) |
96 | return 1; |
97 | for (j=0; j<i; j++) |
98 | if (str[i] == str[j] && (i != 11 || j != 10)) |
99 | return 1; |
100 | } |
101 | return 0; |
102 | } |
103 | |
104 | /* |
105 | RETURN VALUE |
106 | 0 - eof |
107 | 1 - word found |
108 | 2 - left bracket |
109 | 3 - right bracket |
110 | 4 - stopword found |
111 | */ |
112 | uchar maria_ft_get_word(CHARSET_INFO *cs, const uchar **start, |
113 | const uchar *end, |
114 | FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param) |
115 | { |
116 | const uchar *doc= *start; |
117 | int ctype; |
118 | uint mwc, length; |
119 | int mbl; |
120 | |
121 | param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); |
122 | param->weight_adjust= param->wasign= 0; |
123 | param->type= FT_TOKEN_EOF; |
124 | |
125 | while (doc<end) |
126 | { |
127 | for (; doc < end; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) |
128 | { |
129 | mbl= cs->cset->ctype(cs, &ctype, doc, end); |
130 | if (true_word_char(ctype, *doc)) |
131 | break; |
132 | if (*doc == FTB_RQUOT && param->quot) |
133 | { |
134 | param->quot= (char *) doc; |
135 | *start=doc+1; |
136 | param->type= FT_TOKEN_RIGHT_PAREN; |
137 | goto ret; |
138 | } |
139 | if (!param->quot) |
140 | { |
141 | if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT) |
142 | { |
143 | /* param->prev=' '; */ |
144 | *start=doc+1; |
145 | if (*doc == FTB_LQUOT) |
146 | param->quot= (char *) *start; |
147 | param->type= (*doc == FTB_RBR ? FT_TOKEN_RIGHT_PAREN : FT_TOKEN_LEFT_PAREN); |
148 | goto ret; |
149 | } |
150 | if (param->prev == ' ') |
151 | { |
152 | if (*doc == FTB_YES ) { param->yesno=+1; continue; } else |
153 | if (*doc == FTB_EGAL) { param->yesno= 0; continue; } else |
154 | if (*doc == FTB_NO ) { param->yesno=-1; continue; } else |
155 | if (*doc == FTB_INC ) { param->weight_adjust++; continue; } else |
156 | if (*doc == FTB_DEC ) { param->weight_adjust--; continue; } else |
157 | if (*doc == FTB_NEG ) { param->wasign= !param->wasign; continue; } |
158 | } |
159 | } |
160 | param->prev=*doc; |
161 | param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); |
162 | param->weight_adjust= param->wasign= 0; |
163 | } |
164 | |
165 | mwc=length=0; |
166 | for (word->pos= doc; doc < end; length++, |
167 | doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) |
168 | { |
169 | mbl= cs->cset->ctype(cs, &ctype, doc, end); |
170 | if (true_word_char(ctype, *doc)) |
171 | mwc=0; |
172 | else if (!misc_word_char(*doc) || mwc) |
173 | break; |
174 | else |
175 | mwc++; |
176 | } |
177 | param->prev='A'; /* be sure *prev is true_word_char */ |
178 | word->len= (uint)(doc-word->pos) - mwc; |
179 | if ((param->trunc=(doc<end && *doc == FTB_TRUNC))) |
180 | doc++; |
181 | |
182 | if (((length >= ft_min_word_len && !is_stopword((char *) word->pos, |
183 | word->len)) |
184 | || param->trunc) && length < ft_max_word_len) |
185 | { |
186 | *start=doc; |
187 | param->type= FT_TOKEN_WORD; |
188 | goto ret; |
189 | } |
190 | else if (length) /* make sure length > 0 (if start contains spaces only) */ |
191 | { |
192 | *start= doc; |
193 | param->type= FT_TOKEN_STOPWORD; |
194 | goto ret; |
195 | } |
196 | } |
197 | if (param->quot) |
198 | { |
199 | param->quot= (char *)(*start= doc); |
200 | param->type= 3; /* FT_RBR */ |
201 | goto ret; |
202 | } |
203 | ret: |
204 | return param->type; |
205 | } |
206 | |
207 | uchar maria_ft_simple_get_word(CHARSET_INFO *cs, uchar **start, |
208 | const uchar *end, FT_WORD *word, |
209 | my_bool skip_stopwords) |
210 | { |
211 | uchar *doc= *start; |
212 | uint mwc, length; |
213 | int ctype, mbl; |
214 | DBUG_ENTER("maria_ft_simple_get_word" ); |
215 | |
216 | do |
217 | { |
218 | for (;; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) |
219 | { |
220 | if (doc >= end) |
221 | DBUG_RETURN(0); |
222 | mbl= cs->cset->ctype(cs, &ctype, doc, end); |
223 | if (true_word_char(ctype, *doc)) |
224 | break; |
225 | } |
226 | |
227 | mwc= length= 0; |
228 | for (word->pos= doc; doc < end; length++, |
229 | doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) |
230 | { |
231 | mbl= cs->cset->ctype(cs, &ctype, doc, end); |
232 | if (true_word_char(ctype, *doc)) |
233 | mwc= 0; |
234 | else if (!misc_word_char(*doc) || mwc) |
235 | break; |
236 | else |
237 | mwc++; |
238 | } |
239 | |
240 | word->len= (uint)(doc-word->pos) - mwc; |
241 | |
242 | if (skip_stopwords == FALSE || |
243 | (length >= ft_min_word_len && length < ft_max_word_len && |
244 | !is_stopword((char *) word->pos, word->len))) |
245 | { |
246 | *start= doc; |
247 | DBUG_RETURN(1); |
248 | } |
249 | } while (doc < end); |
250 | DBUG_RETURN(0); |
251 | } |
252 | |
253 | void maria_ft_parse_init(TREE *wtree, CHARSET_INFO *cs) |
254 | { |
255 | DBUG_ENTER("maria_ft_parse_init" ); |
256 | if (!is_tree_inited(wtree)) |
257 | init_tree(wtree,0,0,sizeof(FT_WORD),(qsort_cmp2)&FT_WORD_cmp, NULL, |
258 | (void*) cs, MYF(0)); |
259 | DBUG_VOID_RETURN; |
260 | } |
261 | |
262 | |
263 | static int maria_ft_add_word(MYSQL_FTPARSER_PARAM *param, |
264 | const char *word, int word_len, |
265 | MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info |
266 | __attribute__((unused))) |
267 | { |
268 | TREE *wtree; |
269 | FT_WORD w; |
270 | MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam; |
271 | DBUG_ENTER("maria_ft_add_word" ); |
272 | wtree= ft_param->wtree; |
273 | if (param->flags & MYSQL_FTFLAGS_NEED_COPY) |
274 | { |
275 | uchar *ptr; |
276 | DBUG_ASSERT(wtree->with_delete == 0); |
277 | ptr= (uchar *)alloc_root(ft_param->mem_root, word_len); |
278 | memcpy(ptr, word, word_len); |
279 | w.pos= ptr; |
280 | } |
281 | else |
282 | w.pos= (uchar*) word; |
283 | w.len= word_len; |
284 | if (!tree_insert(wtree, &w, 0, wtree->custom_arg)) |
285 | { |
286 | delete_tree(wtree, 0); |
287 | DBUG_RETURN(1); |
288 | } |
289 | DBUG_RETURN(0); |
290 | } |
291 | |
292 | |
293 | static int maria_ft_parse_internal(MYSQL_FTPARSER_PARAM *param, |
294 | const char *doc_arg, |
295 | int doc_len) |
296 | { |
297 | uchar *doc= (uchar*) doc_arg; |
298 | uchar *end= doc + doc_len; |
299 | MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam; |
300 | TREE *wtree= ft_param->wtree; |
301 | FT_WORD w; |
302 | DBUG_ENTER("maria_ft_parse_internal" ); |
303 | |
304 | while (maria_ft_simple_get_word(wtree->custom_arg, &doc, end, &w, TRUE)) |
305 | if (param->mysql_add_word(param, (char*)w.pos, w.len, 0)) |
306 | DBUG_RETURN(1); |
307 | DBUG_RETURN(0); |
308 | } |
309 | |
310 | |
311 | int maria_ft_parse(TREE *wtree, uchar *doc, int doclen, |
312 | struct st_mysql_ftparser *parser, |
313 | MYSQL_FTPARSER_PARAM *param, MEM_ROOT *mem_root) |
314 | { |
315 | MY_FT_PARSER_PARAM my_param; |
316 | DBUG_ENTER("maria_ft_parse" ); |
317 | DBUG_ASSERT(parser); |
318 | my_param.wtree= wtree; |
319 | my_param.mem_root= mem_root; |
320 | |
321 | param->mysql_parse= maria_ft_parse_internal; |
322 | param->mysql_add_word= maria_ft_add_word; |
323 | param->mysql_ftparam= &my_param; |
324 | param->cs= wtree->custom_arg; |
325 | param->doc= (char*)doc; |
326 | param->length= doclen; |
327 | param->mode= MYSQL_FTPARSER_SIMPLE_MODE; |
328 | DBUG_RETURN(parser->parse(param)); |
329 | } |
330 | |
331 | |
332 | #define MAX_PARAM_NR 2 |
333 | |
334 | MYSQL_FTPARSER_PARAM* maria_ftparser_alloc_param(MARIA_HA *info) |
335 | { |
336 | if (!info->ftparser_param) |
337 | { |
338 | /* |
339 | . info->ftparser_param can not be zero after the initialization, |
340 | because it always includes built-in fulltext parser. And built-in |
341 | parser can be called even if the table has no fulltext indexes and |
342 | no varchar/text fields. |
343 | |
344 | ftb_find_relevance... parser (ftb_find_relevance_parse, |
345 | ftb_find_relevance_add_word) calls ftb_check_phrase... parser |
346 | (ftb_check_phrase_internal, ftb_phrase_add_word). Thus MAX_PARAM_NR=2. |
347 | */ |
348 | info->ftparser_param= (MYSQL_FTPARSER_PARAM *) |
349 | my_malloc(MAX_PARAM_NR * sizeof(MYSQL_FTPARSER_PARAM) * |
350 | info->s->ftkeys, MYF(MY_WME | MY_ZEROFILL)); |
351 | init_alloc_root(&info->ft_memroot, "fulltext_parser" , |
352 | FTPARSER_MEMROOT_ALLOC_SIZE, 0, MYF(0)); |
353 | } |
354 | return info->ftparser_param; |
355 | } |
356 | |
357 | |
358 | MYSQL_FTPARSER_PARAM *maria_ftparser_call_initializer(MARIA_HA *info, |
359 | uint keynr, uint paramnr) |
360 | { |
361 | uint32 ftparser_nr; |
362 | struct st_mysql_ftparser *parser; |
363 | |
364 | if (!maria_ftparser_alloc_param(info)) |
365 | return 0; |
366 | |
367 | if (keynr == NO_SUCH_KEY) |
368 | { |
369 | ftparser_nr= 0; |
370 | parser= &ft_default_parser; |
371 | } |
372 | else |
373 | { |
374 | ftparser_nr= info->s->keyinfo[keynr].ftkey_nr; |
375 | parser= info->s->keyinfo[keynr].parser; |
376 | } |
377 | DBUG_ASSERT(paramnr < MAX_PARAM_NR); |
378 | ftparser_nr= ftparser_nr*MAX_PARAM_NR + paramnr; |
379 | if (! info->ftparser_param[ftparser_nr].mysql_add_word) |
380 | { |
381 | /* Note, that mysql_add_word is used here as a flag: |
382 | mysql_add_word == 0 - parser is not initialized |
383 | mysql_add_word != 0 - parser is initialized, or no |
384 | initialization needed. */ |
385 | info->ftparser_param[ftparser_nr].mysql_add_word= |
386 | (int (*)(struct st_mysql_ftparser_param *, const char *, |
387 | int, MYSQL_FTPARSER_BOOLEAN_INFO *)) 1; |
388 | if (parser->init && parser->init(&info->ftparser_param[ftparser_nr])) |
389 | return 0; |
390 | } |
391 | return &info->ftparser_param[ftparser_nr]; |
392 | } |
393 | |
394 | |
395 | void maria_ftparser_call_deinitializer(MARIA_HA *info) |
396 | { |
397 | uint i, j, keys= info->s->state.header.keys; |
398 | free_root(&info->ft_memroot, MYF(0)); |
399 | if (! info->ftparser_param) |
400 | return; |
401 | for (i= 0; i < keys; i++) |
402 | { |
403 | MARIA_KEYDEF *keyinfo= &info->s->keyinfo[i]; |
404 | for (j=0; j < MAX_PARAM_NR; j++) |
405 | { |
406 | MYSQL_FTPARSER_PARAM *ftparser_param= |
407 | &info->ftparser_param[keyinfo->ftkey_nr*MAX_PARAM_NR + j]; |
408 | if (keyinfo->flag & HA_FULLTEXT && ftparser_param->mysql_add_word) |
409 | { |
410 | if (keyinfo->parser->deinit) |
411 | keyinfo->parser->deinit(ftparser_param); |
412 | ftparser_param->mysql_add_word= 0; |
413 | } |
414 | else |
415 | break; |
416 | } |
417 | } |
418 | } |
419 | |