| 1 | /* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. |
| 2 | |
| 3 | This program is free software; you can redistribute it and/or modify |
| 4 | it under the terms of the GNU General Public License as published by |
| 5 | the Free Software Foundation; version 2 of the License. |
| 6 | |
| 7 | This program is distributed in the hope that it will be useful, |
| 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 10 | GNU General Public License for more details. |
| 11 | |
| 12 | You should have received a copy of the GNU General Public License |
| 13 | along with this program; if not, write to the Free Software |
| 14 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ |
| 15 | |
| 16 | /* Written by Sergei A. Golubchik, who has a shared copyright to this code */ |
| 17 | |
| 18 | #include "ftdefs.h" |
| 19 | |
| 20 | typedef struct st_ft_docstat { |
| 21 | FT_WORD *list; |
| 22 | uint uniq; |
| 23 | double sum; |
| 24 | } FT_DOCSTAT; |
| 25 | |
| 26 | typedef struct st_my_ft_parser_param |
| 27 | { |
| 28 | TREE *wtree; |
| 29 | MEM_ROOT *mem_root; |
| 30 | } MY_FT_PARSER_PARAM; |
| 31 | |
| 32 | static int FT_WORD_cmp(CHARSET_INFO* cs, FT_WORD *w1, FT_WORD *w2) |
| 33 | { |
| 34 | return ha_compare_text(cs, (uchar*) w1->pos, w1->len, |
| 35 | (uchar*) w2->pos, w2->len, 0); |
| 36 | } |
| 37 | |
| 38 | static int walk_and_copy(FT_WORD *word,uint32 count,FT_DOCSTAT *docstat) |
| 39 | { |
| 40 | word->weight=LWS_IN_USE; |
| 41 | docstat->sum+=word->weight; |
| 42 | memcpy((docstat->list)++, word, sizeof(FT_WORD)); |
| 43 | return 0; |
| 44 | } |
| 45 | |
| 46 | /* transforms tree of words into the array, applying normalization */ |
| 47 | |
| 48 | FT_WORD * ft_linearize(TREE *wtree, MEM_ROOT *mem_root) |
| 49 | { |
| 50 | FT_WORD *wlist,*p; |
| 51 | FT_DOCSTAT docstat; |
| 52 | DBUG_ENTER("ft_linearize" ); |
| 53 | |
| 54 | if ((wlist=(FT_WORD *) alloc_root(mem_root, sizeof(FT_WORD)* |
| 55 | (1+wtree->elements_in_tree)))) |
| 56 | { |
| 57 | docstat.list=wlist; |
| 58 | docstat.uniq=wtree->elements_in_tree; |
| 59 | docstat.sum=0; |
| 60 | tree_walk(wtree,(tree_walk_action)&walk_and_copy,&docstat,left_root_right); |
| 61 | } |
| 62 | delete_tree(wtree, 0); |
| 63 | if (!wlist) |
| 64 | DBUG_RETURN(NULL); |
| 65 | |
| 66 | docstat.list->pos=NULL; |
| 67 | |
| 68 | for (p=wlist;p->pos;p++) |
| 69 | { |
| 70 | p->weight=PRENORM_IN_USE; |
| 71 | } |
| 72 | |
| 73 | for (p=wlist;p->pos;p++) |
| 74 | { |
| 75 | p->weight/=NORM_IN_USE; |
| 76 | } |
| 77 | |
| 78 | DBUG_RETURN(wlist); |
| 79 | } |
| 80 | |
| 81 | my_bool ft_boolean_check_syntax_string(const uchar *str) |
| 82 | { |
| 83 | uint i, j; |
| 84 | |
| 85 | if (!str || |
| 86 | (strlen((char*) str)+1 != sizeof(DEFAULT_FTB_SYNTAX)) || |
| 87 | (str[0] != ' ' && str[1] != ' ')) |
| 88 | return 1; |
| 89 | for (i=0; i<sizeof(DEFAULT_FTB_SYNTAX); i++) |
| 90 | { |
| 91 | /* limiting to 7-bit ascii only */ |
| 92 | if ((unsigned char)(str[i]) > 127 || my_isalnum(default_charset_info, str[i])) |
| 93 | return 1; |
| 94 | for (j=0; j<i; j++) |
| 95 | if (str[i] == str[j] && (i != 11 || j != 10)) |
| 96 | return 1; |
| 97 | } |
| 98 | return 0; |
| 99 | } |
| 100 | |
| 101 | /* |
| 102 | RETURN VALUE |
| 103 | 0 - eof |
| 104 | 1 - word found |
| 105 | 2 - left bracket |
| 106 | 3 - right bracket |
| 107 | 4 - stopword found |
| 108 | */ |
| 109 | uchar ft_get_word(CHARSET_INFO *cs, const uchar **start, const uchar *end, |
| 110 | FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param) |
| 111 | { |
| 112 | const uchar *doc=*start; |
| 113 | int ctype; |
| 114 | uint mwc, length; |
| 115 | int mbl; |
| 116 | |
| 117 | param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); |
| 118 | param->weight_adjust= param->wasign= 0; |
| 119 | param->type= FT_TOKEN_EOF; |
| 120 | |
| 121 | while (doc<end) |
| 122 | { |
| 123 | for (; doc < end; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) |
| 124 | { |
| 125 | mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end); |
| 126 | if (true_word_char(ctype, *doc)) |
| 127 | break; |
| 128 | if (*doc == FTB_RQUOT && param->quot) |
| 129 | { |
| 130 | *start=doc+1; |
| 131 | param->type= FT_TOKEN_RIGHT_PAREN; |
| 132 | goto ret; |
| 133 | } |
| 134 | if (!param->quot) |
| 135 | { |
| 136 | if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT) |
| 137 | { |
| 138 | /* param->prev=' '; */ |
| 139 | *start=doc+1; |
| 140 | if (*doc == FTB_LQUOT) |
| 141 | param->quot= (char*) 1; |
| 142 | param->type= (*doc == FTB_RBR ? FT_TOKEN_RIGHT_PAREN : FT_TOKEN_LEFT_PAREN); |
| 143 | goto ret; |
| 144 | } |
| 145 | if (param->prev == ' ') |
| 146 | { |
| 147 | if (*doc == FTB_YES ) { param->yesno=+1; continue; } else |
| 148 | if (*doc == FTB_EGAL) { param->yesno= 0; continue; } else |
| 149 | if (*doc == FTB_NO ) { param->yesno=-1; continue; } else |
| 150 | if (*doc == FTB_INC ) { param->weight_adjust++; continue; } else |
| 151 | if (*doc == FTB_DEC ) { param->weight_adjust--; continue; } else |
| 152 | if (*doc == FTB_NEG ) { param->wasign= !param->wasign; continue; } |
| 153 | } |
| 154 | } |
| 155 | param->prev=*doc; |
| 156 | param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); |
| 157 | param->weight_adjust= param->wasign= 0; |
| 158 | } |
| 159 | |
| 160 | mwc=length=0; |
| 161 | for (word->pos= doc; doc < end; length++, |
| 162 | doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) |
| 163 | { |
| 164 | mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end); |
| 165 | if (true_word_char(ctype, *doc)) |
| 166 | mwc=0; |
| 167 | else if (!misc_word_char(*doc) || mwc) |
| 168 | break; |
| 169 | else |
| 170 | mwc++; |
| 171 | } |
| 172 | param->prev='A'; /* be sure *prev is true_word_char */ |
| 173 | word->len= (uint)(doc-word->pos) - mwc; |
| 174 | if ((param->trunc=(doc<end && *doc == FTB_TRUNC))) |
| 175 | doc++; |
| 176 | |
| 177 | if (((length >= ft_min_word_len && !is_stopword((char*) word->pos, |
| 178 | word->len)) |
| 179 | || param->trunc) && length < ft_max_word_len) |
| 180 | { |
| 181 | *start=doc; |
| 182 | param->type= FT_TOKEN_WORD; |
| 183 | goto ret; |
| 184 | } |
| 185 | else if (length) /* make sure length > 0 (if start contains spaces only) */ |
| 186 | { |
| 187 | *start= doc; |
| 188 | param->type= FT_TOKEN_STOPWORD; |
| 189 | goto ret; |
| 190 | } |
| 191 | } |
| 192 | if (param->quot) |
| 193 | { |
| 194 | *start= doc; |
| 195 | param->type= 3; /* FT_RBR */ |
| 196 | goto ret; |
| 197 | } |
| 198 | ret: |
| 199 | return param->type; |
| 200 | } |
| 201 | |
| 202 | uchar ft_simple_get_word(CHARSET_INFO *cs, uchar **start, const uchar *end, |
| 203 | FT_WORD *word, my_bool skip_stopwords) |
| 204 | { |
| 205 | uchar *doc= *start; |
| 206 | uint mwc, length; |
| 207 | int mbl; |
| 208 | int ctype; |
| 209 | DBUG_ENTER("ft_simple_get_word" ); |
| 210 | |
| 211 | do |
| 212 | { |
| 213 | for (;; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) |
| 214 | { |
| 215 | if (doc >= end) |
| 216 | DBUG_RETURN(0); |
| 217 | mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end); |
| 218 | if (true_word_char(ctype, *doc)) |
| 219 | break; |
| 220 | } |
| 221 | |
| 222 | mwc= length= 0; |
| 223 | for (word->pos= doc; doc < end; length++, |
| 224 | doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) |
| 225 | { |
| 226 | mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end); |
| 227 | if (true_word_char(ctype, *doc)) |
| 228 | mwc= 0; |
| 229 | else if (!misc_word_char(*doc) || mwc) |
| 230 | break; |
| 231 | else |
| 232 | mwc++; |
| 233 | } |
| 234 | |
| 235 | word->len= (uint)(doc-word->pos) - mwc; |
| 236 | |
| 237 | if (skip_stopwords == FALSE || |
| 238 | (length >= ft_min_word_len && length < ft_max_word_len && |
| 239 | !is_stopword((char*) word->pos, word->len))) |
| 240 | { |
| 241 | *start= doc; |
| 242 | DBUG_RETURN(1); |
| 243 | } |
| 244 | } while (doc < end); |
| 245 | DBUG_RETURN(0); |
| 246 | } |
| 247 | |
| 248 | void ft_parse_init(TREE *wtree, CHARSET_INFO *cs) |
| 249 | { |
| 250 | DBUG_ENTER("ft_parse_init" ); |
| 251 | if (!is_tree_inited(wtree)) |
| 252 | init_tree(wtree, 0, 0, sizeof(FT_WORD), (qsort_cmp2)&FT_WORD_cmp, 0, |
| 253 | (void*)cs, MYF(0)); |
| 254 | DBUG_VOID_RETURN; |
| 255 | } |
| 256 | |
| 257 | |
| 258 | static int ft_add_word(MYSQL_FTPARSER_PARAM *param, |
| 259 | const char *word, int word_len, |
| 260 | MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info __attribute__((unused))) |
| 261 | { |
| 262 | TREE *wtree; |
| 263 | FT_WORD w; |
| 264 | MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam; |
| 265 | DBUG_ENTER("ft_add_word" ); |
| 266 | wtree= ft_param->wtree; |
| 267 | if (param->flags & MYSQL_FTFLAGS_NEED_COPY) |
| 268 | { |
| 269 | uchar *ptr; |
| 270 | DBUG_ASSERT(wtree->with_delete == 0); |
| 271 | ptr= (uchar *)alloc_root(ft_param->mem_root, word_len); |
| 272 | memcpy(ptr, word, word_len); |
| 273 | w.pos= ptr; |
| 274 | } |
| 275 | else |
| 276 | w.pos= (uchar*) word; |
| 277 | w.len= word_len; |
| 278 | if (!tree_insert(wtree, &w, 0, wtree->custom_arg)) |
| 279 | { |
| 280 | delete_tree(wtree, 0); |
| 281 | DBUG_RETURN(1); |
| 282 | } |
| 283 | DBUG_RETURN(0); |
| 284 | } |
| 285 | |
| 286 | |
| 287 | static int ft_parse_internal(MYSQL_FTPARSER_PARAM *param, |
| 288 | const char *doc_arg, int doc_len) |
| 289 | { |
| 290 | uchar *doc= (uchar*) doc_arg; |
| 291 | uchar *end= doc + doc_len; |
| 292 | MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam; |
| 293 | TREE *wtree= ft_param->wtree; |
| 294 | FT_WORD w; |
| 295 | DBUG_ENTER("ft_parse_internal" ); |
| 296 | |
| 297 | while (ft_simple_get_word(wtree->custom_arg, &doc, end, &w, TRUE)) |
| 298 | if (param->mysql_add_word(param, (char*) w.pos, (int)w.len, 0)) |
| 299 | DBUG_RETURN(1); |
| 300 | DBUG_RETURN(0); |
| 301 | } |
| 302 | |
| 303 | |
| 304 | int ft_parse(TREE *wtree, uchar *doc, int doclen, |
| 305 | struct st_mysql_ftparser *parser, |
| 306 | MYSQL_FTPARSER_PARAM *param, MEM_ROOT *mem_root) |
| 307 | { |
| 308 | MY_FT_PARSER_PARAM my_param; |
| 309 | DBUG_ENTER("ft_parse" ); |
| 310 | DBUG_ASSERT(parser); |
| 311 | |
| 312 | my_param.wtree= wtree; |
| 313 | my_param.mem_root= mem_root; |
| 314 | |
| 315 | param->mysql_parse= ft_parse_internal; |
| 316 | param->mysql_add_word= ft_add_word; |
| 317 | param->mysql_ftparam= &my_param; |
| 318 | param->cs= wtree->custom_arg; |
| 319 | param->doc= (char*) doc; |
| 320 | param->length= doclen; |
| 321 | param->mode= MYSQL_FTPARSER_SIMPLE_MODE; |
| 322 | DBUG_RETURN(parser->parse(param)); |
| 323 | } |
| 324 | |
| 325 | |
| 326 | #define MAX_PARAM_NR 2 |
| 327 | |
| 328 | MYSQL_FTPARSER_PARAM* ftparser_alloc_param(MI_INFO *info) |
| 329 | { |
| 330 | if (!info->ftparser_param) |
| 331 | { |
| 332 | /* |
| 333 | . info->ftparser_param can not be zero after the initialization, |
| 334 | because it always includes built-in fulltext parser. And built-in |
| 335 | parser can be called even if the table has no fulltext indexes and |
| 336 | no varchar/text fields. |
| 337 | |
| 338 | ftb_find_relevance... parser (ftb_find_relevance_parse, |
| 339 | ftb_find_relevance_add_word) calls ftb_check_phrase... parser |
| 340 | (ftb_check_phrase_internal, ftb_phrase_add_word). Thus MAX_PARAM_NR=2. |
| 341 | */ |
| 342 | info->ftparser_param= (MYSQL_FTPARSER_PARAM *) |
| 343 | my_malloc(MAX_PARAM_NR * sizeof(MYSQL_FTPARSER_PARAM) * |
| 344 | info->s->ftkeys, MYF(MY_WME | MY_ZEROFILL)); |
| 345 | init_alloc_root(&info->ft_memroot, "fulltext_parser" , |
| 346 | FTPARSER_MEMROOT_ALLOC_SIZE, 0, MYF(0)); |
| 347 | } |
| 348 | return info->ftparser_param; |
| 349 | } |
| 350 | |
| 351 | |
| 352 | MYSQL_FTPARSER_PARAM *ftparser_call_initializer(MI_INFO *info, |
| 353 | uint keynr, uint paramnr) |
| 354 | { |
| 355 | uint32 ftparser_nr; |
| 356 | struct st_mysql_ftparser *parser; |
| 357 | |
| 358 | if (!ftparser_alloc_param(info)) |
| 359 | return 0; |
| 360 | |
| 361 | if (keynr == NO_SUCH_KEY) |
| 362 | { |
| 363 | ftparser_nr= 0; |
| 364 | parser= &ft_default_parser; |
| 365 | } |
| 366 | else |
| 367 | { |
| 368 | ftparser_nr= info->s->keyinfo[keynr].ftkey_nr; |
| 369 | parser= info->s->keyinfo[keynr].parser; |
| 370 | } |
| 371 | DBUG_ASSERT(paramnr < MAX_PARAM_NR); |
| 372 | ftparser_nr= ftparser_nr*MAX_PARAM_NR + paramnr; |
| 373 | if (! info->ftparser_param[ftparser_nr].mysql_add_word) |
| 374 | { |
| 375 | /* Note, that mysql_add_word is used here as a flag: |
| 376 | mysql_add_word == 0 - parser is not initialized |
| 377 | mysql_add_word != 0 - parser is initialized, or no |
| 378 | initialization needed. */ |
| 379 | info->ftparser_param[ftparser_nr].mysql_add_word= |
| 380 | (int (*)(struct st_mysql_ftparser_param *, const char *, int, |
| 381 | MYSQL_FTPARSER_BOOLEAN_INFO *)) 1; |
| 382 | if (parser->init && parser->init(&info->ftparser_param[ftparser_nr])) |
| 383 | return 0; |
| 384 | } |
| 385 | return &info->ftparser_param[ftparser_nr]; |
| 386 | } |
| 387 | |
| 388 | void ftparser_call_deinitializer(MI_INFO *info) |
| 389 | { |
| 390 | uint i, j, keys= info->s->state.header.keys; |
| 391 | free_root(&info->ft_memroot, MYF(0)); |
| 392 | if (! info->ftparser_param) |
| 393 | return; |
| 394 | for (i= 0; i < keys; i++) |
| 395 | { |
| 396 | MI_KEYDEF *keyinfo= &info->s->keyinfo[i]; |
| 397 | for (j=0; j < MAX_PARAM_NR; j++) |
| 398 | { |
| 399 | MYSQL_FTPARSER_PARAM *ftparser_param= |
| 400 | &info->ftparser_param[keyinfo->ftkey_nr * MAX_PARAM_NR + j]; |
| 401 | if (keyinfo->flag & HA_FULLTEXT && ftparser_param->mysql_add_word) |
| 402 | { |
| 403 | if (keyinfo->parser->deinit) |
| 404 | keyinfo->parser->deinit(ftparser_param); |
| 405 | ftparser_param->mysql_add_word= 0; |
| 406 | } |
| 407 | else |
| 408 | break; |
| 409 | } |
| 410 | } |
| 411 | } |
| 412 | |
| 413 | |