| 1 | /* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB |
| 2 | |
| 3 | This program is free software; you can redistribute it and/or modify |
| 4 | it under the terms of the GNU General Public License as published by |
| 5 | the Free Software Foundation; version 2 of the License. |
| 6 | |
| 7 | This program is distributed in the hope that it will be useful, |
| 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 10 | GNU General Public License for more details. |
| 11 | |
| 12 | You should have received a copy of the GNU General Public License |
| 13 | along with this program; if not, write to the Free Software |
| 14 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */ |
| 15 | |
| 16 | /* Written by Sergei A. Golubchik, who has a shared copyright to this code */ |
| 17 | |
| 18 | #include "ma_ftdefs.h" |
| 19 | |
| 20 | typedef struct st_maria_ft_docstat { |
| 21 | FT_WORD *list; |
| 22 | uint uniq; |
| 23 | double sum; |
| 24 | } FT_DOCSTAT; |
| 25 | |
| 26 | |
| 27 | typedef struct st_my_maria_ft_parser_param |
| 28 | { |
| 29 | TREE *wtree; |
| 30 | MEM_ROOT *mem_root; |
| 31 | } MY_FT_PARSER_PARAM; |
| 32 | |
| 33 | |
| 34 | static int FT_WORD_cmp(CHARSET_INFO* cs, FT_WORD *w1, FT_WORD *w2) |
| 35 | { |
| 36 | return ha_compare_text(cs, (uchar*) w1->pos, w1->len, |
| 37 | (uchar*) w2->pos, w2->len, 0); |
| 38 | } |
| 39 | |
| 40 | static int walk_and_copy(FT_WORD *word,uint32 count,FT_DOCSTAT *docstat) |
| 41 | { |
| 42 | word->weight=LWS_IN_USE; |
| 43 | docstat->sum+=word->weight; |
| 44 | memcpy((docstat->list)++, word, sizeof(FT_WORD)); |
| 45 | return 0; |
| 46 | } |
| 47 | |
| 48 | /* transforms tree of words into the array, applying normalization */ |
| 49 | |
| 50 | FT_WORD * maria_ft_linearize(TREE *wtree, MEM_ROOT *mem_root) |
| 51 | { |
| 52 | FT_WORD *wlist,*p; |
| 53 | FT_DOCSTAT docstat; |
| 54 | DBUG_ENTER("maria_ft_linearize" ); |
| 55 | |
| 56 | if ((wlist=(FT_WORD *) alloc_root(mem_root, sizeof(FT_WORD)* |
| 57 | (1+wtree->elements_in_tree)))) |
| 58 | { |
| 59 | docstat.list=wlist; |
| 60 | docstat.uniq=wtree->elements_in_tree; |
| 61 | docstat.sum=0; |
| 62 | tree_walk(wtree,(tree_walk_action)&walk_and_copy,&docstat,left_root_right); |
| 63 | } |
| 64 | delete_tree(wtree, 0); |
| 65 | if (!wlist) |
| 66 | DBUG_RETURN(NULL); |
| 67 | |
| 68 | docstat.list->pos=NULL; |
| 69 | |
| 70 | for (p=wlist;p->pos;p++) |
| 71 | { |
| 72 | p->weight=PRENORM_IN_USE; |
| 73 | } |
| 74 | |
| 75 | for (p=wlist;p->pos;p++) |
| 76 | { |
| 77 | p->weight/=NORM_IN_USE; |
| 78 | } |
| 79 | |
| 80 | DBUG_RETURN(wlist); |
| 81 | } |
| 82 | |
| 83 | my_bool maria_ft_boolean_check_syntax_string(const uchar *str) |
| 84 | { |
| 85 | uint i, j; |
| 86 | |
| 87 | if (!str || |
| 88 | (strlen((const char *) str) + 1 != sizeof(ft_boolean_syntax)) || |
| 89 | (str[0] != ' ' && str[1] != ' ')) |
| 90 | return 1; |
| 91 | for (i=0; i<sizeof(ft_boolean_syntax); i++) |
| 92 | { |
| 93 | /* limiting to 7-bit ascii only */ |
| 94 | if ((unsigned char)(str[i]) > 127 || |
| 95 | my_isalnum(default_charset_info, str[i])) |
| 96 | return 1; |
| 97 | for (j=0; j<i; j++) |
| 98 | if (str[i] == str[j] && (i != 11 || j != 10)) |
| 99 | return 1; |
| 100 | } |
| 101 | return 0; |
| 102 | } |
| 103 | |
| 104 | /* |
| 105 | RETURN VALUE |
| 106 | 0 - eof |
| 107 | 1 - word found |
| 108 | 2 - left bracket |
| 109 | 3 - right bracket |
| 110 | 4 - stopword found |
| 111 | */ |
| 112 | uchar maria_ft_get_word(CHARSET_INFO *cs, const uchar **start, |
| 113 | const uchar *end, |
| 114 | FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param) |
| 115 | { |
| 116 | const uchar *doc= *start; |
| 117 | int ctype; |
| 118 | uint mwc, length; |
| 119 | int mbl; |
| 120 | |
| 121 | param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); |
| 122 | param->weight_adjust= param->wasign= 0; |
| 123 | param->type= FT_TOKEN_EOF; |
| 124 | |
| 125 | while (doc<end) |
| 126 | { |
| 127 | for (; doc < end; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) |
| 128 | { |
| 129 | mbl= cs->cset->ctype(cs, &ctype, doc, end); |
| 130 | if (true_word_char(ctype, *doc)) |
| 131 | break; |
| 132 | if (*doc == FTB_RQUOT && param->quot) |
| 133 | { |
| 134 | param->quot= (char *) doc; |
| 135 | *start=doc+1; |
| 136 | param->type= FT_TOKEN_RIGHT_PAREN; |
| 137 | goto ret; |
| 138 | } |
| 139 | if (!param->quot) |
| 140 | { |
| 141 | if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT) |
| 142 | { |
| 143 | /* param->prev=' '; */ |
| 144 | *start=doc+1; |
| 145 | if (*doc == FTB_LQUOT) |
| 146 | param->quot= (char *) *start; |
| 147 | param->type= (*doc == FTB_RBR ? FT_TOKEN_RIGHT_PAREN : FT_TOKEN_LEFT_PAREN); |
| 148 | goto ret; |
| 149 | } |
| 150 | if (param->prev == ' ') |
| 151 | { |
| 152 | if (*doc == FTB_YES ) { param->yesno=+1; continue; } else |
| 153 | if (*doc == FTB_EGAL) { param->yesno= 0; continue; } else |
| 154 | if (*doc == FTB_NO ) { param->yesno=-1; continue; } else |
| 155 | if (*doc == FTB_INC ) { param->weight_adjust++; continue; } else |
| 156 | if (*doc == FTB_DEC ) { param->weight_adjust--; continue; } else |
| 157 | if (*doc == FTB_NEG ) { param->wasign= !param->wasign; continue; } |
| 158 | } |
| 159 | } |
| 160 | param->prev=*doc; |
| 161 | param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); |
| 162 | param->weight_adjust= param->wasign= 0; |
| 163 | } |
| 164 | |
| 165 | mwc=length=0; |
| 166 | for (word->pos= doc; doc < end; length++, |
| 167 | doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) |
| 168 | { |
| 169 | mbl= cs->cset->ctype(cs, &ctype, doc, end); |
| 170 | if (true_word_char(ctype, *doc)) |
| 171 | mwc=0; |
| 172 | else if (!misc_word_char(*doc) || mwc) |
| 173 | break; |
| 174 | else |
| 175 | mwc++; |
| 176 | } |
| 177 | param->prev='A'; /* be sure *prev is true_word_char */ |
| 178 | word->len= (uint)(doc-word->pos) - mwc; |
| 179 | if ((param->trunc=(doc<end && *doc == FTB_TRUNC))) |
| 180 | doc++; |
| 181 | |
| 182 | if (((length >= ft_min_word_len && !is_stopword((char *) word->pos, |
| 183 | word->len)) |
| 184 | || param->trunc) && length < ft_max_word_len) |
| 185 | { |
| 186 | *start=doc; |
| 187 | param->type= FT_TOKEN_WORD; |
| 188 | goto ret; |
| 189 | } |
| 190 | else if (length) /* make sure length > 0 (if start contains spaces only) */ |
| 191 | { |
| 192 | *start= doc; |
| 193 | param->type= FT_TOKEN_STOPWORD; |
| 194 | goto ret; |
| 195 | } |
| 196 | } |
| 197 | if (param->quot) |
| 198 | { |
| 199 | param->quot= (char *)(*start= doc); |
| 200 | param->type= 3; /* FT_RBR */ |
| 201 | goto ret; |
| 202 | } |
| 203 | ret: |
| 204 | return param->type; |
| 205 | } |
| 206 | |
| 207 | uchar maria_ft_simple_get_word(CHARSET_INFO *cs, uchar **start, |
| 208 | const uchar *end, FT_WORD *word, |
| 209 | my_bool skip_stopwords) |
| 210 | { |
| 211 | uchar *doc= *start; |
| 212 | uint mwc, length; |
| 213 | int ctype, mbl; |
| 214 | DBUG_ENTER("maria_ft_simple_get_word" ); |
| 215 | |
| 216 | do |
| 217 | { |
| 218 | for (;; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) |
| 219 | { |
| 220 | if (doc >= end) |
| 221 | DBUG_RETURN(0); |
| 222 | mbl= cs->cset->ctype(cs, &ctype, doc, end); |
| 223 | if (true_word_char(ctype, *doc)) |
| 224 | break; |
| 225 | } |
| 226 | |
| 227 | mwc= length= 0; |
| 228 | for (word->pos= doc; doc < end; length++, |
| 229 | doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) |
| 230 | { |
| 231 | mbl= cs->cset->ctype(cs, &ctype, doc, end); |
| 232 | if (true_word_char(ctype, *doc)) |
| 233 | mwc= 0; |
| 234 | else if (!misc_word_char(*doc) || mwc) |
| 235 | break; |
| 236 | else |
| 237 | mwc++; |
| 238 | } |
| 239 | |
| 240 | word->len= (uint)(doc-word->pos) - mwc; |
| 241 | |
| 242 | if (skip_stopwords == FALSE || |
| 243 | (length >= ft_min_word_len && length < ft_max_word_len && |
| 244 | !is_stopword((char *) word->pos, word->len))) |
| 245 | { |
| 246 | *start= doc; |
| 247 | DBUG_RETURN(1); |
| 248 | } |
| 249 | } while (doc < end); |
| 250 | DBUG_RETURN(0); |
| 251 | } |
| 252 | |
| 253 | void maria_ft_parse_init(TREE *wtree, CHARSET_INFO *cs) |
| 254 | { |
| 255 | DBUG_ENTER("maria_ft_parse_init" ); |
| 256 | if (!is_tree_inited(wtree)) |
| 257 | init_tree(wtree,0,0,sizeof(FT_WORD),(qsort_cmp2)&FT_WORD_cmp, NULL, |
| 258 | (void*) cs, MYF(0)); |
| 259 | DBUG_VOID_RETURN; |
| 260 | } |
| 261 | |
| 262 | |
| 263 | static int maria_ft_add_word(MYSQL_FTPARSER_PARAM *param, |
| 264 | const char *word, int word_len, |
| 265 | MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info |
| 266 | __attribute__((unused))) |
| 267 | { |
| 268 | TREE *wtree; |
| 269 | FT_WORD w; |
| 270 | MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam; |
| 271 | DBUG_ENTER("maria_ft_add_word" ); |
| 272 | wtree= ft_param->wtree; |
| 273 | if (param->flags & MYSQL_FTFLAGS_NEED_COPY) |
| 274 | { |
| 275 | uchar *ptr; |
| 276 | DBUG_ASSERT(wtree->with_delete == 0); |
| 277 | ptr= (uchar *)alloc_root(ft_param->mem_root, word_len); |
| 278 | memcpy(ptr, word, word_len); |
| 279 | w.pos= ptr; |
| 280 | } |
| 281 | else |
| 282 | w.pos= (uchar*) word; |
| 283 | w.len= word_len; |
| 284 | if (!tree_insert(wtree, &w, 0, wtree->custom_arg)) |
| 285 | { |
| 286 | delete_tree(wtree, 0); |
| 287 | DBUG_RETURN(1); |
| 288 | } |
| 289 | DBUG_RETURN(0); |
| 290 | } |
| 291 | |
| 292 | |
| 293 | static int maria_ft_parse_internal(MYSQL_FTPARSER_PARAM *param, |
| 294 | const char *doc_arg, |
| 295 | int doc_len) |
| 296 | { |
| 297 | uchar *doc= (uchar*) doc_arg; |
| 298 | uchar *end= doc + doc_len; |
| 299 | MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam; |
| 300 | TREE *wtree= ft_param->wtree; |
| 301 | FT_WORD w; |
| 302 | DBUG_ENTER("maria_ft_parse_internal" ); |
| 303 | |
| 304 | while (maria_ft_simple_get_word(wtree->custom_arg, &doc, end, &w, TRUE)) |
| 305 | if (param->mysql_add_word(param, (char*)w.pos, w.len, 0)) |
| 306 | DBUG_RETURN(1); |
| 307 | DBUG_RETURN(0); |
| 308 | } |
| 309 | |
| 310 | |
| 311 | int maria_ft_parse(TREE *wtree, uchar *doc, int doclen, |
| 312 | struct st_mysql_ftparser *parser, |
| 313 | MYSQL_FTPARSER_PARAM *param, MEM_ROOT *mem_root) |
| 314 | { |
| 315 | MY_FT_PARSER_PARAM my_param; |
| 316 | DBUG_ENTER("maria_ft_parse" ); |
| 317 | DBUG_ASSERT(parser); |
| 318 | my_param.wtree= wtree; |
| 319 | my_param.mem_root= mem_root; |
| 320 | |
| 321 | param->mysql_parse= maria_ft_parse_internal; |
| 322 | param->mysql_add_word= maria_ft_add_word; |
| 323 | param->mysql_ftparam= &my_param; |
| 324 | param->cs= wtree->custom_arg; |
| 325 | param->doc= (char*)doc; |
| 326 | param->length= doclen; |
| 327 | param->mode= MYSQL_FTPARSER_SIMPLE_MODE; |
| 328 | DBUG_RETURN(parser->parse(param)); |
| 329 | } |
| 330 | |
| 331 | |
| 332 | #define MAX_PARAM_NR 2 |
| 333 | |
| 334 | MYSQL_FTPARSER_PARAM* maria_ftparser_alloc_param(MARIA_HA *info) |
| 335 | { |
| 336 | if (!info->ftparser_param) |
| 337 | { |
| 338 | /* |
| 339 | . info->ftparser_param can not be zero after the initialization, |
| 340 | because it always includes built-in fulltext parser. And built-in |
| 341 | parser can be called even if the table has no fulltext indexes and |
| 342 | no varchar/text fields. |
| 343 | |
| 344 | ftb_find_relevance... parser (ftb_find_relevance_parse, |
| 345 | ftb_find_relevance_add_word) calls ftb_check_phrase... parser |
| 346 | (ftb_check_phrase_internal, ftb_phrase_add_word). Thus MAX_PARAM_NR=2. |
| 347 | */ |
| 348 | info->ftparser_param= (MYSQL_FTPARSER_PARAM *) |
| 349 | my_malloc(MAX_PARAM_NR * sizeof(MYSQL_FTPARSER_PARAM) * |
| 350 | info->s->ftkeys, MYF(MY_WME | MY_ZEROFILL)); |
| 351 | init_alloc_root(&info->ft_memroot, "fulltext_parser" , |
| 352 | FTPARSER_MEMROOT_ALLOC_SIZE, 0, MYF(0)); |
| 353 | } |
| 354 | return info->ftparser_param; |
| 355 | } |
| 356 | |
| 357 | |
| 358 | MYSQL_FTPARSER_PARAM *maria_ftparser_call_initializer(MARIA_HA *info, |
| 359 | uint keynr, uint paramnr) |
| 360 | { |
| 361 | uint32 ftparser_nr; |
| 362 | struct st_mysql_ftparser *parser; |
| 363 | |
| 364 | if (!maria_ftparser_alloc_param(info)) |
| 365 | return 0; |
| 366 | |
| 367 | if (keynr == NO_SUCH_KEY) |
| 368 | { |
| 369 | ftparser_nr= 0; |
| 370 | parser= &ft_default_parser; |
| 371 | } |
| 372 | else |
| 373 | { |
| 374 | ftparser_nr= info->s->keyinfo[keynr].ftkey_nr; |
| 375 | parser= info->s->keyinfo[keynr].parser; |
| 376 | } |
| 377 | DBUG_ASSERT(paramnr < MAX_PARAM_NR); |
| 378 | ftparser_nr= ftparser_nr*MAX_PARAM_NR + paramnr; |
| 379 | if (! info->ftparser_param[ftparser_nr].mysql_add_word) |
| 380 | { |
| 381 | /* Note, that mysql_add_word is used here as a flag: |
| 382 | mysql_add_word == 0 - parser is not initialized |
| 383 | mysql_add_word != 0 - parser is initialized, or no |
| 384 | initialization needed. */ |
| 385 | info->ftparser_param[ftparser_nr].mysql_add_word= |
| 386 | (int (*)(struct st_mysql_ftparser_param *, const char *, |
| 387 | int, MYSQL_FTPARSER_BOOLEAN_INFO *)) 1; |
| 388 | if (parser->init && parser->init(&info->ftparser_param[ftparser_nr])) |
| 389 | return 0; |
| 390 | } |
| 391 | return &info->ftparser_param[ftparser_nr]; |
| 392 | } |
| 393 | |
| 394 | |
| 395 | void maria_ftparser_call_deinitializer(MARIA_HA *info) |
| 396 | { |
| 397 | uint i, j, keys= info->s->state.header.keys; |
| 398 | free_root(&info->ft_memroot, MYF(0)); |
| 399 | if (! info->ftparser_param) |
| 400 | return; |
| 401 | for (i= 0; i < keys; i++) |
| 402 | { |
| 403 | MARIA_KEYDEF *keyinfo= &info->s->keyinfo[i]; |
| 404 | for (j=0; j < MAX_PARAM_NR; j++) |
| 405 | { |
| 406 | MYSQL_FTPARSER_PARAM *ftparser_param= |
| 407 | &info->ftparser_param[keyinfo->ftkey_nr*MAX_PARAM_NR + j]; |
| 408 | if (keyinfo->flag & HA_FULLTEXT && ftparser_param->mysql_add_word) |
| 409 | { |
| 410 | if (keyinfo->parser->deinit) |
| 411 | keyinfo->parser->deinit(ftparser_param); |
| 412 | ftparser_param->mysql_add_word= 0; |
| 413 | } |
| 414 | else |
| 415 | break; |
| 416 | } |
| 417 | } |
| 418 | } |
| 419 | |