| 1 | /* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB | 
| 2 |  | 
| 3 |    This program is free software; you can redistribute it and/or modify | 
| 4 |    it under the terms of the GNU General Public License as published by | 
| 5 |    the Free Software Foundation; version 2 of the License. | 
| 6 |  | 
| 7 |    This program is distributed in the hope that it will be useful, | 
| 8 |    but WITHOUT ANY WARRANTY; without even the implied warranty of | 
| 9 |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
| 10 |    GNU General Public License for more details. | 
| 11 |  | 
| 12 |    You should have received a copy of the GNU General Public License | 
| 13 |    along with this program; if not, write to the Free Software | 
| 14 |    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */ | 
| 15 |  | 
| 16 | /* Written by Sergei A. Golubchik, who has a shared copyright to this code */ | 
| 17 |  | 
| 18 | /* some definitions for full-text indices */ | 
| 19 |  | 
| 20 | #include "ma_fulltext.h" | 
| 21 | #include <m_ctype.h> | 
| 22 | #include <my_tree.h> | 
| 23 | #include <queues.h> | 
| 24 | #include <mysql/plugin.h> | 
| 25 |  | 
| 26 | #define true_word_char(ctype, character) /* 1 when ctype has any of the _MY_U, _MY_L or _MY_NMR bits set, or character is an underscore; otherwise 0 */ \ | 
| 27 |                       ((ctype) & (_MY_U | _MY_L | _MY_NMR) || \ | 
| 28 |                        (character) == '_') | 
| 29 | #define misc_word_char(X)	0 /* always expands to 0; the argument X is ignored */ | 
| 30 |  | 
| 31 | #define FT_MAX_WORD_LEN_FOR_SORT 31 /* presumably an upper bound on word length used when sorting; units not shown here - confirm at use sites */ | 
| 32 |  | 
| 33 | #define FTPARSER_MEMROOT_ALLOC_SIZE 65536 /* 64 KiB; presumably the block size for the parser's MEM_ROOT - confirm at use sites */ | 
| 34 |  | 
| 35 | #define COMPILE_STOPWORDS_IN /* defined with no value; presumably tested with #ifdef to build in a stopword list - verify */ | 
| 36 |  | 
| 37 | /* Interested readers may consult SMART | 
| 38 |    (ftp://ftp.cs.cornell.edu/pub/smart/smart.11.0.tar.Z) | 
| 39 |    for an excellent implementation of the vector space model we use. | 
| 40 |    It also demonstrates the usage of different weighting techniques. | 
| 41 |    This code, though, is completely original and is not based on the | 
| 42 |    SMART code but was in some cases inspired by it. | 
| 43 |  | 
| 44 |    NORM_PIVOT was taken from the article | 
| 45 |    A.Singhal, C.Buckley, M.Mitra, "Pivoted Document Length Normalization", | 
| 46 |    ACM SIGIR'96, 21-29, 1996 | 
| 47 |  */ | 
| 48 |  | 
| 49 | #define LWS_FOR_QUERY					  LWS_TF /* selects the LWS_TF formula; the name suggests it is applied to query terms - confirm at use site */ | 
| 50 | #define LWS_IN_USE					 LWS_LOG /* selects the LWS_LOG formula */ | 
| 51 | #define PRENORM_IN_USE				     PRENORM_AVG /* selects the PRENORM_AVG formula */ | 
| 52 | #define NORM_IN_USE				      NORM_PIVOT /* selects the NORM_PIVOT formula */ | 
| 53 | #define GWS_IN_USE					GWS_PROB /* selects the GWS_PROB formula */ | 
| 54 | /*==============================================================*/ | 
| 55 | #define LWS_TF						  (count) /* the value of count itself; a variable named count must be in scope where any LWS_* macro is expanded */ | 
| 56 | #define LWS_BINARY					(count>0) /* 1 when count is positive, otherwise 0 */ | 
| 57 | #define LWS_SQUARE				    (count*count) /* count squared */ | 
| 58 | #define LWS_LOG				 (count?(log( (double) count)+1):0) /* log(count)+1, or 0 when count is 0 (avoids log(0)) */ | 
| 59 | /*--------------------------------------------------------------*/ | 
| 60 | #define PRENORM_NONE				      (p->weight) /* weight unchanged; all PRENORM_* macros expect p (and, except this one, docstat) in scope at the expansion site */ | 
| 61 | #define PRENORM_MAX			  (p->weight/docstat.max) /* weight divided by docstat.max */ | 
| 62 | #define PRENORM_AUG		  (0.4+0.6*p->weight/docstat.max) /* 0.4 plus 0.6 times weight/docstat.max */ | 
| 63 | #define PRENORM_AVG	     (p->weight/docstat.sum*docstat.uniq) /* (weight / docstat.sum) * docstat.uniq, evaluated left to right */ | 
| 64 | #define PRENORM_AVGLOG ((1+log(p->weight))/(1+log(docstat.sum/docstat.uniq))) /* (1+log(weight)) / (1+log(sum/uniq)) */ | 
| 65 | /*--------------------------------------------------------------*/ | 
| 66 | #define NORM_NONE					      (1) /* constant 1 */ | 
| 67 | #define NORM_SUM				   (docstat.nsum) /* docstat.nsum; docstat must be in scope at the expansion site */ | 
| 68 | #define NORM_COS			    (sqrt(docstat.nsum2)) /* square root of docstat.nsum2 */ | 
| 69 |  | 
| 70 | #define PIVOT_VAL (0.0115) /* coefficient used only by NORM_PIVOT in this header */ | 
| 71 | #define NORM_PIVOT  (1+PIVOT_VAL*docstat.uniq) /* 1 + PIVOT_VAL * docstat.uniq */ | 
| 72 | /*---------------------------------------------------------------*/ | 
| 73 | #define GWS_NORM				     (1/sqrt(sum2)) /* 1 / sqrt(sum2); the GWS_* macros expect sum, sum2, suml, doc_cnt and aio in scope as each one needs */ | 
| 74 | #define GWS_GFIDF				      (sum/doc_cnt) /* sum divided by doc_cnt */ | 
| 75 | /* Mysterious, but w/o (double) GWS_IDF performs better :-o */ | 
| 76 | #define GWS_IDF		   log(aio->info->state->records/doc_cnt) /* no cast before the division, unlike GWS_IDF1; truncates if both operands are integers - types not visible here */ | 
| 77 | #define GWS_IDF1	   log((double)aio->info->state->records/doc_cnt) /* same as GWS_IDF but with records cast to double before dividing */ | 
| 78 | #define GWS_PROB ((aio->info->state->records > doc_cnt) ? log(((double)(aio->info->state->records-doc_cnt))/doc_cnt) : 0 ) /* log((records-doc_cnt)/doc_cnt) when records > doc_cnt, otherwise 0 */ | 
| 79 | #define GWS_FREQ					(1.0/doc_cnt) /* reciprocal of doc_cnt */ | 
| 80 | #define GWS_SQUARED pow(log((double)aio->info->state->records/doc_cnt),2) /* square of the GWS_IDF1 expression */ | 
| 81 | #define GWS_CUBIC   pow(log((double)aio->info->state->records/doc_cnt),3) /* cube of the GWS_IDF1 expression */ | 
| 82 | #define GWS_ENTROPY (1-(suml/sum-log(sum))/log(aio->info->state->records)) /* 1 - (suml/sum - log(sum)) / log(records) */ | 
| 83 | /*=================================================================*/ | 
| 84 |  | 
| 85 | /* Boolean search operators */ | 
| 86 | #define FTB_YES   (ft_boolean_syntax[0]) /* each FTB_* macro reads one character of ft_boolean_syntax, which is declared outside this header */ | 
| 87 | #define FTB_EGAL  (ft_boolean_syntax[1]) | 
| 88 | #define FTB_NO    (ft_boolean_syntax[2]) | 
| 89 | #define FTB_INC   (ft_boolean_syntax[3]) | 
| 90 | #define FTB_DEC   (ft_boolean_syntax[4]) | 
| 91 | #define FTB_LBR   (ft_boolean_syntax[5]) | 
| 92 | #define FTB_RBR   (ft_boolean_syntax[6]) | 
| 93 | #define FTB_NEG   (ft_boolean_syntax[7]) | 
| 94 | #define FTB_TRUNC (ft_boolean_syntax[8]) | 
| 95 | #define FTB_LQUOT (ft_boolean_syntax[10]) /* index 9 has no macro in this header; NOTE(review): presumably reserved - confirm against the ft_boolean_syntax documentation */ | 
| 96 | #define FTB_RQUOT (ft_boolean_syntax[11]) | 
| 97 |  | 
| 98 | typedef struct st_maria_ft_word { /* one word record: byte pointer, length and weight */ | 
| 99 |   const uchar * pos; /* read-only pointer; presumably the first byte of the word - confirm at the parser */ | 
| 100 |   uint	 len; /* presumably the length of the word at pos; units not shown in this header */ | 
| 101 |   double weight; /* numeric weight attached to the word; how it is computed is not visible here */ | 
| 102 | } FT_WORD; | 
| 103 |  | 
| 104 | int is_stopword(const char *word, size_t len); /* defined elsewhere; presumably non-zero when the len-byte word is a stopword - confirm */ | 
| 105 |  | 
| 106 | MARIA_KEY *_ma_ft_make_key(MARIA_HA *, MARIA_KEY *, uint , uchar *, FT_WORD *, | 
| 107 |                            my_off_t); /* returns a MARIA_KEY pointer; presumably builds the key for one FT_WORD - definition not in this header */ | 
| 108 |  | 
| 109 | uchar maria_ft_get_word(CHARSET_INFO *, const uchar **, const uchar *, | 
| 110 |                         FT_WORD *, MYSQL_FTPARSER_BOOLEAN_INFO *); /* second argument is const uchar **, unlike maria_ft_simple_get_word; meaning of the uchar result is not shown here */ | 
| 111 | uchar maria_ft_simple_get_word(CHARSET_INFO *, uchar **, const uchar *, | 
| 112 |                                FT_WORD *, my_bool); /* second argument is a non-const uchar **; the my_bool flag's meaning is not visible here - check the definition */ | 
| 113 |  | 
| 114 | typedef struct _st_maria_ft_seg_iterator { /* state passed to the _ma_ft_segiterator* functions declared in this header */ | 
| 115 |   uint        num, len; /* presumably segments remaining and length of the current one - confirm in the iterator code */ | 
| 116 |   HA_KEYSEG  *seg; /* presumably the current key segment descriptor - confirm */ | 
| 117 |   const uchar *rec, *pos; /* read-only pointers; presumably the record being walked and the current position in it - confirm */ | 
| 118 | } FT_SEG_ITERATOR; | 
| 119 |  | 
| 120 | void _ma_ft_segiterator_init(MARIA_HA *, uint, const uchar *, FT_SEG_ITERATOR *); /* fills the FT_SEG_ITERATOR passed last; presumably from a handler, key number and record - confirm */ | 
| 121 | void _ma_ft_segiterator_dummy_init(const uchar *, uint, FT_SEG_ITERATOR *); /* variant that takes only a byte pointer and a uint instead of a MARIA_HA */ | 
| 122 | uint _ma_ft_segiterator(FT_SEG_ITERATOR *); /* presumably advances the iterator; meaning of the uint result is not shown here */ | 
| 123 |  | 
| 124 | void maria_ft_parse_init(TREE *, CHARSET_INFO *); /* presumably prepares the TREE that the parse functions fill - confirm */ | 
| 125 | int maria_ft_parse(TREE *, uchar *, int, struct st_mysql_ftparser *parser, | 
| 126 |              MYSQL_FTPARSER_PARAM *, MEM_ROOT *); /* takes the parser plugin descriptor explicitly; int result semantics are not visible here */ | 
| 127 | FT_WORD * maria_ft_linearize(TREE *, MEM_ROOT *); /* returns FT_WORD pointer built from a TREE; presumably an array allocated on the MEM_ROOT - confirm who frees it */ | 
| 128 | FT_WORD * _ma_ft_parserecord(MARIA_HA *, uint, const uchar *, MEM_ROOT *); /* returns FT_WORD pointer for a record; presumably parse followed by linearize - confirm */ | 
| 129 | uint _ma_ft_parse(TREE *, MARIA_HA *, uint, const uchar *, | 
| 130 |                   MYSQL_FTPARSER_PARAM *, MEM_ROOT *); /* record is read-only (const uchar *); meaning of the uint result is not shown here */ | 
| 131 |  | 
| 132 | FT_INFO *maria_ft_init_nlq_search(MARIA_HA *, uint, uchar *, uint, uint, | 
| 133 |                                   uchar *); /* returns an FT_INFO handle; nlq presumably stands for natural language query - argument meanings are not visible in this header */ | 
| 134 | FT_INFO *maria_ft_init_boolean_search(MARIA_HA *, uint, uchar *, uint, | 
| 135 |                                       CHARSET_INFO *); /* returns an FT_INFO handle; takes a CHARSET_INFO where the nlq variant takes a uint and a uchar pointer */ | 
| 136 |  | 
| 137 | extern const struct _ft_vft _ma_ft_vft_nlq; /* constant function table defined elsewhere; presumably holds the maria_ft_nlq_* functions declared next - confirm at the definition */ | 
| 138 | int maria_ft_nlq_read_next(FT_INFO *, char *); | 
| 139 | float maria_ft_nlq_find_relevance(FT_INFO *, uchar *, uint); | 
| 140 | void maria_ft_nlq_close_search(FT_INFO *); | 
| 141 | float maria_ft_nlq_get_relevance(FT_INFO *); | 
| 142 | my_off_t maria_ft_nlq_get_docid(FT_INFO *); | 
| 143 | void maria_ft_nlq_reinit_search(FT_INFO *); /* all six functions take the FT_INFO handle as their first argument */ | 
| 144 |  | 
| 145 | extern const struct _ft_vft _ma_ft_vft_boolean; /* constant function table defined elsewhere; presumably holds the maria_ft_boolean_* functions declared next - confirm at the definition */ | 
| 146 | int maria_ft_boolean_read_next(FT_INFO *, char *); | 
| 147 | float maria_ft_boolean_find_relevance(FT_INFO *, uchar *, uint); | 
| 148 | void maria_ft_boolean_close_search(FT_INFO *); | 
| 149 | float maria_ft_boolean_get_relevance(FT_INFO *); | 
| 150 | my_off_t maria_ft_boolean_get_docid(FT_INFO *); | 
| 151 | void maria_ft_boolean_reinit_search(FT_INFO *); /* signatures match the maria_ft_nlq_* declarations one for one */ | 
| 152 | MYSQL_FTPARSER_PARAM* maria_ftparser_alloc_param(MARIA_HA *info); /* returns a parser parameter pointer for the handler; allocation and ownership are not visible here */ | 
| 153 | extern MYSQL_FTPARSER_PARAM *maria_ftparser_call_initializer(MARIA_HA *info, | 
| 154 |                                                              uint keynr, | 
| 155 |                                                              uint paramnr); /* explicit extern is equivalent to the plain declaration style used for maria_ftparser_alloc_param */ | 
| 156 | extern void maria_ftparser_call_deinitializer(MARIA_HA *info); /* presumably the counterpart of maria_ftparser_call_initializer - confirm at the definition */ | 
| 157 |  |