1 | /* Copyright (c) 2000-2007 MySQL AB, 2009 Sun Microsystems, Inc. |
2 | Use is subject to license terms. |
3 | |
4 | This program is free software; you can redistribute it and/or modify |
5 | it under the terms of the GNU General Public License as published by |
6 | the Free Software Foundation; version 2 of the License. |
7 | |
8 | This program is distributed in the hope that it will be useful, |
9 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | GNU General Public License for more details. |
12 | |
13 | You should have received a copy of the GNU General Public License |
14 | along with this program; if not, write to the Free Software |
15 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ |
16 | |
17 | /* Written by Sergei A. Golubchik, who has a shared copyright to this code */ |
18 | |
19 | /* some definitions for full-text indices */ |
20 | |
/* ftdefs.h is always included first when used, so we have to include my_global.h here */
22 | #include <my_global.h> |
23 | #include "fulltext.h" |
24 | #include <m_ctype.h> |
25 | #include <my_tree.h> |
26 | #include <queues.h> |
27 | #include <mysql/plugin.h> |
28 | |
/*
  true_word_char(ctype, character)
    Evaluates to 1 when 'ctype' has at least one of the _MY_U, _MY_L or
    _MY_NMR bits set, or when 'character' equals '_'; evaluates to 0
    otherwise. The flag constants are not defined in this header.

  misc_word_char(X)
    Always evaluates to 0; the argument is never expanded.
*/
#define true_word_char(ctype, character)                        \
  ((((ctype) & (_MY_U | _MY_L | _MY_NMR)) != 0) ||              \
   ((character) == '_'))
#define misc_word_char(X) (0)
33 | |
/*
  Limits and build switches for the full-text code.
  NOTE(review): the purposes noted below are inferred from the macro names
  only; confirm them against the code that uses each macro.
*/

/* Presumably the longest word length handled by the sort path. */
#define FT_MAX_WORD_LEN_FOR_SORT 31

/* Presumably the allocation size used for the ftparser MEM_ROOT. */
#define FTPARSER_MEMROOT_ALLOC_SIZE 65536

/* Defined with no value; presumably tested with #ifdef to compile the
   stopword list into the binary. */
#define COMPILE_STOPWORDS_IN
39 | |
/* Interested readers may consult SMART
   (ftp://ftp.cs.cornell.edu/pub/smart/smart.11.0.tar.Z)
   for an excellent implementation of the vector space model we use.
   It also demonstrates the use of different weighting techniques.
   This code, though, is completely original and is not based on the
   SMART code, but was in some cases inspired by it.

   NORM_PIVOT was taken from the article
   A.Singhal, C.Buckley, M.Mitra, "Pivoted Document Length Normalization",
   ACM SIGIR'96, 21-29, 1996
*/
51 | |
/*
  Selection of the weighting formulas. Each name on the left expands to
  one of the alternative LWS_*, PRENORM_*, NORM_* or GWS_* macros defined
  in this header, so changing the right-hand side switches the formula.
*/
#define LWS_FOR_QUERY LWS_TF
#define LWS_IN_USE LWS_LOG
#define PRENORM_IN_USE PRENORM_AVG
#define NORM_IN_USE NORM_PIVOT
#define GWS_IN_USE GWS_PROB
57 | /*==============================================================*/ |
/*
  LWS_* alternatives (presumably "local weighting scheme" - confirm).
  Every expansion reads a variable named 'count' that must be in scope
  where the macro is used:
    LWS_TF      the count itself
    LWS_BINARY  1 when count is positive, 0 otherwise
    LWS_SQUARE  count multiplied by itself
    LWS_LOG     log(count) + 1 for a non-zero count, 0 for a zero count
*/
#define LWS_TF      (count)
#define LWS_BINARY  ((count) > 0)
#define LWS_SQUARE  ((count) * (count))
#define LWS_LOG     ((count) ? (log((double) (count)) + 1) : 0)
62 | /*--------------------------------------------------------------*/ |
/*
  PRENORM_* alternatives. Every expansion reads p->weight and, except for
  PRENORM_NONE, members of an object named 'docstat' (max, sum, uniq);
  both 'p' and 'docstat' must be visible where the macro is used.
*/
#define PRENORM_NONE    (p->weight)
#define PRENORM_MAX     (p->weight / docstat.max)
#define PRENORM_AUG     (0.4 + 0.6 * p->weight / docstat.max)
#define PRENORM_AVG     (p->weight / docstat.sum * docstat.uniq)
#define PRENORM_AVGLOG  \
  ((1 + log(p->weight)) / (1 + log(docstat.sum / docstat.uniq)))
68 | /*--------------------------------------------------------------*/ |
/*
  NORM_* alternatives. Apart from NORM_NONE, each expansion reads a member
  of an object named 'docstat' (nsum, nsum2 or uniq) that must be visible
  where the macro is used.
*/
#define NORM_NONE   (1)
#define NORM_SUM    (docstat.nsum)
#define NORM_COS    (sqrt(docstat.nsum2))

/* Factor applied to docstat.uniq by NORM_PIVOT. */
#define PIVOT_VAL   (0.0115)
#define NORM_PIVOT  (1 + PIVOT_VAL * docstat.uniq)
75 | /*---------------------------------------------------------------*/ |
/*
  GWS_* alternatives (presumably "global weighting scheme" - confirm).
  The expansions read names that must be in scope where a macro is used:
  sum, sum2, suml, doc_cnt and aio (for aio->info->state->records).
*/
#define GWS_NORM (1/sqrt(sum2))
#define GWS_GFIDF (sum/doc_cnt)
/*
  Mysterious, but w/o (double) GWS_IDF performs better :-o
  GWS_IDF divides records by doc_cnt with no cast; GWS_IDF1 is the same
  formula with records cast to double before the division.
*/
#define GWS_IDF log(aio->info->state->records/doc_cnt)
#define GWS_IDF1 log((double)aio->info->state->records/doc_cnt)
/* Evaluates to 0 unless records > doc_cnt; otherwise the log of
   (records - doc_cnt) / doc_cnt, with the difference cast to double. */
#define GWS_PROB ((aio->info->state->records > doc_cnt) ? log(((double)(aio->info->state->records-doc_cnt))/doc_cnt) : 0 )
#define GWS_FREQ (1.0/doc_cnt)
/* Second and third power of the GWS_IDF1 expression. */
#define GWS_SQUARED pow(log((double)aio->info->state->records/doc_cnt),2)
#define GWS_CUBIC pow(log((double)aio->info->state->records/doc_cnt),3)
#define GWS_ENTROPY (1-(suml/sum-log(sum))/log(aio->info->state->records))
86 | /*=================================================================*/ |
87 | |
/*
  Boolean search operators.
  Each macro reads one position of ft_boolean_syntax, which is declared
  outside this header. Positions 0-8, 10 and 11 are used; position 9 is
  not referenced by any macro in this list.
  NOTE(review): which character sits at each position depends on the
  contents of ft_boolean_syntax - confirm against its definition.
*/
#define FTB_YES (ft_boolean_syntax[0])
#define FTB_EGAL (ft_boolean_syntax[1])
#define FTB_NO (ft_boolean_syntax[2])
#define FTB_INC (ft_boolean_syntax[3])
#define FTB_DEC (ft_boolean_syntax[4])
#define FTB_LBR (ft_boolean_syntax[5])
#define FTB_RBR (ft_boolean_syntax[6])
#define FTB_NEG (ft_boolean_syntax[7])
#define FTB_TRUNC (ft_boolean_syntax[8])
#define FTB_LQUOT (ft_boolean_syntax[10])
#define FTB_RQUOT (ft_boolean_syntax[11])
100 | |
/*
  FT_WORD: descriptor of a single word.
  NOTE(review): the member descriptions are inferred from names and types;
  confirm against the code that fills this structure.
*/
typedef struct st_ft_word {
  /* Read-only byte pointer; presumably the first byte of the word. */
  const uchar *pos;
  /* Presumably the weight assigned to the word. */
  double weight;
  /* Presumably the length of the word starting at pos. */
  size_t len;
} FT_WORD;
106 | |
/*
  Takes a word pointer and its length, returns int.
  NOTE(review): presumably non-zero when the word is a stopword - confirm
  in the definition, which is not in this header.
*/
int is_stopword(const char *word, size_t len);

/*
  Takes a MI_INFO, a uint, a uchar buffer, an FT_WORD and a my_off_t;
  returns uint.
  NOTE(review): presumably builds an index key for the word and returns
  its length - confirm in the definition.
*/
uint _ft_make_key(MI_INFO *, uint , uchar *, FT_WORD *, my_off_t);

/*
  Word extractors. Both take a charset, a pointer to a byte pointer, a
  second byte pointer and an FT_WORD to fill, and return uchar.
  ft_get_word additionally takes a MYSQL_FTPARSER_BOOLEAN_INFO and its
  pointer-to-pointer argument is const-qualified; ft_simple_get_word takes
  a my_bool flag and a non-const pointer-to-pointer.
  NOTE(review): presumably the first two byte pointers are the current
  position (advanced by the call) and the end of the text - confirm.
*/
uchar ft_get_word(CHARSET_INFO *, const uchar **, const uchar *, FT_WORD *,
                  MYSQL_FTPARSER_BOOLEAN_INFO *);
uchar ft_simple_get_word(CHARSET_INFO *, uchar **, const uchar *,
                         FT_WORD *, my_bool);
115 | |
/*
  FT_SEG_ITERATOR: state used by the _mi_ft_segiterator* functions
  declared in this header.
  NOTE(review): the member descriptions are inferred from names and types;
  confirm against the iterator implementation.
*/
typedef struct _st_ft_seg_iterator {
  /* Two unsigned counters; presumably segments remaining and the length
     of the current segment. */
  uint num, len;
  /* Presumably the current key segment descriptor. */
  HA_KEYSEG *seg;
  /* Read-only byte pointers; presumably the record being walked and the
     position of the current segment's data. */
  const uchar *rec, *pos;
} FT_SEG_ITERATOR;
121 | |
/*
  Segment iterator API over FT_SEG_ITERATOR.
  _mi_ft_segiterator_init takes a MI_INFO, a uint and a read-only byte
  pointer; _mi_ft_segiterator_dummy_init takes only a read-only byte
  pointer and a uint. Both receive the iterator to set up as the last
  argument. _mi_ft_segiterator takes the iterator and returns uint.
  NOTE(review): presumably the return value is non-zero while another
  segment is available - confirm in the definition.
*/
void _mi_ft_segiterator_init(MI_INFO *, uint, const uchar *, FT_SEG_ITERATOR *);
void _mi_ft_segiterator_dummy_init(const uchar *, uint, FT_SEG_ITERATOR *);
uint _mi_ft_segiterator(FT_SEG_ITERATOR *);
125 | |
/*
  Parsing API. All of these functions are defined outside this header.
  ft_parse_init takes a TREE and a charset.
  ft_parse takes a TREE, a byte buffer with an int (presumably its
  length), a parser plugin descriptor, a parser parameter block and a
  MEM_ROOT; it returns int.
  ft_linearize takes a TREE and a MEM_ROOT and returns an FT_WORD pointer.
  _mi_ft_parserecord and _mi_ft_parse take a MI_INFO, a uint and a
  read-only record pointer plus a MEM_ROOT; the former returns an FT_WORD
  pointer, the latter also takes a TREE and a parser parameter block and
  returns uint.
  NOTE(review): return-value conventions and ownership of the returned
  FT_WORD arrays cannot be determined from the prototypes - confirm in
  the definitions.
*/
void ft_parse_init(TREE *, CHARSET_INFO *);
int ft_parse(TREE *, uchar *, int, struct st_mysql_ftparser *parser,
             MYSQL_FTPARSER_PARAM *, MEM_ROOT *);
FT_WORD * ft_linearize(TREE *, MEM_ROOT *);
FT_WORD * _mi_ft_parserecord(MI_INFO *, uint, const uchar *, MEM_ROOT *);
uint _mi_ft_parse(TREE *, MI_INFO *, uint, const uchar *,
                  MYSQL_FTPARSER_PARAM *, MEM_ROOT *);
133 | |
/*
  Search constructors; both return an FT_INFO pointer.
  The first three arguments after MI_INFO are a uint, a byte buffer and a
  uint (presumably key number, query text and query length - confirm).
  ft_init_nlq_search then takes a further uint and a byte buffer, while
  ft_init_boolean_search takes a charset.
*/
FT_INFO *ft_init_nlq_search(MI_INFO *, uint, uchar *, uint, uint, uchar *);
FT_INFO *ft_init_boolean_search(MI_INFO *, uint, uchar *, uint, CHARSET_INFO *);
136 | |
/*
  ft_nlq_* search interface: a constant struct _ft_vft object defined
  elsewhere, followed by functions that all take the FT_INFO handle as
  their first argument.
  NOTE(review): presumably _ft_vft_nlq is a function table pointing at the
  functions below - confirm at its definition.
*/
extern const struct _ft_vft _ft_vft_nlq;
int ft_nlq_read_next(FT_INFO *, char *);
float ft_nlq_find_relevance(FT_INFO *, uchar *, uint);
void ft_nlq_close_search(FT_INFO *);
float ft_nlq_get_relevance(FT_INFO *);
my_off_t ft_nlq_get_docid(FT_INFO *);
void ft_nlq_reinit_search(FT_INFO *);
144 | |
/*
  ft_boolean_* search interface: a constant struct _ft_vft object defined
  elsewhere, followed by functions with the same signatures as their
  ft_nlq_* counterparts declared in this header.
  NOTE(review): presumably _ft_vft_boolean is a function table pointing at
  the functions below - confirm at its definition.
*/
extern const struct _ft_vft _ft_vft_boolean;
int ft_boolean_read_next(FT_INFO *, char *);
float ft_boolean_find_relevance(FT_INFO *, uchar *, uint);
void ft_boolean_close_search(FT_INFO *);
float ft_boolean_get_relevance(FT_INFO *);
my_off_t ft_boolean_get_docid(FT_INFO *);
void ft_boolean_reinit_search(FT_INFO *);
152 | MYSQL_FTPARSER_PARAM* ftparser_alloc_param(MI_INFO *info); |
153 | extern MYSQL_FTPARSER_PARAM *ftparser_call_initializer(MI_INFO *info, |
154 | uint keynr, |
155 | uint paramnr); |
156 | extern void ftparser_call_deinitializer(MI_INFO *info); |
157 | |