1/* Copyright (c) 2000-2007 MySQL AB, 2009 Sun Microsystems, Inc.
2 Use is subject to license terms.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; version 2 of the License.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
16
17/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
18
19/* some definitions for full-text indices */
20
/* ftdefs.h is always included first when used, so we have to include my_global.h here */
22#include <my_global.h>
23#include "fulltext.h"
24#include <m_ctype.h>
25#include <my_tree.h>
26#include <queues.h>
27#include <mysql/plugin.h>
28
/*
  Word-character predicates.

  true_word_char(ctype, character) is non-zero when the ctype bit mask has
  at least one of _MY_U, _MY_L or _MY_NMR set, or when the character itself
  is an underscore.

  misc_word_char() is switched off: it expands to 0 whatever its argument.
*/
#define true_word_char(ctype, character)                 \
  ((((ctype) & (_MY_U | _MY_L | _MY_NMR)) != 0) ||       \
   ((character) == '_'))
#define misc_word_char(X) 0
33
/* NOTE(review): by its name, a word-length limit that applies to sorting -- confirm against users. */
#define FT_MAX_WORD_LEN_FOR_SORT       31

/*
  65536 = 64 * 1024.
  NOTE(review): by its name, an allocation size for the parser's MEM_ROOT -- confirm against users.
*/
#define FTPARSER_MEMROOT_ALLOC_SIZE    65536

/* Defined with no value: only its presence can be tested (#ifdef / defined()). */
#define COMPILE_STOPWORDS_IN
39
40/* Interested readers may consult SMART
41 (ftp://ftp.cs.cornell.edu/pub/smart/smart.11.0.tar.Z)
42 for an excellent implementation of vector space model we use.
   It also demonstrates the usage of different weighting techniques.
44 This code, though, is completely original and is not based on the
45 SMART code but was in some cases inspired by it.
46
47 NORM_PIVOT was taken from the article
48 A.Singhal, C.Buckley, M.Mitra, "Pivoted Document Length Normalization",
49 ACM SIGIR'96, 21-29, 1996
50 */
51
/*
  Weighting schemes currently selected.  Each name below is an alias for
  one of the alternative LWS_*, PRENORM_*, NORM_* or GWS_* formulas;
  switching schemes means changing the right-hand side here.
*/
#define LWS_FOR_QUERY     LWS_TF
#define LWS_IN_USE        LWS_LOG
#define PRENORM_IN_USE    PRENORM_AVG
#define NORM_IN_USE       NORM_PIVOT
#define GWS_IN_USE        GWS_PROB
57/*==============================================================*/
/*
  LWS_* formulas.  Every one of them reads a variable called `count`
  that must be in scope wherever the macro is expanded.

    LWS_TF      count itself
    LWS_BINARY  1 when count is positive, otherwise 0
    LWS_SQUARE  count squared
    LWS_LOG     log(count) + 1 for a non-zero count, otherwise 0
*/
#define LWS_TF        (count)
#define LWS_BINARY    (count > 0)
#define LWS_SQUARE    (count * count)
#define LWS_LOG       (count ? (log((double) count) + 1) : 0)
62/*--------------------------------------------------------------*/
/*
  PRENORM_* formulas.  They read `p->weight` and the fields `max`, `sum`
  and `uniq` of a variable called `docstat`; both `p` and `docstat` must
  be in scope wherever a macro is expanded.
*/
#define PRENORM_NONE      (p->weight)
#define PRENORM_MAX       (p->weight / docstat.max)
#define PRENORM_AUG       (0.4 + 0.6 * p->weight / docstat.max)
#define PRENORM_AVG       (p->weight / docstat.sum * docstat.uniq)
#define PRENORM_AVGLOG    ((1 + log(p->weight)) / (1 + log(docstat.sum / docstat.uniq)))
68/*--------------------------------------------------------------*/
/*
  NORM_* formulas.  Apart from NORM_NONE they read fields (`nsum`,
  `nsum2`, `uniq`) of a variable called `docstat` that must be in scope
  wherever the macro is expanded.
*/
#define NORM_NONE     (1)
#define NORM_SUM      (docstat.nsum)
#define NORM_COS      (sqrt(docstat.nsum2))

/* Coefficient applied to docstat.uniq by NORM_PIVOT. */
#define PIVOT_VAL     (0.0115)
#define NORM_PIVOT    (1 + PIVOT_VAL * docstat.uniq)
75/*---------------------------------------------------------------*/
/*
  GWS_* formulas.  Depending on the formula they read `sum`, `sum2`,
  `suml`, `doc_cnt` and `aio->info->state->records`; those names must be
  in scope wherever the macro is expanded.
*/
#define GWS_NORM       (1 / sqrt(sum2))
#define GWS_GFIDF      (sum / doc_cnt)
/*
  Puzzling but deliberate: GWS_IDF performs better without the (double)
  cast that GWS_IDF1 applies, so the division is left exactly as it is.
*/
#define GWS_IDF        log(aio->info->state->records / doc_cnt)
#define GWS_IDF1       log((double) aio->info->state->records / doc_cnt)
/* Evaluates to 0 unless records is strictly greater than doc_cnt. */
#define GWS_PROB                                                              \
  ((aio->info->state->records > doc_cnt)                                      \
     ? log(((double) (aio->info->state->records - doc_cnt)) / doc_cnt)        \
     : 0)
#define GWS_FREQ       (1.0 / doc_cnt)
#define GWS_SQUARED    pow(log((double) aio->info->state->records / doc_cnt), 2)
#define GWS_CUBIC      pow(log((double) aio->info->state->records / doc_cnt), 3)
#define GWS_ENTROPY    (1 - (suml / sum - log(sum)) / log(aio->info->state->records))
86/*=================================================================*/
87
88/* Boolean search operators */
/*
  Each operator character is read from a fixed position of
  ft_boolean_syntax.  Position 9 has no macro here: the list jumps from
  FTB_TRUNC (8) straight to FTB_LQUOT (10).
*/
#define FTB_YES      (ft_boolean_syntax[0])
#define FTB_EGAL     (ft_boolean_syntax[1])
#define FTB_NO       (ft_boolean_syntax[2])
#define FTB_INC      (ft_boolean_syntax[3])
#define FTB_DEC      (ft_boolean_syntax[4])
#define FTB_LBR      (ft_boolean_syntax[5])
#define FTB_RBR      (ft_boolean_syntax[6])
#define FTB_NEG      (ft_boolean_syntax[7])
#define FTB_TRUNC    (ft_boolean_syntax[8])
#define FTB_LQUOT    (ft_boolean_syntax[10])
#define FTB_RQUOT    (ft_boolean_syntax[11])
100
101typedef struct st_ft_word {
102 const uchar *pos;
103 double weight;
104 size_t len;
105} FT_WORD;
106
/*
  Stop-word lookup for the `len` bytes starting at `word`.
  NOTE(review): presumably returns non-zero when the word is a stop word;
  the definition is not in this header -- confirm there.
*/
int is_stopword(const char *word,
                size_t len);
108
109uint _ft_make_key(MI_INFO *, uint , uchar *, FT_WORD *, my_off_t);
110
111uchar ft_get_word(CHARSET_INFO *, const uchar **, const uchar *, FT_WORD *,
112 MYSQL_FTPARSER_BOOLEAN_INFO *);
113uchar ft_simple_get_word(CHARSET_INFO *, uchar **, const uchar *,
114 FT_WORD *, my_bool);
115
116typedef struct _st_ft_seg_iterator {
117 uint num, len;
118 HA_KEYSEG *seg;
119 const uchar *rec, *pos;
120} FT_SEG_ITERATOR;
121
122void _mi_ft_segiterator_init(MI_INFO *, uint, const uchar *, FT_SEG_ITERATOR *);
123void _mi_ft_segiterator_dummy_init(const uchar *, uint, FT_SEG_ITERATOR *);
124uint _mi_ft_segiterator(FT_SEG_ITERATOR *);
125
126void ft_parse_init(TREE *, CHARSET_INFO *);
127int ft_parse(TREE *, uchar *, int, struct st_mysql_ftparser *parser,
128 MYSQL_FTPARSER_PARAM *, MEM_ROOT *);
129FT_WORD * ft_linearize(TREE *, MEM_ROOT *);
130FT_WORD * _mi_ft_parserecord(MI_INFO *, uint, const uchar *, MEM_ROOT *);
131uint _mi_ft_parse(TREE *, MI_INFO *, uint, const uchar *,
132 MYSQL_FTPARSER_PARAM *, MEM_ROOT *);
133
134FT_INFO *ft_init_nlq_search(MI_INFO *, uint, uchar *, uint, uint, uchar *);
135FT_INFO *ft_init_boolean_search(MI_INFO *, uint, uchar *, uint, CHARSET_INFO *);
136
137extern const struct _ft_vft _ft_vft_nlq;
138int ft_nlq_read_next(FT_INFO *, char *);
139float ft_nlq_find_relevance(FT_INFO *, uchar *, uint);
140void ft_nlq_close_search(FT_INFO *);
141float ft_nlq_get_relevance(FT_INFO *);
142my_off_t ft_nlq_get_docid(FT_INFO *);
143void ft_nlq_reinit_search(FT_INFO *);
144
145extern const struct _ft_vft _ft_vft_boolean;
146int ft_boolean_read_next(FT_INFO *, char *);
147float ft_boolean_find_relevance(FT_INFO *, uchar *, uint);
148void ft_boolean_close_search(FT_INFO *);
149float ft_boolean_get_relevance(FT_INFO *);
150my_off_t ft_boolean_get_docid(FT_INFO *);
151void ft_boolean_reinit_search(FT_INFO *);
152MYSQL_FTPARSER_PARAM* ftparser_alloc_param(MI_INFO *info);
153extern MYSQL_FTPARSER_PARAM *ftparser_call_initializer(MI_INFO *info,
154 uint keynr,
155 uint paramnr);
156extern void ftparser_call_deinitializer(MI_INFO *info);
157