1/*****************************************************************************
2
3Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
4
5This program is free software; you can redistribute it and/or modify it under
6the terms of the GNU General Public License as published by the Free Software
7Foundation; version 2 of the License.
8
9This program is distributed in the hope that it will be useful, but WITHOUT
10ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12
13You should have received a copy of the GNU General Public License along with
14this program; if not, write to the Free Software Foundation, Inc.,
1551 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
16
17*****************************************************************************/
18
19/******************************************************************//**
20@file fts/fts0tokenize.cc
Full Text Search plugin tokenizer, adapted from the MyISAM tokenizer
22
23Created 2014/11/17 Shaohua Wang
24***********************************************************************/
25
26#include "ft_global.h"
27#include "mysql/plugin_ftparser.h"
28#include "m_ctype.h"
29
30/* Macros and structs below are from ftdefs.h in MyISAM */
/** Check whether a character belongs to a word: true when its ctype bits
include upper-case, lower-case or numeric, or when the character itself is
an underscore.
@param ctype_bits	character type bits reported by the charset
@param byte		the character being tested */
#define true_word_char(ctype_bits, byte)	\
	(((ctype_bits) & (_MY_U | _MY_L | _MY_NMR)) || ((byte) == '_'))
33
/** Check whether a character is a "misc" word character, i.e. one that is
tolerated inside a word without being a true word character. Always false
here, so no such characters are recognised. */
#define misc_word_char(ch)	(0)
36
/** Boolean search syntax: a string whose characters, selected by position
through the FTB_* macros below, are the operator characters recognised by
fts_get_word(). Initialised from DEFAULT_FTB_SYNTAX. */
static const char* fts_boolean_syntax = DEFAULT_FTB_SYNTAX;

/* Operator characters, by position in fts_boolean_syntax. The descriptions
state what fts_get_word() does when it meets each one. Position 9 is not
referenced in this file. */
/** Sets info->yesno to +1 */
#define FTB_YES (fts_boolean_syntax[0])
/** Sets info->yesno to 0 */
#define FTB_EGAL (fts_boolean_syntax[1])
/** Sets info->yesno to -1 */
#define FTB_NO (fts_boolean_syntax[2])
/** Increments info->weight_adjust */
#define FTB_INC (fts_boolean_syntax[3])
/** Decrements info->weight_adjust */
#define FTB_DEC (fts_boolean_syntax[4])
/** Left bracket, reported as FT_TOKEN_LEFT_PAREN */
#define FTB_LBR (fts_boolean_syntax[5])
/** Right bracket, reported as FT_TOKEN_RIGHT_PAREN */
#define FTB_RBR (fts_boolean_syntax[6])
/** Toggles info->wasign */
#define FTB_NEG (fts_boolean_syntax[7])
/** Directly after a word, sets info->trunc */
#define FTB_TRUNC (fts_boolean_syntax[8])
/** Opening quote: sets info->quot, reported as FT_TOKEN_LEFT_PAREN */
#define FTB_LQUOT (fts_boolean_syntax[10])
/** Closing quote while info->quot is set, reported as FT_TOKEN_RIGHT_PAREN */
#define FTB_RQUOT (fts_boolean_syntax[11])
51
52/** FTS query token */
53typedef struct st_ft_word {
54 uchar* pos; /*!< word start pointer */
55 uint len; /*!< word len */
56 double weight; /*!< word weight, unused in innodb */
57} FT_WORD;
58
59/** Tokenizer for ngram referring to ft_get_word(ft_parser.c) in MyISAM.
60Differences: a. code format changed; b. stopword processing removed.
61@param[in] cs charset
62@param[in,out] start doc start pointer
63@param[in,out] end doc end pointer
64@param[in,out] word token
65@param[in,out] info token info
66@retval 0 eof
67@retval 1 word found
68@retval 2 left bracket
69@retval 3 right bracket
70@retval 4 stopword found */
71inline
72uchar
73fts_get_word(
74 const CHARSET_INFO* cs,
75 uchar** start,
76 uchar* end,
77 FT_WORD* word,
78 MYSQL_FTPARSER_BOOLEAN_INFO*
79 info)
80{
81 uchar* doc = *start;
82 int ctype;
83 uint mwc;
84 uint length;
85 int mbl;
86
87 info->yesno = (FTB_YES ==' ') ? 1 : (info->quot != 0);
88 info->weight_adjust = info->wasign = 0;
89 info->type = FT_TOKEN_EOF;
90
91 while (doc < end) {
92 for (; doc < end;
93 doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
94 mbl = cs->cset->ctype(cs, &ctype, doc, end);
95
96 if (true_word_char(ctype, *doc)) {
97 break;
98 }
99
100 if (*doc == FTB_RQUOT && info->quot) {
101 *start = doc + 1;
102 info->type = FT_TOKEN_RIGHT_PAREN;
103
104 return(info->type);
105 }
106
107 if (!info->quot) {
108 if (*doc == FTB_LBR
109 || *doc == FTB_RBR
110 || *doc == FTB_LQUOT) {
111 /* param->prev=' '; */
112 *start = doc + 1;
113 if (*doc == FTB_LQUOT) {
114 info->quot = (char*)1;
115 }
116
117 info->type = (*doc == FTB_RBR ?
118 FT_TOKEN_RIGHT_PAREN :
119 FT_TOKEN_LEFT_PAREN);
120
121 return(info->type);
122 }
123
124 if (info->prev == ' ') {
125 if (*doc == FTB_YES) {
126 info->yesno = +1;
127 continue;
128 } else if (*doc == FTB_EGAL) {
129 info->yesno = 0;
130 continue;
131 } else if (*doc == FTB_NO) {
132 info->yesno = -1;
133 continue;
134 } else if (*doc == FTB_INC) {
135 info->weight_adjust++;
136 continue;
137 } else if (*doc == FTB_DEC) {
138 info->weight_adjust--;
139 continue;
140 } else if (*doc == FTB_NEG) {
141 info->wasign = !info->wasign;
142 continue;
143 }
144 }
145 }
146
147 info->prev = char(*doc);
148 info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != 0);
149 info->weight_adjust = info->wasign = 0;
150 }
151
152 mwc = length = 0;
153 for (word->pos = doc;
154 doc < end;
155 length++, doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
156 mbl = cs->cset->ctype(cs, &ctype, doc, end);
157
158 if (true_word_char(ctype, *doc)) {
159 mwc = 0;
160 } else if (!misc_word_char(*doc) || mwc) {
161 break;
162 } else {
163 mwc++;
164 }
165 }
166
167 /* Be sure *prev is true_word_char. */
168 info->prev = 'A';
169 word->len = (uint)(doc-word->pos) - mwc;
170
171 if ((info->trunc = (doc < end && *doc == FTB_TRUNC))) {
172 doc++;
173 }
174
175 /* We don't check stopword here. */
176 *start = doc;
177 info->type = FT_TOKEN_WORD;
178
179 return(info->type);
180 }
181
182 if (info->quot) {
183 *start = doc;
184 info->type = FT_TOKEN_RIGHT_PAREN;
185 }
186
187 return(info->type);
188}
189