1 | /***************************************************************************** |
2 | |
3 | Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved. |
4 | |
5 | This program is free software; you can redistribute it and/or modify it under |
6 | the terms of the GNU General Public License as published by the Free Software |
7 | Foundation; version 2 of the License. |
8 | |
9 | This program is distributed in the hope that it will be useful, but WITHOUT |
10 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
11 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
12 | |
13 | You should have received a copy of the GNU General Public License along with |
14 | this program; if not, write to the Free Software Foundation, Inc., |
15 | 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA |
16 | |
17 | *****************************************************************************/ |
18 | |
/******************************************************************//**
@file fts/fts0tokenize.cc
Full Text Search plugin tokenizer, adapted from the MyISAM tokenizer

Created 2014/11/17 Shaohua Wang
***********************************************************************/
25 | |
26 | #include "ft_global.h" |
27 | #include "mysql/plugin_ftparser.h" |
28 | #include "m_ctype.h" |
29 | |
/* The macros and structs below are taken from ftdefs.h in MyISAM. */

/** Check whether a character is a true word character: either its ctype
flags (c) have one of the _MY_U, _MY_L or _MY_NMR bits set, or its first
byte (ch) is an underscore. */
#define true_word_char(c, ch)	\
	(((c) & (_MY_U | _MY_L | _MY_NMR)) || ((ch) == '_'))

/** Check whether a character is a misc word character. Always evaluates
to 0 in this file; the argument is not evaluated. */
#define misc_word_char(X)	0
36 | |
37 | /** Boolean search syntax */ |
38 | static const char* fts_boolean_syntax = DEFAULT_FTB_SYNTAX; |
39 | |
40 | #define FTB_YES (fts_boolean_syntax[0]) |
41 | #define FTB_EGAL (fts_boolean_syntax[1]) |
42 | #define FTB_NO (fts_boolean_syntax[2]) |
43 | #define FTB_INC (fts_boolean_syntax[3]) |
44 | #define FTB_DEC (fts_boolean_syntax[4]) |
45 | #define FTB_LBR (fts_boolean_syntax[5]) |
46 | #define FTB_RBR (fts_boolean_syntax[6]) |
47 | #define FTB_NEG (fts_boolean_syntax[7]) |
48 | #define FTB_TRUNC (fts_boolean_syntax[8]) |
49 | #define FTB_LQUOT (fts_boolean_syntax[10]) |
50 | #define FTB_RQUOT (fts_boolean_syntax[11]) |
51 | |
52 | /** FTS query token */ |
53 | typedef struct st_ft_word { |
54 | uchar* pos; /*!< word start pointer */ |
55 | uint len; /*!< word len */ |
56 | double weight; /*!< word weight, unused in innodb */ |
57 | } FT_WORD; |
58 | |
59 | /** Tokenizer for ngram referring to ft_get_word(ft_parser.c) in MyISAM. |
60 | Differences: a. code format changed; b. stopword processing removed. |
61 | @param[in] cs charset |
62 | @param[in,out] start doc start pointer |
63 | @param[in,out] end doc end pointer |
64 | @param[in,out] word token |
65 | @param[in,out] info token info |
66 | @retval 0 eof |
67 | @retval 1 word found |
68 | @retval 2 left bracket |
69 | @retval 3 right bracket |
70 | @retval 4 stopword found */ |
71 | inline |
72 | uchar |
73 | fts_get_word( |
74 | const CHARSET_INFO* cs, |
75 | uchar** start, |
76 | uchar* end, |
77 | FT_WORD* word, |
78 | MYSQL_FTPARSER_BOOLEAN_INFO* |
79 | info) |
80 | { |
81 | uchar* doc = *start; |
82 | int ctype; |
83 | uint mwc; |
84 | uint length; |
85 | int mbl; |
86 | |
87 | info->yesno = (FTB_YES ==' ') ? 1 : (info->quot != 0); |
88 | info->weight_adjust = info->wasign = 0; |
89 | info->type = FT_TOKEN_EOF; |
90 | |
91 | while (doc < end) { |
92 | for (; doc < end; |
93 | doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) { |
94 | mbl = cs->cset->ctype(cs, &ctype, doc, end); |
95 | |
96 | if (true_word_char(ctype, *doc)) { |
97 | break; |
98 | } |
99 | |
100 | if (*doc == FTB_RQUOT && info->quot) { |
101 | *start = doc + 1; |
102 | info->type = FT_TOKEN_RIGHT_PAREN; |
103 | |
104 | return(info->type); |
105 | } |
106 | |
107 | if (!info->quot) { |
108 | if (*doc == FTB_LBR |
109 | || *doc == FTB_RBR |
110 | || *doc == FTB_LQUOT) { |
111 | /* param->prev=' '; */ |
112 | *start = doc + 1; |
113 | if (*doc == FTB_LQUOT) { |
114 | info->quot = (char*)1; |
115 | } |
116 | |
117 | info->type = (*doc == FTB_RBR ? |
118 | FT_TOKEN_RIGHT_PAREN : |
119 | FT_TOKEN_LEFT_PAREN); |
120 | |
121 | return(info->type); |
122 | } |
123 | |
124 | if (info->prev == ' ') { |
125 | if (*doc == FTB_YES) { |
126 | info->yesno = +1; |
127 | continue; |
128 | } else if (*doc == FTB_EGAL) { |
129 | info->yesno = 0; |
130 | continue; |
131 | } else if (*doc == FTB_NO) { |
132 | info->yesno = -1; |
133 | continue; |
134 | } else if (*doc == FTB_INC) { |
135 | info->weight_adjust++; |
136 | continue; |
137 | } else if (*doc == FTB_DEC) { |
138 | info->weight_adjust--; |
139 | continue; |
140 | } else if (*doc == FTB_NEG) { |
141 | info->wasign = !info->wasign; |
142 | continue; |
143 | } |
144 | } |
145 | } |
146 | |
147 | info->prev = char(*doc); |
148 | info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != 0); |
149 | info->weight_adjust = info->wasign = 0; |
150 | } |
151 | |
152 | mwc = length = 0; |
153 | for (word->pos = doc; |
154 | doc < end; |
155 | length++, doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) { |
156 | mbl = cs->cset->ctype(cs, &ctype, doc, end); |
157 | |
158 | if (true_word_char(ctype, *doc)) { |
159 | mwc = 0; |
160 | } else if (!misc_word_char(*doc) || mwc) { |
161 | break; |
162 | } else { |
163 | mwc++; |
164 | } |
165 | } |
166 | |
167 | /* Be sure *prev is true_word_char. */ |
168 | info->prev = 'A'; |
169 | word->len = (uint)(doc-word->pos) - mwc; |
170 | |
171 | if ((info->trunc = (doc < end && *doc == FTB_TRUNC))) { |
172 | doc++; |
173 | } |
174 | |
175 | /* We don't check stopword here. */ |
176 | *start = doc; |
177 | info->type = FT_TOKEN_WORD; |
178 | |
179 | return(info->type); |
180 | } |
181 | |
182 | if (info->quot) { |
183 | *start = doc; |
184 | info->type = FT_TOKEN_RIGHT_PAREN; |
185 | } |
186 | |
187 | return(info->type); |
188 | } |
189 | |