1 | /* Copyright (c) 2005 MySQL AB, 2009 Sun Microsystems, Inc. |
2 | Use is subject to license terms. |
3 | |
4 | This program is free software; you can redistribute it and/or modify |
5 | it under the terms of the GNU General Public License as published by |
6 | the Free Software Foundation; version 2 of the License. |
7 | |
8 | This program is distributed in the hope that it will be useful, |
9 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | GNU General Public License for more details. |
12 | |
13 | You should have received a copy of the GNU General Public License |
14 | along with this program; if not, write to the Free Software |
15 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ |
16 | |
17 | #ifndef _my_plugin_ftparser_h |
18 | #define _my_plugin_ftparser_h |
19 | #include "plugin.h" |
20 | |
21 | #ifdef __cplusplus |
22 | extern "C" { |
23 | #endif |
24 | |
25 | /************************************************************************* |
26 | API for Full-text parser plugin. (MYSQL_FTPARSER_PLUGIN) |
27 | */ |
28 | |
/*
  Version number of the full-text parser plugin interface declared in
  this header. The interface_version member of struct st_mysql_ftparser
  is set to a value such as this one.
*/
#define MYSQL_FTPARSER_INTERFACE_VERSION 0x0100
30 | |
31 | /* Parsing modes. Set in MYSQL_FTPARSER_PARAM::mode */ |
enum enum_ftparser_mode {
  /*
    Simple mode: the fast path, used when indexing and for natural
    language queries.

    Return only the words that belong in the index; stopwords and words
    that are too short or too long are left out. The 'boolean_info'
    argument of mysql_add_word() need not be set.
  */
  MYSQL_FTPARSER_SIMPLE_MODE = 0,

  /*
    With-stopwords mode: used by boolean searches for "phrase matching."

    No word may be skipped in this mode: return every word, stopwords
    and too-short/too-long words included. The 'boolean_info' argument
    of mysql_add_word() need not be set.
  */
  MYSQL_FTPARSER_WITH_STOPWORDS = 1,

  /*
    Full boolean mode: used to parse a boolean query string.

    Pass a valid MYSQL_FTPARSER_BOOLEAN_INFO structure as the
    'boolean_info' argument of mysql_add_word(). Usually this means
    recognizing boolean operators in the parsing stream and filling in
    the matching MYSQL_FTPARSER_BOOLEAN_INFO fields. As in
    MYSQL_FTPARSER_WITH_STOPWORDS mode, no word is to be ignored; give
    such a word the FT_TOKEN_STOPWORD token type instead.
  */
  MYSQL_FTPARSER_FULL_BOOLEAN_INFO = 2
};
68 | |
69 | /* |
70 | Token types for boolean mode searching (used for the type member of |
71 | MYSQL_FTPARSER_BOOLEAN_INFO struct) |
72 | |
73 | FT_TOKEN_EOF: End of data. |
74 | FT_TOKEN_WORD: Regular word. |
75 | FT_TOKEN_LEFT_PAREN: Left parenthesis (start of group/sub-expression). |
76 | FT_TOKEN_RIGHT_PAREN: Right parenthesis (end of group/sub-expression). |
77 | FT_TOKEN_STOPWORD: Stopword. |
78 | */ |
79 | |
enum enum_ft_token_type {
  FT_TOKEN_EOF = 0,         /* end of data */
  FT_TOKEN_WORD = 1,        /* regular word */
  FT_TOKEN_LEFT_PAREN = 2,  /* left parenthesis: group/sub-expression starts */
  FT_TOKEN_RIGHT_PAREN = 3, /* right parenthesis: group/sub-expression ends */
  FT_TOKEN_STOPWORD = 4     /* stopword */
};
88 | |
89 | /* |
90 | This structure is used in boolean search mode only. It conveys |
91 | boolean-mode metadata to the MySQL search engine for every word in |
92 | the search query. A valid instance of this structure must be filled |
93 | in by the plugin parser and passed as an argument in the call to |
94 | mysql_add_word (the callback function in the MYSQL_FTPARSER_PARAM |
95 | structure) when a query is parsed in boolean mode. |
96 | |
97 | type: The token type. Should be one of the enum_ft_token_type values. |
98 | |
99 | yesno: Whether the word must be present for a match to occur: |
100 | >0 Must be present |
101 | <0 Must not be present |
102 | 0 Neither; the word is optional but its presence increases the relevance |
103 | With the default settings of the ft_boolean_syntax system variable, |
      >0 corresponds to the '+' operator, <0 corresponds to the '-' operator,
105 | and 0 means neither operator was used. |
106 | |
  weight_adjust: A weighting factor that determines how much a match
    for the word counts. Positive values increase, and negative values
    decrease, the word's relative importance in the query.
110 | |
  wasign: The sign of the word's weight in the query. If it is
    non-negative, a match for the word increases document relevance; if
    it is negative, a match decreases it (the word becomes a "noise
    word": the less of it, the better).
115 | |
116 | trunc: Corresponds to the '*' operator in the default setting of the |
117 | ft_boolean_syntax system variable. |
118 | */ |
119 | |
/*
  Per-word boolean-mode metadata. The parser plugin fills one in and
  passes it as the boolean_info argument of the mysql_add_word callback
  when a query is parsed in boolean mode.
*/
typedef struct st_mysql_ftparser_boolean_info
{
  /* Token type; one of the enum_ft_token_type values. */
  enum enum_ft_token_type type;
  /*
    Presence requirement: >0 the word must be present, <0 it must not be
    present, 0 it is optional but its presence increases the relevance.
  */
  int yesno;
  /*
    Weighting factor for a match on this word: positive values increase,
    negative values decrease the word's relative importance in the query.
  */
  int weight_adjust;
  /*
    Sign of the word's weight: non-negative means a match increases
    document relevance, negative means a match decreases it.
  */
  char wasign;
  /*
    Corresponds to the '*' operator in the default setting of the
    ft_boolean_syntax system variable.
  */
  char trunc;
  /* These are parser state and must be removed. */
  char prev;
  char *quot;
} MYSQL_FTPARSER_BOOLEAN_INFO;
131 | |
/*
  Flag for MYSQL_FTPARSER_PARAM::flags.

  When set, the buffer holding a string (document, word) may be
  overwritten by the caller before the end of the parsing, that is,
  before the st_mysql_ftparser::deinit() call. Code that needs the
  string to survive between two successive calls of the parsing
  function must save its own copy of it. The flag may be set by MySQL
  before calling st_mysql_ftparser::parse(), or it may be set by a
  plugin before calling st_mysql_ftparser_param::mysql_parse() or
  st_mysql_ftparser_param::mysql_add_word().
*/
#define MYSQL_FTFLAGS_NEED_COPY 1
143 | |
144 | /* |
145 | An argument of the full-text parser plugin. This structure is |
146 | filled in by MySQL server and passed to the parsing function of the |
147 | plugin as an in/out parameter. |
148 | |
149 | mysql_parse: A pointer to the built-in parser implementation of the |
150 | server. It's set by the server and can be used by the parser plugin |
151 | to invoke the MySQL default parser. If plugin's role is to extract |
152 | textual data from .doc, .pdf or .xml content, it might extract |
153 | plaintext from the content, and then pass the text to the default |
154 | MySQL parser to be parsed. |
155 | |
156 | mysql_add_word: A server callback to add a new word. When parsing |
157 | a document, the server sets this to point at a function that adds |
158 | the word to MySQL full-text index. When parsing a search query, |
159 | this function will add the new word to the list of words to search |
160 | for. The boolean_info argument can be NULL for all cases except |
161 | when mode is MYSQL_FTPARSER_FULL_BOOLEAN_INFO. |
162 | |
163 | ftparser_state: A generic pointer. The plugin can set it to point |
164 | to information to be used internally for its own purposes. |
165 | |
166 | mysql_ftparam: This is set by the server. It is used by MySQL functions |
167 | called via mysql_parse() and mysql_add_word() callback. The plugin |
168 | should not modify it. |
169 | |
170 | cs: Information about the character set of the document or query string. |
171 | |
172 | doc: A pointer to the document or query string to be parsed. |
173 | |
174 | length: Length of the document or query string, in bytes. |
175 | |
176 | flags: See MYSQL_FTFLAGS_* constants above. |
177 | |
  mode: The parsing mode: simple, with stopwords, or with full boolean
    info. See enum_ftparser_mode above.
180 | */ |
181 | |
/*
  Argument block handed to the plugin's parsing function. The server
  fills it in; the plugin uses it as an in/out parameter.
  NOTE(review): the meaning of the callbacks' int return values is not
  documented in this header - confirm against the server sources.
*/
typedef struct st_mysql_ftparser_param
{
  /*
    The server's built-in parser. Set by the server; a plugin may call it
    to have text (for instance plaintext it extracted from other content)
    parsed by the MySQL default parser.
  */
  int (*mysql_parse)(struct st_mysql_ftparser_param *,
                     const char *doc, int doc_len);
  /*
    Server callback that adds a new word: to the full-text index when a
    document is parsed, or to the list of words to search for when a
    query is parsed. boolean_info may be NULL except when mode is
    MYSQL_FTPARSER_FULL_BOOLEAN_INFO.
  */
  int (*mysql_add_word)(struct st_mysql_ftparser_param *,
                        const char *word, int word_len,
                        MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info);
  /* Generic pointer the plugin may use for its own internal state. */
  void *ftparser_state;
  /*
    Set by the server and used by the functions behind mysql_parse() and
    mysql_add_word(). The plugin should not modify it.
  */
  void *mysql_ftparam;
  /* Character set information for the document or query string. */
  const struct charset_info_st *cs;
  /* The document or query string to be parsed. */
  const char *doc;
  /* Length of the document or query string, in bytes. */
  int length;
  /* MYSQL_FTFLAGS_* constants. */
  unsigned int flags;
  /* The parsing mode; see enum_ftparser_mode. */
  enum enum_ftparser_mode mode;
} MYSQL_FTPARSER_PARAM;
197 | |
198 | /* |
199 | Full-text parser descriptor. |
200 | |
201 | interface_version is, e.g., MYSQL_FTPARSER_INTERFACE_VERSION. |
202 | The parsing, initialization, and deinitialization functions are |
203 | invoked per SQL statement for which the parser is used. |
204 | */ |
205 | |
/*
  Descriptor of a full-text parser: an interface version plus the three
  entry points, which are invoked per SQL statement for which the parser
  is used.
*/
struct st_mysql_ftparser
{
  /* Interface version, e.g. MYSQL_FTPARSER_INTERFACE_VERSION. */
  int interface_version;
  /* Parsing function. */
  int (*parse)(MYSQL_FTPARSER_PARAM *param);
  /* Initialization function. */
  int (*init)(MYSQL_FTPARSER_PARAM *param);
  /* Deinitialization function. */
  int (*deinit)(MYSQL_FTPARSER_PARAM *param);
};
213 | |
214 | |
215 | #ifdef __cplusplus |
216 | } |
217 | #endif |
218 | |
219 | #endif |
220 | |
221 | |