fts0tokenize.h source code [MariaDB/storage/innobase/include/fts0tokenize.h]

1	/*****************************************************************************
2
3	Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
4
5	This program is free software; you can redistribute it and/or modify it under
6	the terms of the GNU General Public License as published by the Free Software
7	Foundation; version 2 of the License.
8
9	This program is distributed in the hope that it will be useful, but WITHOUT
10	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11	FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12
13	You should have received a copy of the GNU General Public License along with
14	this program; if not, write to the Free Software Foundation, Inc.,
15	51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
16
17	*****************************************************************************/
18
/****************************************************************//**
@file include/fts0tokenize.h
Full Text Search plugin tokenizer, modeled on the MyISAM tokenizer

Created 2014/11/17 Shaohua Wang
***********************************************************************/
25
26	#include "ft_global.h"
27	#include "mysql/plugin_ftparser.h"
28	#include "m_ctype.h"
29
/* Macros and structs below are from ftdefs.h in MyISAM */

/** Check whether a character is a true word character.
@param[in]	c	ctype flags of the character
@param[in]	ch	the character (its first byte)
@return nonzero if any of the _MY_U, _MY_L or _MY_NMR flags is set in c,
or if ch is an underscore */
#define true_word_char(c, ch)	\
	(((c) & (_MY_U | _MY_L | _MY_NMR)) || (ch) == '_')

/** Check whether a character is a misc word character.
Always evaluates to 0: no character is treated as a misc word character. */
#define misc_word_char(X)	0
36
37	/* Boolean search syntax /
38	static const char* fts_boolean_syntax = DEFAULT_FTB_SYNTAX;
39
40	#define FTB_YES (fts_boolean_syntax[0])
41	#define FTB_EGAL (fts_boolean_syntax[1])
42	#define FTB_NO (fts_boolean_syntax[2])
43	#define FTB_INC (fts_boolean_syntax[3])
44	#define FTB_DEC (fts_boolean_syntax[4])
45	#define FTB_LBR (fts_boolean_syntax[5])
46	#define FTB_RBR (fts_boolean_syntax[6])
47	#define FTB_NEG (fts_boolean_syntax[7])
48	#define FTB_TRUNC (fts_boolean_syntax[8])
49	#define FTB_LQUOT (fts_boolean_syntax[10])
50	#define FTB_RQUOT (fts_boolean_syntax[11])
51
52	/* FTS query token /
53	typedef struct st_ft_word {
54	uchar* pos; /!< word start pointer /
55	uint len; /!< word len /
56	double weight; /!< word weight, unused in innodb /
57	} FT_WORD;
58
59	/* Tokenizer for ngram referring to ft_get_word(ft_parser.c) in MyISAM.*
60	Differences: a. code format changed; b. stopword processing removed.
61	@param[in] cs charset
62	@param[in,out] start doc start pointer
63	@param[in,out] end doc end pointer
64	@param[in,out] word token
65	@param[in,out] info token info
66	@retval 0 eof
67	@retval 1 word found
68	@retval 2 left bracket
69	@retval 3 right bracket
70	@retval 4 stopword found /*
71	inline
72	uchar
73	fts_get_word(
74	const CHARSET_INFO* cs,
75	uchar** start,
76	uchar* end,
77	FT_WORD* word,
78	MYSQL_FTPARSER_BOOLEAN_INFO*
79	info)
80	{
81	uchar* doc = *start;
82	int ctype;
83	uint mwc;
84	uint length;
85	int mbl;
86
87	info->yesno = (FTB_YES ==`' '`) ? `1` : (info->quot != `0`);
88	info->weight_adjust = info->wasign = `0`;
89	info->type = FT_TOKEN_EOF;
90
91	while (doc < end) {
92	for (; doc < end;
93	doc += (mbl > `0` ? mbl : (mbl < `0` ? -mbl : `1`))) {
94	mbl = cs->cset->ctype(cs, &ctype, doc, end);
95
96	if (true_word_char(ctype, *doc)) {
97	break;
98	}
99
100	if (*doc == FTB_RQUOT && info->quot) {
101	*start = doc + `1`;
102	info->type = FT_TOKEN_RIGHT_PAREN;
103
104	return(info->type);
105	}
106
107	if (!info->quot) {
108	if (*doc == FTB_LBR
109	\|\| *doc == FTB_RBR
110	\|\| *doc == FTB_LQUOT) {
111	/ param->prev=' '; /
112	*start = doc + `1`;
113	if (*doc == FTB_LQUOT) {
114	info->quot = (char*)`1`;
115	}
116
117	info->type = (*doc == FTB_RBR ?
118	FT_TOKEN_RIGHT_PAREN :
119	FT_TOKEN_LEFT_PAREN);
120
121	return(info->type);
122	}
123
124	if (info->prev == `' '`) {
125	if (*doc == FTB_YES) {
126	info->yesno = +`1`;
127	continue;
128	} else if (*doc == FTB_EGAL) {
129	info->yesno = `0`;
130	continue;
131	} else if (*doc == FTB_NO) {
132	info->yesno = -`1`;
133	continue;
134	} else if (*doc == FTB_INC) {
135	info->weight_adjust++;
136	continue;
137	} else if (*doc == FTB_DEC) {
138	info->weight_adjust--;
139	continue;
140	} else if (*doc == FTB_NEG) {
141	info->wasign = !info->wasign;
142	continue;
143	}
144	}
145	}
146
147	info->prev = char(*doc);
148	info->yesno = (FTB_YES == `' '`) ? `1` : (info->quot != `0`);
149	info->weight_adjust = info->wasign = `0`;
150	}
151
152	mwc = length = `0`;
153	for (word->pos = doc;
154	doc < end;
155	length++, doc += (mbl > `0` ? mbl : (mbl < `0` ? -mbl : `1`))) {
156	mbl = cs->cset->ctype(cs, &ctype, doc, end);
157
158	if (true_word_char(ctype, *doc)) {
159	mwc = `0`;
160	} else if (!misc_word_char(*doc) \|\| mwc) {
161	break;
162	} else {
163	mwc++;
164	}
165	}
166
167	/ Be sure prev is true_word_char. /*
168	info->prev = `'A'`;
169	word->len = (uint)(doc-word->pos) - mwc;
170
171	if ((info->trunc = (doc < end && *doc == FTB_TRUNC))) {
172	doc++;
173	}
174
175	/ We don't check stopword here. /
176	*start = doc;
177	info->type = FT_TOKEN_WORD;
178
179	return(info->type);
180	}
181
182	if (info->quot) {
183	*start = doc;
184	info->type = FT_TOKEN_RIGHT_PAREN;
185	}
186
187	return(info->type);
188	}
189

Browse the source code of MariaDB/storage/innobase/include/fts0tokenize.h