ft_parser.c source code [MariaDB/storage/myisam/ft_parser.c]

1	/ Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.*
2
3	This program is free software; you can redistribute it and/or modify
4	it under the terms of the GNU General Public License as published by
5	the Free Software Foundation; version 2 of the License.
6
7	This program is distributed in the hope that it will be useful,
8	but WITHOUT ANY WARRANTY; without even the implied warranty of
9	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	GNU General Public License for more details.
11
12	You should have received a copy of the GNU General Public License
13	along with this program; if not, write to the Free Software
14	Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA /*
15
16	/ Written by Sergei A. Golubchik, who has a shared copyright to this code /
17
18	#include "ftdefs.h"
19
20	typedef struct st_ft_docstat {
21	FT_WORD *list;
22	uint uniq;
23	double sum;
24	} FT_DOCSTAT;
25
26	typedef struct st_my_ft_parser_param
27	{
28	TREE *wtree;
29	MEM_ROOT *mem_root;
30	} MY_FT_PARSER_PARAM;
31
32	static int FT_WORD_cmp(CHARSET_INFO* cs, FT_WORD w1, FT_WORD w2)
33	{
34	return ha_compare_text(cs, (uchar*) w1->pos, w1->len,
35	(uchar*) w2->pos, w2->len, `0`);
36	}
37
38	static int walk_and_copy(FT_WORD word,uint32 count,FT_DOCSTAT docstat)
39	{
40	word->weight=LWS_IN_USE;
41	docstat->sum+=word->weight;
42	memcpy((docstat->list)++, word, sizeof(FT_WORD));
43	return `0`;
44	}
45
46	/ transforms tree of words into the array, applying normalization /
47
48	FT_WORD * ft_linearize(TREE wtree, MEM_ROOT mem_root)
49	{
50	FT_WORD wlist,p;
51	FT_DOCSTAT docstat;
52	DBUG_ENTER("ft_linearize");
53
54	if ((wlist=(FT_WORD ) alloc_root(mem_root, sizeof(FT_WORD)
55	(`1`+wtree->elements_in_tree))))
56	{
57	docstat.list=wlist;
58	docstat.uniq=wtree->elements_in_tree;
59	docstat.sum=`0`;
60	tree_walk(wtree,(tree_walk_action)&walk_and_copy,&docstat,left_root_right);
61	}
62	delete_tree(wtree, `0`);
63	if (!wlist)
64	DBUG_RETURN(NULL);
65
66	docstat.list->pos=NULL;
67
68	for (p=wlist;p->pos;p++)
69	{
70	p->weight=PRENORM_IN_USE;
71	}
72
73	for (p=wlist;p->pos;p++)
74	{
75	p->weight/=NORM_IN_USE;
76	}
77
78	DBUG_RETURN(wlist);
79	}
80
81	my_bool ft_boolean_check_syntax_string(const uchar *str)
82	{
83	uint i, j;
84
85	if (!str \|\|
86	(strlen((char) str)+`1` != sizeof*(DEFAULT_FTB_SYNTAX)) \|\|
87	(str[`0`] != `' '` && str[`1`] != `' '`))
88	return `1`;
89	for (i=`0`; i<sizeof(DEFAULT_FTB_SYNTAX); i++)
90	{
91	/ limiting to 7-bit ascii only /
92	if ((unsigned char)(str[i]) > `127` \|\| my_isalnum(default_charset_info, str[i]))
93	return `1`;
94	for (j=`0`; j<i; j++)
95	if (str[i] == str[j] && (i != `11` \|\| j != `10`))
96	return `1`;
97	}
98	return `0`;
99	}
100
101	/*
102	RETURN VALUE
103	0 - eof
104	1 - word found
105	2 - left bracket
106	3 - right bracket
107	4 - stopword found
108	*/
109	uchar ft_get_word(CHARSET_INFO cs, const* uchar *start, const* uchar *end,
110	FT_WORD word, MYSQL_FTPARSER_BOOLEAN_INFO param)
111	{
112	const uchar doc=start;
113	int ctype;
114	uint mwc, length;
115	int mbl;
116
117	param->yesno=(FTB_YES==`' '`) ? `1` : (param->quot != `0`);
118	param->weight_adjust= param->wasign= `0`;
119	param->type= FT_TOKEN_EOF;
120
121	while (doc<end)
122	{
123	for (; doc < end; doc+= (mbl > `0` ? mbl : (mbl < `0` ? -mbl : `1`)))
124	{
125	mbl= cs->cset->ctype(cs, &ctype, (uchar)doc, (uchar)end);
126	if (true_word_char(ctype, *doc))
127	break;
128	if (*doc == FTB_RQUOT && param->quot)
129	{
130	*start=doc+`1`;
131	param->type= FT_TOKEN_RIGHT_PAREN;
132	goto ret;
133	}
134	if (!param->quot)
135	{
136	if (doc == FTB_LBR \|\| doc == FTB_RBR \|\| *doc == FTB_LQUOT)
137	{
138	/ param->prev=' '; /
139	*start=doc+`1`;
140	if (*doc == FTB_LQUOT)
141	param->quot= (char*) `1`;
142	param->type= (*doc == FTB_RBR ? FT_TOKEN_RIGHT_PAREN : FT_TOKEN_LEFT_PAREN);
143	goto ret;
144	}
145	if (param->prev == `' '`)
146	{
147	if (doc == FTB_YES ) { param->yesno=+`1`; continue; } else*
148	if (doc == FTB_EGAL) { param->yesno= `0`; continue; } else*
149	if (doc == FTB_NO ) { param->yesno=-`1`; continue; } else*
150	if (doc == FTB_INC ) { param->weight_adjust++; continue; } else*
151	if (doc == FTB_DEC ) { param->weight_adjust--; continue; } else*
152	if (doc == FTB_NEG ) { param->wasign= !param->wasign; continue*; }
153	}
154	}
155	param->prev=*doc;
156	param->yesno=(FTB_YES==`' '`) ? `1` : (param->quot != `0`);
157	param->weight_adjust= param->wasign= `0`;
158	}
159
160	mwc=length=`0`;
161	for (word->pos= doc; doc < end; length++,
162	doc+= (mbl > `0` ? mbl : (mbl < `0` ? -mbl : `1`)))
163	{
164	mbl= cs->cset->ctype(cs, &ctype, (uchar)doc, (uchar)end);
165	if (true_word_char(ctype, *doc))
166	mwc=`0`;
167	else if (!misc_word_char(*doc) \|\| mwc)
168	break;
169	else
170	mwc++;
171	}
172	param->prev=`'A'`; / be sure prev is true_word_char /*
173	word->len= (uint)(doc-word->pos) - mwc;
174	if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
175	doc++;
176
177	if (((length >= ft_min_word_len && !is_stopword((char*) word->pos,
178	word->len))
179	\|\| param->trunc) && length < ft_max_word_len)
180	{
181	*start=doc;
182	param->type= FT_TOKEN_WORD;
183	goto ret;
184	}
185	else if (length) / make sure length > 0 (if start contains spaces only) /
186	{
187	*start= doc;
188	param->type= FT_TOKEN_STOPWORD;
189	goto ret;
190	}
191	}
192	if (param->quot)
193	{
194	*start= doc;
195	param->type= `3`; / FT_RBR /
196	goto ret;
197	}
198	ret:
199	return param->type;
200	}
201
202	uchar ft_simple_get_word(CHARSET_INFO cs, uchar start, const* uchar *end,
203	FT_WORD *word, my_bool skip_stopwords)
204	{
205	uchar doc= start;
206	uint mwc, length;
207	int mbl;
208	int ctype;
209	DBUG_ENTER("ft_simple_get_word");
210
211	do
212	{
213	for (;; doc+= (mbl > `0` ? mbl : (mbl < `0` ? -mbl : `1`)))
214	{
215	if (doc >= end)
216	DBUG_RETURN(`0`);
217	mbl= cs->cset->ctype(cs, &ctype, (uchar)doc, (uchar)end);
218	if (true_word_char(ctype, *doc))
219	break;
220	}
221
222	mwc= length= `0`;
223	for (word->pos= doc; doc < end; length++,
224	doc+= (mbl > `0` ? mbl : (mbl < `0` ? -mbl : `1`)))
225	{
226	mbl= cs->cset->ctype(cs, &ctype, (uchar)doc, (uchar)end);
227	if (true_word_char(ctype, *doc))
228	mwc= `0`;
229	else if (!misc_word_char(*doc) \|\| mwc)
230	break;
231	else
232	mwc++;
233	}
234
235	word->len= (uint)(doc-word->pos) - mwc;
236
237	if (skip_stopwords == FALSE \|\|
238	(length >= ft_min_word_len && length < ft_max_word_len &&
239	!is_stopword((char*) word->pos, word->len)))
240	{
241	*start= doc;
242	DBUG_RETURN(`1`);
243	}
244	} while (doc < end);
245	DBUG_RETURN(`0`);
246	}
247
248	void ft_parse_init(TREE wtree, CHARSET_INFO cs)
249	{
250	DBUG_ENTER("ft_parse_init");
251	if (!is_tree_inited(wtree))
252	init_tree(wtree, `0`, `0`, sizeof(FT_WORD), (qsort_cmp2)&FT_WORD_cmp, `0`,
253	(void*)cs, MYF(`0`));
254	DBUG_VOID_RETURN;
255	}
256
257
258	static int ft_add_word(MYSQL_FTPARSER_PARAM *param,
259	const char word, int* word_len,
260	MYSQL_FTPARSER_BOOLEAN_INFO boolean_info __attribute__*((unused)))
261	{
262	TREE *wtree;
263	FT_WORD w;
264	MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam;
265	DBUG_ENTER("ft_add_word");
266	wtree= ft_param->wtree;
267	if (param->flags & MYSQL_FTFLAGS_NEED_COPY)
268	{
269	uchar *ptr;
270	DBUG_ASSERT(wtree->with_delete == `0`);
271	ptr= (uchar *)alloc_root(ft_param->mem_root, word_len);
272	memcpy(ptr, word, word_len);
273	w.pos= ptr;
274	}
275	else
276	w.pos= (uchar*) word;
277	w.len= word_len;
278	if (!tree_insert(wtree, &w, `0`, wtree->custom_arg))
279	{
280	delete_tree(wtree, `0`);
281	DBUG_RETURN(`1`);
282	}
283	DBUG_RETURN(`0`);
284	}
285
286
287	static int ft_parse_internal(MYSQL_FTPARSER_PARAM *param,
288	const char doc_arg, int* doc_len)
289	{
290	uchar doc= (uchar) doc_arg;
291	uchar *end= doc + doc_len;
292	MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam;
293	TREE *wtree= ft_param->wtree;
294	FT_WORD w;
295	DBUG_ENTER("ft_parse_internal");
296
297	while (ft_simple_get_word(wtree->custom_arg, &doc, end, &w, TRUE))
298	if (param->mysql_add_word(param, (char) w.pos, (int*)w.len, `0`))
299	DBUG_RETURN(`1`);
300	DBUG_RETURN(`0`);
301	}
302
303
304	int ft_parse(TREE wtree, uchar doc, int doclen,
305	struct st_mysql_ftparser *parser,
306	MYSQL_FTPARSER_PARAM param, MEM_ROOT mem_root)
307	{
308	MY_FT_PARSER_PARAM my_param;
309	DBUG_ENTER("ft_parse");
310	DBUG_ASSERT(parser);
311
312	my_param.wtree= wtree;
313	my_param.mem_root= mem_root;
314
315	param->mysql_parse= ft_parse_internal;
316	param->mysql_add_word= ft_add_word;
317	param->mysql_ftparam= &my_param;
318	param->cs= wtree->custom_arg;
319	param->doc= (char*) doc;
320	param->length= doclen;
321	param->mode= MYSQL_FTPARSER_SIMPLE_MODE;
322	DBUG_RETURN(parser->parse(param));
323	}
324
325
326	#define MAX_PARAM_NR 2
327
328	MYSQL_FTPARSER_PARAM* ftparser_alloc_param(MI_INFO *info)
329	{
330	if (!info->ftparser_param)
331	{
332	/*
333	. info->ftparser_param can not be zero after the initialization,
334	because it always includes built-in fulltext parser. And built-in
335	parser can be called even if the table has no fulltext indexes and
336	no varchar/text fields.
337
338	ftb_find_relevance... parser (ftb_find_relevance_parse,
339	ftb_find_relevance_add_word) calls ftb_check_phrase... parser
340	(ftb_check_phrase_internal, ftb_phrase_add_word). Thus MAX_PARAM_NR=2.
341	*/
342	info->ftparser_param= (MYSQL_FTPARSER_PARAM *)
343	my_malloc(MAX_PARAM_NR * sizeof(MYSQL_FTPARSER_PARAM) *
344	info->s->ftkeys, MYF(MY_WME \| MY_ZEROFILL));
345	init_alloc_root(&info->ft_memroot, "fulltext_parser",
346	FTPARSER_MEMROOT_ALLOC_SIZE, `0`, MYF(`0`));
347	}
348	return info->ftparser_param;
349	}
350
351
352	MYSQL_FTPARSER_PARAM ftparser_call_initializer(MI_INFO info,
353	uint keynr, uint paramnr)
354	{
355	uint32 ftparser_nr;
356	struct st_mysql_ftparser *parser;
357
358	if (!ftparser_alloc_param(info))
359	return `0`;
360
361	if (keynr == NO_SUCH_KEY)
362	{
363	ftparser_nr= `0`;
364	parser= &ft_default_parser;
365	}
366	else
367	{
368	ftparser_nr= info->s->keyinfo[keynr].ftkey_nr;
369	parser= info->s->keyinfo[keynr].parser;
370	}
371	DBUG_ASSERT(paramnr < MAX_PARAM_NR);
372	ftparser_nr= ftparser_nr*MAX_PARAM_NR + paramnr;
373	if (! info->ftparser_param[ftparser_nr].mysql_add_word)
374	{
375	/ Note, that mysql_add_word is used here as a flag:*
376	mysql_add_word == 0 - parser is not initialized
377	mysql_add_word != 0 - parser is initialized, or no
378	initialization needed. /*
379	info->ftparser_param[ftparser_nr].mysql_add_word=
380	(int ()(struct* st_mysql_ftparser_param , const* char , int*,
381	MYSQL_FTPARSER_BOOLEAN_INFO *)) `1`;
382	if (parser->init && parser->init(&info->ftparser_param[ftparser_nr]))
383	return `0`;
384	}
385	return &info->ftparser_param[ftparser_nr];
386	}
387
388	void ftparser_call_deinitializer(MI_INFO *info)
389	{
390	uint i, j, keys= info->s->state.header.keys;
391	free_root(&info->ft_memroot, MYF(`0`));
392	if (! info->ftparser_param)
393	return;
394	for (i= `0`; i < keys; i++)
395	{
396	MI_KEYDEF *keyinfo= &info->s->keyinfo[i];
397	for (j=`0`; j < MAX_PARAM_NR; j++)
398	{
399	MYSQL_FTPARSER_PARAM *ftparser_param=
400	&info->ftparser_param[keyinfo->ftkey_nr * MAX_PARAM_NR + j];
401	if (keyinfo->flag & HA_FULLTEXT && ftparser_param->mysql_add_word)
402	{
403	if (keyinfo->parser->deinit)
404	keyinfo->parser->deinit(ftparser_param);
405	ftparser_param->mysql_add_word= `0`;
406	}
407	else
408	break;
409	}
410	}
411	}
412
413

Browse the source code of MariaDB/storage/myisam/ft_parser.c