token_cursor.c source code [MariaDB/storage/mroonga/vendor/groonga/lib/token_cursor.c]

1	/* -*- c-basic-offset: 2 -*- */
2	/*
3	Copyright(C) 2009-2017 Brazil
4
5	This library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License version 2.1 as published by the Free Software Foundation.
8
9	This library is distributed in the hope that it will be useful,
10	but WITHOUT ANY WARRANTY; without even the implied warranty of
11	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12	Lesser General Public License for more details.
13
14	You should have received a copy of the GNU Lesser General Public
15	License along with this library; if not, write to the Free Software
16	Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17	*/
18	#include "grn_token_cursor.h"
19	#include "grn_string.h"
20	#include "grn_pat.h"
21	#include "grn_dat.h"
22
23	static void
24	grn_token_cursor_open_initialize_token_filters(grn_ctx *ctx,
25	grn_token_cursor *token_cursor)
26	{
27	grn_obj *token_filters = token_cursor->token_filter.objects;
28	unsigned int i, n_token_filters;
29
30	token_cursor->token_filter.data = NULL;
31
32	if (token_filters) {
33	n_token_filters = GRN_BULK_VSIZE(token_filters) / sizeof(grn_obj *);
34	} else {
35	n_token_filters = `0`;
36	}
37
38	if (n_token_filters == `0`) {
39	return;
40	}
41
42	token_cursor->token_filter.data = GRN_CALLOC(sizeof(void ) n_token_filters);
43	if (!token_cursor->token_filter.data) {
44	return;
45	}
46
47	for (i = `0`; i < n_token_filters; i++) {
48	grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i);
49	grn_proc token_filter = (grn_proc )token_filter_object;
50
51	token_cursor->token_filter.data[i] =
52	token_filter->callbacks.token_filter.init(ctx,
53	token_cursor->table,
54	token_cursor->mode);
55	}
56	}
57
58	grn_token_cursor *
59	grn_token_cursor_open(grn_ctx ctx, grn_obj table,
60	const char *str, size_t str_len,
61	grn_tokenize_mode mode, unsigned int flags)
62	{
63	grn_token_cursor *token_cursor;
64	grn_encoding encoding;
65	grn_obj *tokenizer;
66	grn_obj *normalizer;
67	grn_obj *token_filters;
68	grn_table_flags table_flags;
69	if (grn_table_get_info(ctx, table, &table_flags, &encoding, &tokenizer,
70	&normalizer, &token_filters)) {
71	return NULL;
72	}
73	if (!(token_cursor = GRN_MALLOC(sizeof(grn_token_cursor)))) { return NULL; }
74	token_cursor->table = table;
75	token_cursor->mode = mode;
76	token_cursor->encoding = encoding;
77	token_cursor->tokenizer = tokenizer;
78	token_cursor->token_filter.objects = token_filters;
79	token_cursor->token_filter.data = NULL;
80	token_cursor->orig = (const unsigned char *)str;
81	token_cursor->orig_blen = str_len;
82	token_cursor->curr = NULL;
83	token_cursor->nstr = NULL;
84	token_cursor->curr_size = `0`;
85	token_cursor->pos = -`1`;
86	token_cursor->status = GRN_TOKEN_CURSOR_DOING;
87	token_cursor->force_prefix = GRN_FALSE;
88	if (tokenizer) {
89	grn_obj str_, flags_, mode_;
90	GRN_TEXT_INIT(&str_, GRN_OBJ_DO_SHALLOW_COPY);
91	GRN_TEXT_SET_REF(&str_, str, str_len);
92	GRN_UINT32_INIT(&flags_, `0`);
93	GRN_UINT32_SET(ctx, &flags_, flags);
94	GRN_UINT32_INIT(&mode_, `0`);
95	GRN_UINT32_SET(ctx, &mode_, mode);
96	token_cursor->pctx.caller = NULL;
97	token_cursor->pctx.user_data.ptr = NULL;
98	token_cursor->pctx.proc = (grn_proc *)tokenizer;
99	token_cursor->pctx.hooks = NULL;
100	token_cursor->pctx.currh = NULL;
101	token_cursor->pctx.phase = PROC_INIT;
102	grn_ctx_push(ctx, &mode_);
103	grn_ctx_push(ctx, &str_);
104	grn_ctx_push(ctx, &flags_);
105	((grn_proc *)tokenizer)->funcs[PROC_INIT](ctx, `1`, &table, &token_cursor->pctx.user_data);
106	grn_obj_close(ctx, &flags_);
107	grn_obj_close(ctx, &str_);
108	grn_obj_close(ctx, &mode_);
109	} else {
110	int nflags = `0`;
111	token_cursor->nstr = grn_string_open_(ctx, str, str_len,
112	normalizer,
113	nflags,
114	token_cursor->encoding);
115	if (token_cursor->nstr) {
116	const char *normalized;
117	grn_string_get_normalized(ctx, token_cursor->nstr,
118	&normalized, &(token_cursor->curr_size), NULL);
119	token_cursor->curr = (const unsigned char *)normalized;
120	} else {
121	ERR(GRN_TOKENIZER_ERROR,
122	"[token-cursor][open] failed to grn_string_open()");
123	}
124	}
125
126	if (ctx->rc == GRN_SUCCESS) {
127	grn_token_cursor_open_initialize_token_filters(ctx, token_cursor);
128	}
129
130	if (ctx->rc) {
131	grn_token_cursor_close(ctx, token_cursor);
132	token_cursor = NULL;
133	}
134	return token_cursor;
135	}
136
137	static int
138	grn_token_cursor_next_apply_token_filters(grn_ctx *ctx,
139	grn_token_cursor *token_cursor,
140	grn_obj *current_token_data,
141	grn_obj *status)
142	{
143	grn_obj *token_filters = token_cursor->token_filter.objects;
144	unsigned int i, n_token_filters;
145	grn_token current_token;
146	grn_token next_token;
147
148	if (token_filters) {
149	n_token_filters = GRN_BULK_VSIZE(token_filters) / sizeof(grn_obj *);
150	} else {
151	n_token_filters = `0`;
152	}
153
154	GRN_TEXT_INIT(&(current_token.data), GRN_OBJ_DO_SHALLOW_COPY);
155	GRN_TEXT_SET(ctx, &(current_token.data),
156	GRN_TEXT_VALUE(current_token_data),
157	GRN_TEXT_LEN(current_token_data));
158	current_token.status = GRN_INT32_VALUE(status);
159	GRN_TEXT_INIT(&(next_token.data), GRN_OBJ_DO_SHALLOW_COPY);
160	GRN_TEXT_SET(ctx, &(next_token.data),
161	GRN_TEXT_VALUE(&(current_token.data)),
162	GRN_TEXT_LEN(&(current_token.data)));
163	next_token.status = current_token.status;
164
165	for (i = `0`; i < n_token_filters; i++) {
166	grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i);
167	grn_proc token_filter = (grn_proc )token_filter_object;
168	void *data = token_cursor->token_filter.data[i];
169
170	#define SKIP_FLAGS\
171	(GRN_TOKEN_SKIP \|\
172	GRN_TOKEN_SKIP_WITH_POSITION)
173	if (current_token.status & SKIP_FLAGS) {
174	break;
175	}
176	#undef SKIP_FLAGS
177
178	token_filter->callbacks.token_filter.filter(ctx,
179	&current_token,
180	&next_token,
181	data);
182	GRN_TEXT_SET(ctx, &(current_token.data),
183	GRN_TEXT_VALUE(&(next_token.data)),
184	GRN_TEXT_LEN(&(next_token.data)));
185	current_token.status = next_token.status;
186	}
187
188	token_cursor->curr =
189	(const unsigned char *)GRN_TEXT_VALUE(&(current_token.data));
190	token_cursor->curr_size = GRN_TEXT_LEN(&(current_token.data));
191
192	return current_token.status;
193	}
194
195	grn_id
196	grn_token_cursor_next(grn_ctx ctx, grn_token_cursor token_cursor)
197	{
198	int status;
199	grn_id tid = GRN_ID_NIL;
200	grn_obj *table = token_cursor->table;
201	grn_obj *tokenizer = token_cursor->tokenizer;
202	while (token_cursor->status != GRN_TOKEN_CURSOR_DONE) {
203	if (tokenizer) {
204	grn_obj curr_, stat_;
205	((grn_proc *)tokenizer)->funcs[PROC_NEXT](ctx, `1`, &table, &token_cursor->pctx.user_data);
206	stat_ = grn_ctx_pop(ctx);
207	curr_ = grn_ctx_pop(ctx);
208	status = grn_token_cursor_next_apply_token_filters(ctx, token_cursor,
209	curr_, stat_);
210	token_cursor->status =
211	((status & GRN_TOKEN_LAST) \|\|
212	(token_cursor->mode == GRN_TOKENIZE_GET &&
213	(status & GRN_TOKEN_REACH_END)))
214	? GRN_TOKEN_CURSOR_DONE : GRN_TOKEN_CURSOR_DOING;
215	token_cursor->force_prefix = GRN_FALSE;
216	#define SKIP_FLAGS \
217	(GRN_TOKEN_SKIP \| GRN_TOKEN_SKIP_WITH_POSITION)
218	if (status & SKIP_FLAGS) {
219	if (status & GRN_TOKEN_SKIP) {
220	token_cursor->pos++;
221	}
222	if (token_cursor->status == GRN_TOKEN_CURSOR_DONE && tid == GRN_ID_NIL) {
223	token_cursor->status = GRN_TOKEN_CURSOR_DONE_SKIP;
224	break;
225	} else {
226	continue;
227	}
228	}
229	#undef SKIP_FLAGS
230	if (status & GRN_TOKEN_FORCE_PREFIX) {
231	token_cursor->force_prefix = GRN_TRUE;
232	}
233	if (token_cursor->curr_size == `0`) {
234	if (token_cursor->status != GRN_TOKEN_CURSOR_DONE) {
235	char tokenizer_name[GRN_TABLE_MAX_KEY_SIZE];
236	int tokenizer_name_length;
237	tokenizer_name_length =
238	grn_obj_name(ctx, token_cursor->tokenizer,
239	tokenizer_name, GRN_TABLE_MAX_KEY_SIZE);
240	GRN_LOG(ctx, GRN_WARN,
241	"[token_next] ignore an empty token: <%.s>: <%.s>",
242	tokenizer_name_length, tokenizer_name,
243	token_cursor->orig_blen, token_cursor->orig);
244	}
245	continue;
246	}
247	if (token_cursor->curr_size > GRN_TABLE_MAX_KEY_SIZE) {
248	GRN_LOG(ctx, GRN_WARN,
249	"[token_next] ignore too long token. "
250	"Token must be less than or equal to %d: <%d>(<%.*s>)",
251	GRN_TABLE_MAX_KEY_SIZE,
252	token_cursor->curr_size,
253	token_cursor->curr_size, token_cursor->curr);
254	continue;
255	}
256	if (status & GRN_TOKEN_UNMATURED) {
257	if (status & GRN_TOKEN_OVERLAP) {
258	if (token_cursor->mode == GRN_TOKENIZE_GET) {
259	token_cursor->pos++;
260	continue;
261	}
262	} else {
263	if (status & GRN_TOKEN_REACH_END) {
264	token_cursor->force_prefix = GRN_TRUE;
265	}
266	}
267	}
268	} else {
269	token_cursor->status = GRN_TOKEN_CURSOR_DONE;
270	}
271	if (token_cursor->mode == GRN_TOKENIZE_ADD) {
272	switch (table->header.type) {
273	case GRN_TABLE_PAT_KEY :
274	if (grn_io_lock(ctx, ((grn_pat *)table)->io, grn_lock_timeout)) {
275	tid = GRN_ID_NIL;
276	} else {
277	tid = grn_pat_add(ctx, (grn_pat *)table, token_cursor->curr, token_cursor->curr_size,
278	NULL, NULL);
279	grn_io_unlock(((grn_pat *)table)->io);
280	}
281	break;
282	case GRN_TABLE_DAT_KEY :
283	if (grn_io_lock(ctx, ((grn_dat *)table)->io, grn_lock_timeout)) {
284	tid = GRN_ID_NIL;
285	} else {
286	tid = grn_dat_add(ctx, (grn_dat *)table, token_cursor->curr, token_cursor->curr_size,
287	NULL, NULL);
288	grn_io_unlock(((grn_dat *)table)->io);
289	}
290	break;
291	case GRN_TABLE_HASH_KEY :
292	if (grn_io_lock(ctx, ((grn_hash *)table)->io, grn_lock_timeout)) {
293	tid = GRN_ID_NIL;
294	} else {
295	tid = grn_hash_add(ctx, (grn_hash *)table, token_cursor->curr, token_cursor->curr_size,
296	NULL, NULL);
297	grn_io_unlock(((grn_hash *)table)->io);
298	}
299	break;
300	case GRN_TABLE_NO_KEY :
301	if (token_cursor->curr_size == sizeof(grn_id)) {
302	tid = ((grn_id )token_cursor->curr);
303	} else {
304	tid = GRN_ID_NIL;
305	}
306	break;
307	}
308	} else if (token_cursor->mode != GRN_TOKENIZE_ONLY) {
309	switch (table->header.type) {
310	case GRN_TABLE_PAT_KEY :
311	tid = grn_pat_get(ctx, (grn_pat *)table, token_cursor->curr, token_cursor->curr_size, NULL);
312	break;
313	case GRN_TABLE_DAT_KEY :
314	tid = grn_dat_get(ctx, (grn_dat *)table, token_cursor->curr, token_cursor->curr_size, NULL);
315	break;
316	case GRN_TABLE_HASH_KEY :
317	tid = grn_hash_get(ctx, (grn_hash *)table, token_cursor->curr, token_cursor->curr_size, NULL);
318	break;
319	case GRN_TABLE_NO_KEY :
320	if (token_cursor->curr_size == sizeof(grn_id)) {
321	tid = ((grn_id )token_cursor->curr);
322	} else {
323	tid = GRN_ID_NIL;
324	}
325	break;
326	}
327	}
328	if (token_cursor->mode != GRN_TOKENIZE_ONLY &&
329	tid == GRN_ID_NIL && token_cursor->status != GRN_TOKEN_CURSOR_DONE) {
330	token_cursor->status = GRN_TOKEN_CURSOR_NOT_FOUND;
331	}
332	token_cursor->pos++;
333	break;
334	}
335	return tid;
336	}
337
338	static void
339	grn_token_cursor_close_token_filters(grn_ctx *ctx,
340	grn_token_cursor *token_cursor)
341	{
342	grn_obj *token_filters = token_cursor->token_filter.objects;
343	unsigned int i, n_token_filters;
344
345	if (!token_cursor->token_filter.data) {
346	return;
347	}
348
349	if (token_filters) {
350	n_token_filters = GRN_BULK_VSIZE(token_filters) / sizeof(grn_obj *);
351	} else {
352	n_token_filters = `0`;
353	}
354
355	if (n_token_filters == `0`) {
356	return;
357	}
358
359	for (i = `0`; i < n_token_filters; i++) {
360	grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i);
361	grn_proc token_filter = (grn_proc )token_filter_object;
362	void *data = token_cursor->token_filter.data[i];
363
364	token_filter->callbacks.token_filter.fin(ctx, data);
365	}
366	GRN_FREE(token_cursor->token_filter.data);
367	}
368
369	grn_rc
370	grn_token_cursor_close(grn_ctx ctx, grn_token_cursor token_cursor)
371	{
372	if (token_cursor) {
373	if (token_cursor->tokenizer) {
374	((grn_proc *)token_cursor->tokenizer)->funcs[PROC_FIN](ctx, `1`, &token_cursor->table,
375	&token_cursor->pctx.user_data);
376	}
377	grn_token_cursor_close_token_filters(ctx, token_cursor);
378	if (token_cursor->nstr) {
379	grn_obj_close(ctx, token_cursor->nstr);
380	}
381	GRN_FREE(token_cursor);
382	return GRN_SUCCESS;
383	} else {
384	return GRN_INVALID_ARGUMENT;
385	}
386	}
387

Browse the source code of MariaDB/storage/mroonga/vendor/groonga/lib/token_cursor.c