/* -*- c-basic-offset: 2 -*- */
/*
  Copyright(C) 2012-2014 Brazil

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License version 2.1 as published by the Free Software Foundation.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "grn.h"
#include <groonga/tokenizer.h>

#include <string.h>

#include "grn_ctx.h"
#include "grn_db.h"
#include "grn_str.h"
#include "grn_string.h"
#include "grn_token_cursor.h"

/*
  Just for backward compatibility. See grn_plugin_charlen() instead.
 */
int
grn_tokenizer_charlen(grn_ctx *ctx, const char *str_ptr,
                      unsigned int str_length, grn_encoding encoding)
{
  return grn_plugin_charlen(ctx, str_ptr, str_length, encoding);
}

/*
  Just for backward compatibility. See grn_plugin_isspace() instead.
 */
int
grn_tokenizer_isspace(grn_ctx *ctx, const char *str_ptr,
                      unsigned int str_length, grn_encoding encoding)
{
  return grn_plugin_isspace(ctx, str_ptr, str_length, encoding);
}

grn_bool
grn_tokenizer_is_tokenized_delimiter(grn_ctx *ctx,
                                     const char *str_ptr,
                                     unsigned int str_length,
                                     grn_encoding encoding)
{
  if (encoding != GRN_ENC_UTF8) {
    return GRN_FALSE;
  }

  if (str_length != GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8_LEN) {
    return GRN_FALSE;
  }

  return memcmp(str_ptr,
                GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8,
                GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8_LEN) == 0;
}

grn_bool
grn_tokenizer_have_tokenized_delimiter(grn_ctx *ctx,
                                       const char *str_ptr,
                                       unsigned int str_length,
                                       grn_encoding encoding)
{
  int char_length;
  const char *current = str_ptr;
  const char *end = str_ptr + str_length;

  if (encoding != GRN_ENC_UTF8) {
    return GRN_FALSE;
  }

  if (str_length == 0) {
    return GRN_FALSE;
  }

  while ((char_length = grn_charlen_(ctx, current, end, encoding)) > 0) {
    if (grn_tokenizer_is_tokenized_delimiter(ctx,
                                             current, char_length,
                                             encoding)) {
      return GRN_TRUE;
    }
    current += char_length;
  }
  return GRN_FALSE;
}
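
/*
  Usage sketch (illustrative only, not compiled): the "tokenized delimiter"
  is the character defined by GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8 in
  groonga/tokenizer.h. The literal below assumes it is U+FFFE encoded in
  UTF-8; check that header for the authoritative definition. `ctx' is
  assumed to be a valid grn_ctx.

    const char *text = "Hello\xef\xbf\xbeWorld";
    grn_bool have;
    have = grn_tokenizer_have_tokenized_delimiter(ctx,
                                                  text,
                                                  strlen(text),
                                                  GRN_ENC_UTF8);
    // have is GRN_TRUE because `text' contains the delimiter.
 */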

grn_tokenizer_query *
grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
                         unsigned int normalize_flags)
{
  /* The query string, the tokenizer flags and the tokenize mode are
     passed via the context stack. */
  grn_obj *flags = grn_ctx_pop(ctx);
  grn_obj *query_str = grn_ctx_pop(ctx);
  grn_obj *tokenize_mode = grn_ctx_pop(ctx);

  if (query_str == NULL) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }

  if ((num_args < 1) || (args == NULL) || (args[0] == NULL)) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer");
    return NULL;
  }

  {
    grn_tokenizer_query * const query =
      GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_query));
    if (query == NULL) {
      return NULL;
    }
    query->normalized_query = NULL;
    query->query_buf = NULL;
    if (flags) {
      query->flags = GRN_UINT32_VALUE(flags);
    } else {
      query->flags = 0;
    }
    if (tokenize_mode) {
      query->tokenize_mode = GRN_UINT32_VALUE(tokenize_mode);
    } else {
      query->tokenize_mode = GRN_TOKENIZE_ADD;
    }
    query->token_mode = query->tokenize_mode;

    {
      grn_obj * const table = args[0];
      grn_table_flags table_flags;
      grn_encoding table_encoding;
      unsigned int query_length = GRN_TEXT_LEN(query_str);
      char *query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, query_length + 1);
      grn_obj *normalizer = NULL;

      if (query_buf == NULL) {
        GRN_PLUGIN_FREE(ctx, query);
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer] failed to duplicate query");
        return NULL;
      }
      grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL,
                         &normalizer, NULL);
      {
        grn_obj *normalized_query;
        /* Normalize the query with the table's normalizer; tables created
           with KEY_NORMALIZE fall back to GRN_NORMALIZER_AUTO. */
        if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
          normalizer = GRN_NORMALIZER_AUTO;
        }
        normalized_query = grn_string_open_(ctx,
                                            GRN_TEXT_VALUE(query_str),
                                            GRN_TEXT_LEN(query_str),
                                            normalizer,
                                            normalize_flags,
                                            table_encoding);
        if (!normalized_query) {
          GRN_PLUGIN_FREE(ctx, query_buf);
          GRN_PLUGIN_FREE(ctx, query);
          GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                           "[tokenizer] failed to open normalized string");
          return NULL;
        }
        query->normalized_query = normalized_query;
        grn_memcpy(query_buf, GRN_TEXT_VALUE(query_str), query_length);
        query_buf[query_length] = '\0';
        query->query_buf = query_buf;
        query->ptr = query_buf;
        query->length = query_length;
      }
      query->encoding = table_encoding;

      if (query->flags & GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER) {
        const char *normalized_string;
        unsigned int normalized_string_length;

        /* Record whether the normalized query contains the tokenized
           delimiter. */
        grn_string_get_normalized(ctx,
                                  query->normalized_query,
                                  &normalized_string,
                                  &normalized_string_length,
                                  NULL);
        query->have_tokenized_delimiter =
          grn_tokenizer_have_tokenized_delimiter(ctx,
                                                 normalized_string,
                                                 normalized_string_length,
                                                 query->encoding);
      } else {
        query->have_tokenized_delimiter = GRN_FALSE;
      }
    }
    return query;
  }
}
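
/*
  Usage sketch (illustrative only, not compiled): a tokenizer plugin
  typically calls grn_tokenizer_query_open() from its init proc and keeps
  the result until its fin proc runs. `example_tokenizer', its fields and
  `example_init' are hypothetical names, not part of this library. For
  simplicity the sketch scans the raw query via query->ptr; real
  tokenizers usually scan the normalized string obtained with
  grn_string_get_normalized().

    typedef struct {
      grn_tokenizer_query *query;
      grn_tokenizer_token token;
      const char *next;
      const char *end;
    } example_tokenizer;

    static grn_obj *
    example_init(grn_ctx *ctx, int nargs, grn_obj **args,
                 grn_user_data *user_data)
    {
      grn_tokenizer_query *query;
      example_tokenizer *tokenizer;

      query = grn_tokenizer_query_open(ctx, nargs, args, 0);
      if (!query) {
        return NULL;
      }
      tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(example_tokenizer));
      if (!tokenizer) {
        grn_tokenizer_query_close(ctx, query);
        return NULL;
      }
      tokenizer->query = query;
      tokenizer->next = query->ptr;
      tokenizer->end = query->ptr + query->length;
      grn_tokenizer_token_init(ctx, &(tokenizer->token));
      user_data->ptr = tokenizer;
      return NULL;
    }
 */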

grn_tokenizer_query *
grn_tokenizer_query_create(grn_ctx *ctx, int num_args, grn_obj **args)
{
  return grn_tokenizer_query_open(ctx, num_args, args, 0);
}

void
grn_tokenizer_query_close(grn_ctx *ctx, grn_tokenizer_query *query)
{
  if (query != NULL) {
    if (query->normalized_query != NULL) {
      grn_obj_unlink(ctx, query->normalized_query);
    }
    if (query->query_buf != NULL) {
      GRN_PLUGIN_FREE(ctx, query->query_buf);
    }
    GRN_PLUGIN_FREE(ctx, query);
  }
}

void
grn_tokenizer_query_destroy(grn_ctx *ctx, grn_tokenizer_query *query)
{
  grn_tokenizer_query_close(ctx, query);
}
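
/*
  Usage sketch (illustrative only, not compiled): the fin proc that matches
  the init sketch above releases the query and the token buffers.
  `example_tokenizer' and `example_fin' are hypothetical names.

    static grn_obj *
    example_fin(grn_ctx *ctx, int nargs, grn_obj **args,
                grn_user_data *user_data)
    {
      example_tokenizer *tokenizer = user_data->ptr;
      if (!tokenizer) {
        return NULL;
      }
      grn_tokenizer_token_fin(ctx, &(tokenizer->token));
      grn_tokenizer_query_close(ctx, tokenizer->query);
      GRN_PLUGIN_FREE(ctx, tokenizer);
      return NULL;
    }
 */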

void
grn_tokenizer_token_init(grn_ctx *ctx, grn_tokenizer_token *token)
{
  GRN_TEXT_INIT(&token->str, GRN_OBJ_DO_SHALLOW_COPY);
  GRN_UINT32_INIT(&token->status, 0);
}

void
grn_tokenizer_token_fin(grn_ctx *ctx, grn_tokenizer_token *token)
{
  GRN_OBJ_FIN(ctx, &(token->str));
  GRN_OBJ_FIN(ctx, &(token->status));
}

void
grn_tokenizer_token_push(grn_ctx *ctx, grn_tokenizer_token *token,
                         const char *str_ptr, unsigned int str_length,
                         grn_token_status status)
{
  GRN_TEXT_SET_REF(&token->str, str_ptr, str_length);
  GRN_UINT32_SET(ctx, &token->status, status);
  grn_ctx_push(ctx, &token->str);
  grn_ctx_push(ctx, &token->status);
}
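
/*
  Usage sketch (illustrative only, not compiled): a tokenizer's next proc
  emits one token per call by pushing the token text and a status flag.
  `example_tokenizer' and `example_next' are the hypothetical names from
  the sketches above, and the byte-wise split on ' ' is only an example;
  a real tokenizer should walk characters with grn_plugin_charlen().

    static grn_obj *
    example_next(grn_ctx *ctx, int nargs, grn_obj **args,
                 grn_user_data *user_data)
    {
      example_tokenizer *tokenizer = user_data->ptr;
      const char *start = tokenizer->next;
      const char *current = start;
      grn_token_status status;

      while (current < tokenizer->end && *current != ' ') {
        current++;
      }
      if (current == tokenizer->end) {
        status = GRN_TOKENIZER_LAST;
        tokenizer->next = current;
      } else {
        status = GRN_TOKENIZER_CONTINUE;
        tokenizer->next = current + 1;
      }
      grn_tokenizer_token_push(ctx, &(tokenizer->token),
                               start, current - start, status);
      return NULL;
    }
 */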

const char *
grn_tokenizer_tokenized_delimiter_next(grn_ctx *ctx,
                                       grn_tokenizer_token *token,
                                       const char *str_ptr,
                                       unsigned int str_length,
                                       grn_encoding encoding)
{
  size_t char_length = 0;
  const char *start = str_ptr;
  const char *current;
  const char *end = str_ptr + str_length;
  const char *next_start = NULL;
  unsigned int token_length;
  grn_token_status status;

  for (current = start; current < end; current += char_length) {
    char_length = grn_charlen_(ctx, current, end, encoding);
    if (char_length == 0) {
      break;
    }
    if (grn_tokenizer_is_tokenized_delimiter(ctx, current, char_length,
                                             encoding)) {
      next_start = str_ptr + (current - start + char_length);
      break;
    }
  }

  token_length = current - start;
  if (current == end) {
    status = GRN_TOKENIZER_LAST;
  } else {
    status = GRN_TOKENIZER_CONTINUE;
  }
  grn_tokenizer_token_push(ctx, token, start, token_length, status);

  return next_start;
}
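
/*
  Usage sketch (illustrative only, not compiled): when the query was
  already tokenized by the caller (query->have_tokenized_delimiter is
  GRN_TRUE), a next proc can delegate token extraction to
  grn_tokenizer_tokenized_delimiter_next(), which pushes one token per
  call and returns where the next call should resume, or NULL after the
  last token. `tokenizer' and its fields are the hypothetical struct from
  the sketches above.

    if (tokenizer->query->have_tokenized_delimiter) {
      tokenizer->next =
        grn_tokenizer_tokenized_delimiter_next(ctx,
                                               &(tokenizer->token),
                                               tokenizer->next,
                                               tokenizer->end - tokenizer->next,
                                               tokenizer->query->encoding);
      return NULL;
    }
 */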

grn_rc
grn_tokenizer_register(grn_ctx *ctx, const char *plugin_name_ptr,
                       unsigned int plugin_name_length,
                       grn_proc_func *init, grn_proc_func *next,
                       grn_proc_func *fin)
{
  grn_expr_var vars[] = {
    { NULL, 0 },
    { NULL, 0 },
    { NULL, 0 }
  };
  GRN_TEXT_INIT(&vars[0].value, 0);
  GRN_TEXT_INIT(&vars[1].value, 0);
  GRN_UINT32_INIT(&vars[2].value, 0);

  {
    /*
      grn_proc_create() registers the tokenizer proc in the database
      associated with `ctx'. The returned object must not be finalized here.
     */
    grn_obj * const obj = grn_proc_create(ctx, plugin_name_ptr,
                                          plugin_name_length,
                                          GRN_PROC_TOKENIZER,
                                          init, next, fin, 3, vars);
    if (obj == NULL) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, "grn_proc_create() failed");
      return ctx->rc;
    }
  }
  return GRN_SUCCESS;
}
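
/*
  Usage sketch (illustrative only, not compiled): a tokenizer plugin
  registers its three procs from its GRN_PLUGIN_REGISTER() entry point.
  "TokenExample" and the example_* procs are hypothetical names.

    grn_rc
    GRN_PLUGIN_REGISTER(grn_ctx *ctx)
    {
      return grn_tokenizer_register(ctx,
                                    "TokenExample",
                                    strlen("TokenExample"),
                                    example_init,
                                    example_next,
                                    example_fin);
    }
 */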

grn_obj *
grn_token_get_data(grn_ctx *ctx, grn_token *token)
{
  GRN_API_ENTER;
  if (!token) {
    ERR(GRN_INVALID_ARGUMENT, "token must not be NULL");
    GRN_API_RETURN(NULL);
  }
  GRN_API_RETURN(&(token->data));
}

grn_rc
grn_token_set_data(grn_ctx *ctx,
                   grn_token *token,
                   const char *str_ptr,
                   int str_length)
{
  GRN_API_ENTER;
  if (!token) {
    ERR(GRN_INVALID_ARGUMENT, "token must not be NULL");
    goto exit;
  }
  if (str_length == -1) {
    str_length = strlen(str_ptr);
  }
  GRN_TEXT_SET(ctx, &(token->data), str_ptr, str_length);
exit:
  GRN_API_RETURN(ctx->rc);
}

grn_token_status
grn_token_get_status(grn_ctx *ctx, grn_token *token)
{
  GRN_API_ENTER;
  if (!token) {
    ERR(GRN_INVALID_ARGUMENT, "token must not be NULL");
    GRN_API_RETURN(GRN_TOKEN_CONTINUE);
  }
  GRN_API_RETURN(token->status);
}

grn_rc
grn_token_set_status(grn_ctx *ctx,
                     grn_token *token,
                     grn_token_status status)
{
  GRN_API_ENTER;
  if (!token) {
    ERR(GRN_INVALID_ARGUMENT, "token must not be NULL");
    goto exit;
  }
  token->status = status;
exit:
  GRN_API_RETURN(ctx->rc);
}
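
/*
  Usage sketch (illustrative only, not compiled): grn_token_get_data(),
  grn_token_get_status() and grn_token_set_status() operate on tokens
  handed out by Groonga itself, for example inside a token filter
  callback. The callback signature below is assumed to follow the
  grn_token_filter_filter_func convention from groonga/token_filter.h,
  and GRN_TOKEN_SKIP is assumed to be available; check those headers for
  the authoritative definitions. `example_filter' is a hypothetical name.

    static void
    example_filter(grn_ctx *ctx,
                   grn_token *current_token,
                   grn_token *next_token,
                   void *user_data)
    {
      grn_obj *data = grn_token_get_data(ctx, current_token);
      if (GRN_TEXT_LEN(data) < 2) {
        grn_token_status status = grn_token_get_status(ctx, current_token);
        grn_token_set_status(ctx, next_token, status | GRN_TOKEN_SKIP);
      }
    }
 */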