1 | /* -*- c-basic-offset: 2 -*- */ |
2 | /* |
3 | Copyright(C) 2012-2014 Brazil |
4 | |
5 | This library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License version 2.1 as published by the Free Software Foundation. |
8 | |
9 | This library is distributed in the hope that it will be useful, |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | Lesser General Public License for more details. |
13 | |
14 | You should have received a copy of the GNU Lesser General Public |
15 | License along with this library; if not, write to the Free Software |
16 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
17 | */ |
18 | #include "grn.h" |
19 | #include <groonga/tokenizer.h> |
20 | |
21 | #include <string.h> |
22 | |
23 | #include "grn_ctx.h" |
24 | #include "grn_db.h" |
25 | #include "grn_str.h" |
26 | #include "grn_string.h" |
27 | #include "grn_token_cursor.h" |
28 | |
29 | /* |
30 | Just for backward compatibility. See grn_plugin_charlen() instead. |
31 | */ |
32 | int |
33 | grn_tokenizer_charlen(grn_ctx *ctx, const char *str_ptr, |
34 | unsigned int str_length, grn_encoding encoding) |
35 | { |
36 | return grn_plugin_charlen(ctx, str_ptr, str_length, encoding); |
37 | } |
38 | |
39 | /* |
40 | Just for backward compatibility. See grn_plugin_isspace() instead. |
41 | */ |
42 | int |
43 | grn_tokenizer_isspace(grn_ctx *ctx, const char *str_ptr, |
44 | unsigned int str_length, grn_encoding encoding) |
45 | { |
46 | return grn_plugin_isspace(ctx, str_ptr, str_length, encoding); |
47 | } |
48 | |
49 | grn_bool |
50 | grn_tokenizer_is_tokenized_delimiter(grn_ctx *ctx, |
51 | const char *str_ptr, |
52 | unsigned int str_length, |
53 | grn_encoding encoding) |
54 | { |
55 | if (encoding != GRN_ENC_UTF8) { |
56 | return GRN_FALSE; |
57 | } |
58 | |
59 | if (str_length != GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8_LEN) { |
60 | return GRN_FALSE; |
61 | } |
62 | |
63 | return memcmp(str_ptr, |
64 | GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8, |
65 | GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8_LEN) == 0; |
66 | } |
67 | |
68 | grn_bool |
69 | grn_tokenizer_have_tokenized_delimiter(grn_ctx *ctx, |
70 | const char *str_ptr, |
71 | unsigned int str_length, |
72 | grn_encoding encoding) |
73 | { |
74 | int char_length; |
75 | const char *current = str_ptr; |
76 | const char *end = str_ptr + str_length; |
77 | |
78 | if (encoding != GRN_ENC_UTF8) { |
79 | return GRN_FALSE; |
80 | } |
81 | |
82 | if (str_length == 0) { |
83 | return GRN_FALSE; |
84 | } |
85 | |
86 | while ((char_length = grn_charlen_(ctx, current, end, encoding)) > 0) { |
87 | if (grn_tokenizer_is_tokenized_delimiter(ctx, |
88 | current, char_length, |
89 | encoding)) { |
90 | return GRN_TRUE; |
91 | } |
92 | current += char_length; |
93 | } |
94 | return GRN_FALSE; |
95 | } |
96 | |
/*
  Builds a grn_tokenizer_query from the arguments the tokenizer framework
  pushed onto the ctx stack (tokenize mode, query string, flags) plus the
  lexicon table in args[0]. The result holds both a NUL-terminated copy of
  the raw query bytes and a normalized grn_string built with the table's
  encoding and normalizer. Returns NULL (with an error reported on ctx)
  on missing arguments or allocation/normalization failure. The caller
  owns the result and must release it with grn_tokenizer_query_close().
*/
grn_tokenizer_query *
grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
                         unsigned int normalize_flags)
{
  /* The framework pushed these in the opposite order, so pop flags first.
     Any of them may be NULL when the caller pushed fewer values. */
  grn_obj *flags = grn_ctx_pop(ctx);
  grn_obj *query_str = grn_ctx_pop(ctx);
  grn_obj *tokenize_mode = grn_ctx_pop(ctx);

  if (query_str == NULL) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument" );
    return NULL;
  }

  /* args[0] must be the lexicon table; it supplies encoding/normalizer. */
  if ((num_args < 1) || (args == NULL) || (args[0] == NULL)) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer" );
    return NULL;
  }

  {
    grn_tokenizer_query * const query =
      GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_query));
    if (query == NULL) {
      return NULL;
    }
    /* Pre-null the owned pointers so a partially built query is safe to
       free on the error paths below. */
    query->normalized_query = NULL;
    query->query_buf = NULL;
    if (flags) {
      query->flags = GRN_UINT32_VALUE(flags);
    } else {
      query->flags = 0;
    }
    if (tokenize_mode) {
      query->tokenize_mode = GRN_UINT32_VALUE(tokenize_mode);
    } else {
      /* Default to ADD mode when the caller did not push a mode. */
      query->tokenize_mode = GRN_TOKENIZE_ADD;
    }
    /* token_mode mirrors tokenize_mode; presumably kept as a legacy
       field name for older tokenizers — TODO confirm against header. */
    query->token_mode = query->tokenize_mode;

    {
      grn_obj * const table = args[0];
      grn_table_flags table_flags;
      grn_encoding table_encoding;
      unsigned int query_length = GRN_TEXT_LEN(query_str);
      /* +1 for the NUL terminator appended below. */
      char *query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, query_length + 1);
      grn_obj *normalizer = NULL;

      if (query_buf == NULL) {
        GRN_PLUGIN_FREE(ctx, query);
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer] failed to duplicate query" );
        return NULL;
      }
      grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL,
                         &normalizer, NULL);
      {
        grn_obj *normalized_query;
        /* Legacy KEY_NORMALIZE tables have no normalizer object; map the
           flag to the automatic normalizer. */
        if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
          normalizer = GRN_NORMALIZER_AUTO;
        }
        normalized_query = grn_string_open_(ctx,
                                            GRN_TEXT_VALUE(query_str),
                                            GRN_TEXT_LEN(query_str),
                                            normalizer,
                                            normalize_flags,
                                            table_encoding);
        if (!normalized_query) {
          /* Roll back both allocations made so far. */
          GRN_PLUGIN_FREE(ctx, query_buf);
          GRN_PLUGIN_FREE(ctx, query);
          GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                           "[tokenizer] failed to open normalized string" );
          return NULL;
        }
        query->normalized_query = normalized_query;
        /* Keep a NUL-terminated copy of the raw (un-normalized) query. */
        grn_memcpy(query_buf, GRN_TEXT_VALUE(query_str), query_length);
        query_buf[query_length] = '\0';
        query->query_buf = query_buf;
        query->ptr = query_buf;
        query->length = query_length;
      }
      query->encoding = table_encoding;

      if (query->flags & GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER) {
        /* Detect the delimiter in the normalized text so tokenizers can
           switch to pre-tokenized handling. */
        const char *normalized_string;
        unsigned int normalized_string_length;

        grn_string_get_normalized(ctx,
                                  query->normalized_query,
                                  &normalized_string,
                                  &normalized_string_length,
                                  NULL);
        query->have_tokenized_delimiter =
          grn_tokenizer_have_tokenized_delimiter(ctx,
                                                 normalized_string,
                                                 normalized_string_length,
                                                 query->encoding);
      } else {
        query->have_tokenized_delimiter = GRN_FALSE;
      }
    }
    return query;
  }
}
199 | |
200 | grn_tokenizer_query * |
201 | grn_tokenizer_query_create(grn_ctx *ctx, int num_args, grn_obj **args) |
202 | { |
203 | return grn_tokenizer_query_open(ctx, num_args, args, 0); |
204 | } |
205 | |
206 | void |
207 | grn_tokenizer_query_close(grn_ctx *ctx, grn_tokenizer_query *query) |
208 | { |
209 | if (query != NULL) { |
210 | if (query->normalized_query != NULL) { |
211 | grn_obj_unlink(ctx, query->normalized_query); |
212 | } |
213 | if (query->query_buf != NULL) { |
214 | GRN_PLUGIN_FREE(ctx, query->query_buf); |
215 | } |
216 | GRN_PLUGIN_FREE(ctx, query); |
217 | } |
218 | } |
219 | |
/*
  Deprecated alias for grn_tokenizer_query_close(); kept for backward
  compatibility.
*/
void
grn_tokenizer_query_destroy(grn_ctx *ctx, grn_tokenizer_query *query)
{
  grn_tokenizer_query_close(ctx, query);
}
225 | |
/*
  Initializes a token for use with grn_tokenizer_token_push(). The text
  buffer is set up for shallow references (it will only point at caller
  memory, not copy it) and the status is a uint32 value. Pair with
  grn_tokenizer_token_fin().
*/
void
grn_tokenizer_token_init(grn_ctx *ctx, grn_tokenizer_token *token)
{
  GRN_TEXT_INIT(&token->str, GRN_OBJ_DO_SHALLOW_COPY);
  GRN_UINT32_INIT(&token->status, 0);
}
232 | |
/*
  Finalizes both members initialized by grn_tokenizer_token_init().
*/
void
grn_tokenizer_token_fin(grn_ctx *ctx, grn_tokenizer_token *token)
{
  GRN_OBJ_FIN(ctx, &(token->str));
  GRN_OBJ_FIN(ctx, &(token->status));
}
239 | |
/*
  Publishes one token as the result of a tokenizer "next" callback by
  pushing its text and status onto the ctx stack, in that order. The
  text is stored as a reference (GRN_TEXT_SET_REF), so str_ptr must stay
  valid until the framework has consumed the token.
*/
void
grn_tokenizer_token_push(grn_ctx *ctx, grn_tokenizer_token *token,
                         const char *str_ptr, unsigned int str_length,
                         grn_token_status status)
{
  GRN_TEXT_SET_REF(&token->str, str_ptr, str_length);
  GRN_UINT32_SET(ctx, &token->status, status);
  grn_ctx_push(ctx, &token->str);
  grn_ctx_push(ctx, &token->status);
}
250 | |
/*
  Extracts the next token from a pre-tokenized (delimiter-separated)
  string: scans [str_ptr, str_ptr + str_length) for the first tokenized
  delimiter, pushes the text before it (which may be empty) via
  grn_tokenizer_token_push(), and returns a pointer just past the
  delimiter for the next call — or NULL when no delimiter was found.
  The token status is GRN_TOKENIZER_LAST when the whole input was
  consumed, GRN_TOKENIZER_CONTINUE otherwise.
*/
const char *
grn_tokenizer_tokenized_delimiter_next(grn_ctx *ctx,
                                       grn_tokenizer_token *token,
                                       const char *str_ptr,
                                       unsigned int str_length,
                                       grn_encoding encoding)
{
  /* NOTE(review): grn_charlen_() returns int; a negative return would
     wrap when stored in this size_t. Presumably it only returns >= 0
     here — TODO confirm. */
  size_t char_length = 0;
  const char *start = str_ptr;
  const char *current;
  const char *end = str_ptr + str_length;
  const char *next_start = NULL;
  unsigned int token_length;
  grn_token_status status;

  for (current = start; current < end; current += char_length) {
    char_length = grn_charlen_(ctx, current, end, encoding);
    if (char_length == 0) {
      /* Undecodable bytes: stop and emit what we have so far. */
      break;
    }
    if (grn_tokenizer_is_tokenized_delimiter(ctx, current, char_length,
                                             encoding)) {
      /* Resume after the delimiter on the next call. */
      next_start = str_ptr + (current - start + char_length);
      break;
    }
  }

  token_length = current - start;
  if (current == end) {
    status = GRN_TOKENIZER_LAST;
  } else {
    status = GRN_TOKENIZER_CONTINUE;
  }
  grn_tokenizer_token_push(ctx, token, start, token_length, status);

  return next_start;
}
288 | |
/*
  Registers a tokenizer proc in the database associated with `ctx',
  wiring up the init/next/fin callbacks. The three vars declared here
  (two text, one uint32) form the tokenizer's expression-variable slots;
  their order matches what grn_tokenizer_query_open() pops. Returns
  GRN_SUCCESS, or ctx->rc when grn_proc_create() fails.
*/
grn_rc
grn_tokenizer_register(grn_ctx *ctx, const char *plugin_name_ptr,
                       unsigned int plugin_name_length,
                       grn_proc_func *init, grn_proc_func *next,
                       grn_proc_func *fin)
{
  grn_expr_var vars[] = {
    { NULL, 0 },
    { NULL, 0 },
    { NULL, 0 }
  };
  GRN_TEXT_INIT(&vars[0].value, 0);
  GRN_TEXT_INIT(&vars[1].value, 0);
  GRN_UINT32_INIT(&vars[2].value, 0);

  {
    /*
      grn_proc_create() registers a plugin to the database which is associated
      with `ctx'. A returned object must not be finalized here.
    */
    grn_obj * const obj = grn_proc_create(ctx, plugin_name_ptr,
                                          plugin_name_length,
                                          GRN_PROC_TOKENIZER,
                                          init, next, fin, 3, vars);
    if (obj == NULL) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, "grn_proc_create() failed" );
      return ctx->rc;
    }
  }
  return GRN_SUCCESS;
}
320 | |
321 | grn_obj * |
322 | grn_token_get_data(grn_ctx *ctx, grn_token *token) |
323 | { |
324 | GRN_API_ENTER; |
325 | if (!token) { |
326 | ERR(GRN_INVALID_ARGUMENT, "token must not be NULL" ); |
327 | GRN_API_RETURN(NULL); |
328 | } |
329 | GRN_API_RETURN(&(token->data)); |
330 | } |
331 | |
332 | grn_rc |
333 | grn_token_set_data(grn_ctx *ctx, |
334 | grn_token *token, |
335 | const char *str_ptr, |
336 | int str_length) |
337 | { |
338 | GRN_API_ENTER; |
339 | if (!token) { |
340 | ERR(GRN_INVALID_ARGUMENT, "token must not be NULL" ); |
341 | goto exit; |
342 | } |
343 | if (str_length == -1) { |
344 | str_length = strlen(str_ptr); |
345 | } |
346 | GRN_TEXT_SET(ctx, &(token->data), str_ptr, str_length); |
347 | exit: |
348 | GRN_API_RETURN(ctx->rc); |
349 | } |
350 | |
351 | grn_token_status |
352 | grn_token_get_status(grn_ctx *ctx, grn_token *token) |
353 | { |
354 | GRN_API_ENTER; |
355 | if (!token) { |
356 | ERR(GRN_INVALID_ARGUMENT, "token must not be NULL" ); |
357 | GRN_API_RETURN(GRN_TOKEN_CONTINUE); |
358 | } |
359 | GRN_API_RETURN(token->status); |
360 | } |
361 | |
362 | grn_rc |
363 | grn_token_set_status(grn_ctx *ctx, |
364 | grn_token *token, |
365 | grn_token_status status) |
366 | { |
367 | GRN_API_ENTER; |
368 | if (!token) { |
369 | ERR(GRN_INVALID_ARGUMENT, "token must not be NULL" ); |
370 | goto exit; |
371 | } |
372 | token->status = status; |
373 | exit: |
374 | GRN_API_RETURN(ctx->rc); |
375 | } |
376 | |