1 | /* -*- c-basic-offset: 2 -*- */ |
2 | /* |
3 | Copyright(C) 2009-2012 Brazil |
4 | |
5 | This library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License version 2.1 as published by the Free Software Foundation. |
8 | |
9 | This library is distributed in the hope that it will be useful, |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | Lesser General Public License for more details. |
13 | |
14 | You should have received a copy of the GNU Lesser General Public |
15 | License along with this library; if not, write to the Free Software |
16 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
17 | */ |
18 | |
19 | #include "grn.h" |
20 | #include <string.h> |
21 | #include "grn_string.h" |
22 | #include "grn_normalizer.h" |
23 | #include "grn_str.h" |
24 | #include "grn_util.h" |
25 | |
26 | #include <groonga/tokenizer.h> |
27 | |
28 | static grn_string * |
29 | grn_fake_string_open(grn_ctx *ctx, grn_string *string) |
30 | { |
31 | /* TODO: support GRN_STRING_REMOVE_BLANK flag and ctypes */ |
32 | grn_string *nstr = string; |
33 | const char *str; |
34 | unsigned int str_len; |
35 | |
36 | str = nstr->original; |
37 | str_len = nstr->original_length_in_bytes; |
38 | |
39 | if (!(nstr->normalized = GRN_MALLOC(str_len + 1))) { |
40 | ERR(GRN_NO_MEMORY_AVAILABLE, |
41 | "[strinig][fake] failed to allocate normalized text space" ); |
42 | grn_string_close(ctx, (grn_obj *)nstr); |
43 | return NULL; |
44 | } |
45 | |
46 | if (nstr->flags & GRN_STRING_REMOVE_TOKENIZED_DELIMITER && |
47 | ctx->encoding == GRN_ENC_UTF8) { |
48 | int char_length; |
49 | const char *source_current = str; |
50 | const char *source_end = str + str_len; |
51 | char *destination = nstr->normalized; |
52 | unsigned int destination_length = 0; |
53 | while ((char_length = grn_charlen(ctx, source_current, source_end)) > 0) { |
54 | if (!grn_tokenizer_is_tokenized_delimiter(ctx, |
55 | source_current, char_length, |
56 | ctx->encoding)) { |
57 | grn_memcpy(destination, source_current, char_length); |
58 | destination += char_length; |
59 | destination_length += char_length; |
60 | } |
61 | source_current += char_length; |
62 | } |
63 | nstr->normalized[destination_length] = '\0'; |
64 | nstr->normalized_length_in_bytes = destination_length; |
65 | } else { |
66 | grn_memcpy(nstr->normalized, str, str_len); |
67 | nstr->normalized[str_len] = '\0'; |
68 | nstr->normalized_length_in_bytes = str_len; |
69 | } |
70 | |
71 | if (nstr->flags & GRN_STRING_WITH_CHECKS) { |
72 | int16_t f = 0; |
73 | unsigned char c; |
74 | size_t i; |
75 | if (!(nstr->checks = (int16_t *) GRN_MALLOC(sizeof(int16_t) * str_len))) { |
76 | grn_string_close(ctx, (grn_obj *)nstr); |
77 | ERR(GRN_NO_MEMORY_AVAILABLE, |
78 | "[strinig][fake] failed to allocate checks space" ); |
79 | return NULL; |
80 | } |
81 | switch (nstr->encoding) { |
82 | case GRN_ENC_EUC_JP: |
83 | for (i = 0; i < str_len; i++) { |
84 | if (!f) { |
85 | c = (unsigned char) str[i]; |
86 | f = ((c >= 0xa1U && c <= 0xfeU) || c == 0x8eU ? 2 : (c == 0x8fU ? 3 : 1) |
87 | ); |
88 | nstr->checks[i] = f; |
89 | } else { |
90 | nstr->checks[i] = 0; |
91 | } |
92 | f--; |
93 | } |
94 | break; |
95 | case GRN_ENC_SJIS: |
96 | for (i = 0; i < str_len; i++) { |
97 | if (!f) { |
98 | c = (unsigned char) str[i]; |
99 | f = (c >= 0x81U && ((c <= 0x9fU) || (c >= 0xe0U && c <= 0xfcU)) ? 2 : 1); |
100 | nstr->checks[i] = f; |
101 | } else { |
102 | nstr->checks[i] = 0; |
103 | } |
104 | f--; |
105 | } |
106 | break; |
107 | case GRN_ENC_UTF8: |
108 | for (i = 0; i < str_len; i++) { |
109 | if (!f) { |
110 | c = (unsigned char) str[i]; |
111 | f = (c & 0x80U ? (c & 0x20U ? (c & 0x10U ? 4 : 3) |
112 | : 2) |
113 | : 1); |
114 | nstr->checks[i] = f; |
115 | } else { |
116 | nstr->checks[i] = 0; |
117 | } |
118 | f--; |
119 | } |
120 | break; |
121 | default: |
122 | for (i = 0; i < str_len; i++) { |
123 | nstr->checks[i] = 1; |
124 | } |
125 | break; |
126 | } |
127 | } |
128 | return nstr; |
129 | } |
130 | |
131 | grn_obj * |
132 | grn_string_open_(grn_ctx *ctx, const char *str, unsigned int str_len, |
133 | grn_obj *normalizer, int flags, grn_encoding encoding) |
134 | { |
135 | grn_string *string; |
136 | grn_obj *obj; |
137 | grn_bool is_normalizer_auto; |
138 | |
139 | if (!str || !str_len) { |
140 | return NULL; |
141 | } |
142 | |
143 | is_normalizer_auto = (normalizer == GRN_NORMALIZER_AUTO); |
144 | if (is_normalizer_auto) { |
145 | normalizer = grn_ctx_get(ctx, GRN_NORMALIZER_AUTO_NAME, -1); |
146 | if (!normalizer) { |
147 | ERR(GRN_INVALID_ARGUMENT, |
148 | "[string][open] NormalizerAuto normalizer isn't available" ); |
149 | return NULL; |
150 | } |
151 | } |
152 | |
153 | string = GRN_MALLOCN(grn_string, 1); |
154 | if (!string) { |
155 | if (is_normalizer_auto) { |
156 | grn_obj_unlink(ctx, normalizer); |
157 | } |
158 | GRN_LOG(ctx, GRN_LOG_ALERT, |
159 | "[string][open] failed to allocate memory" ); |
160 | return NULL; |
161 | } |
162 | |
163 | obj = (grn_obj *)string; |
164 | GRN_OBJ_INIT(obj, GRN_STRING, GRN_OBJ_ALLOCATED, GRN_ID_NIL); |
165 | string->original = str; |
166 | string->original_length_in_bytes = str_len; |
167 | string->normalized = NULL; |
168 | string->normalized_length_in_bytes = 0; |
169 | string->n_characters = 0; |
170 | string->checks = NULL; |
171 | string->ctypes = NULL; |
172 | string->encoding = encoding; |
173 | string->flags = flags; |
174 | |
175 | if (!normalizer) { |
176 | return (grn_obj *)grn_fake_string_open(ctx, string); |
177 | } |
178 | |
179 | grn_normalizer_normalize(ctx, normalizer, (grn_obj *)string); |
180 | if (ctx->rc) { |
181 | grn_obj_close(ctx, obj); |
182 | obj = NULL; |
183 | } |
184 | |
185 | if (is_normalizer_auto) { |
186 | grn_obj_unlink(ctx, normalizer); |
187 | } |
188 | |
189 | return obj; |
190 | } |
191 | |
192 | grn_obj * |
193 | grn_string_open(grn_ctx *ctx, const char *str, unsigned int str_len, |
194 | grn_obj *normalizer, int flags) |
195 | { |
196 | return grn_string_open_(ctx, str, str_len, normalizer, flags, ctx->encoding); |
197 | } |
198 | |
199 | grn_rc |
200 | grn_string_get_original(grn_ctx *ctx, grn_obj *string, |
201 | const char **original, |
202 | unsigned int *length_in_bytes) |
203 | { |
204 | grn_rc rc; |
205 | grn_string *string_ = (grn_string *)string; |
206 | GRN_API_ENTER; |
207 | if (string_) { |
208 | if (original) { *original = string_->original; } |
209 | if (length_in_bytes) { |
210 | *length_in_bytes = string_->original_length_in_bytes; |
211 | } |
212 | rc = GRN_SUCCESS; |
213 | } else { |
214 | rc = GRN_INVALID_ARGUMENT; |
215 | } |
216 | GRN_API_RETURN(rc); |
217 | } |
218 | |
219 | int |
220 | grn_string_get_flags(grn_ctx *ctx, grn_obj *string) |
221 | { |
222 | int flags = 0; |
223 | grn_string *string_ = (grn_string *)string; |
224 | GRN_API_ENTER; |
225 | if (string_) { |
226 | flags = string_->flags; |
227 | } |
228 | GRN_API_RETURN(flags); |
229 | } |
230 | |
231 | grn_rc |
232 | grn_string_get_normalized(grn_ctx *ctx, grn_obj *string, |
233 | const char **normalized, |
234 | unsigned int *length_in_bytes, |
235 | unsigned int *n_characters) |
236 | { |
237 | grn_rc rc; |
238 | grn_string *string_ = (grn_string *)string; |
239 | GRN_API_ENTER; |
240 | if (string_) { |
241 | if (normalized) { *normalized = string_->normalized; } |
242 | if (length_in_bytes) { |
243 | *length_in_bytes = string_->normalized_length_in_bytes; |
244 | } |
245 | if (n_characters) { *n_characters = string_->n_characters; } |
246 | rc = GRN_SUCCESS; |
247 | } else { |
248 | if (normalized) { *normalized = NULL; } |
249 | if (length_in_bytes) { *length_in_bytes = 0; } |
250 | if (n_characters) { *n_characters = 0; } |
251 | rc = GRN_INVALID_ARGUMENT; |
252 | } |
253 | GRN_API_RETURN(rc); |
254 | } |
255 | |
256 | grn_rc |
257 | grn_string_set_normalized(grn_ctx *ctx, grn_obj *string, |
258 | char *normalized, unsigned int length_in_bytes, |
259 | unsigned int n_characters) |
260 | { |
261 | grn_rc rc; |
262 | grn_string *string_ = (grn_string *)string; |
263 | GRN_API_ENTER; |
264 | if (string_) { |
265 | if (string_->normalized) { GRN_FREE(string_->normalized); } |
266 | string_->normalized = normalized; |
267 | string_->normalized_length_in_bytes = length_in_bytes; |
268 | string_->n_characters = n_characters; |
269 | rc = GRN_SUCCESS; |
270 | } else { |
271 | rc = GRN_INVALID_ARGUMENT; |
272 | } |
273 | GRN_API_RETURN(rc); |
274 | } |
275 | |
276 | const short * |
277 | grn_string_get_checks(grn_ctx *ctx, grn_obj *string) |
278 | { |
279 | int16_t *checks = NULL; |
280 | grn_string *string_ = (grn_string *)string; |
281 | GRN_API_ENTER; |
282 | if (string_) { |
283 | checks = string_->checks; |
284 | } else { |
285 | checks = NULL; |
286 | } |
287 | GRN_API_RETURN(checks); |
288 | } |
289 | |
290 | grn_rc |
291 | grn_string_set_checks(grn_ctx *ctx, grn_obj *string, short *checks) |
292 | { |
293 | grn_rc rc; |
294 | grn_string *string_ = (grn_string *)string; |
295 | GRN_API_ENTER; |
296 | if (string_) { |
297 | if (string_->checks) { GRN_FREE(string_->checks); } |
298 | string_->checks = checks; |
299 | rc = GRN_SUCCESS; |
300 | } else { |
301 | rc = GRN_INVALID_ARGUMENT; |
302 | } |
303 | GRN_API_RETURN(rc); |
304 | } |
305 | |
306 | const unsigned char * |
307 | grn_string_get_types(grn_ctx *ctx, grn_obj *string) |
308 | { |
309 | unsigned char *types = NULL; |
310 | grn_string *string_ = (grn_string *)string; |
311 | GRN_API_ENTER; |
312 | if (string_) { |
313 | types = string_->ctypes; |
314 | } else { |
315 | types = NULL; |
316 | } |
317 | GRN_API_RETURN(types); |
318 | } |
319 | |
320 | grn_rc |
321 | grn_string_set_types(grn_ctx *ctx, grn_obj *string, unsigned char *types) |
322 | { |
323 | grn_rc rc; |
324 | grn_string *string_ = (grn_string *)string; |
325 | GRN_API_ENTER; |
326 | if (string_) { |
327 | if (string_->ctypes) { GRN_FREE(string_->ctypes); } |
328 | string_->ctypes = types; |
329 | rc = GRN_SUCCESS; |
330 | } else { |
331 | rc = GRN_INVALID_ARGUMENT; |
332 | } |
333 | GRN_API_RETURN(rc); |
334 | } |
335 | |
336 | grn_encoding |
337 | grn_string_get_encoding(grn_ctx *ctx, grn_obj *string) |
338 | { |
339 | grn_encoding encoding = GRN_ENC_NONE; |
340 | grn_string *string_ = (grn_string *)string; |
341 | GRN_API_ENTER; |
342 | if (string_) { |
343 | encoding = string_->encoding; |
344 | } |
345 | GRN_API_RETURN(encoding); |
346 | } |
347 | |
348 | grn_rc |
349 | grn_string_inspect(grn_ctx *ctx, grn_obj *buffer, grn_obj *string) |
350 | { |
351 | grn_string *string_ = (grn_string *)string; |
352 | |
353 | GRN_TEXT_PUTS(ctx, buffer, "#<string:" ); |
354 | |
355 | GRN_TEXT_PUTS(ctx, buffer, " original:<" ); |
356 | GRN_TEXT_PUT(ctx, buffer, |
357 | string_->original, |
358 | string_->original_length_in_bytes); |
359 | GRN_TEXT_PUTS(ctx, buffer, ">" ); |
360 | GRN_TEXT_PUTS(ctx, buffer, "(" ); |
361 | grn_text_itoa(ctx, buffer, string_->original_length_in_bytes); |
362 | GRN_TEXT_PUTS(ctx, buffer, ")" ); |
363 | |
364 | GRN_TEXT_PUTS(ctx, buffer, " normalized:<" ); |
365 | GRN_TEXT_PUT(ctx, buffer, |
366 | string_->normalized, |
367 | string_->normalized_length_in_bytes); |
368 | GRN_TEXT_PUTS(ctx, buffer, ">" ); |
369 | GRN_TEXT_PUTS(ctx, buffer, "(" ); |
370 | grn_text_itoa(ctx, buffer, string_->normalized_length_in_bytes); |
371 | GRN_TEXT_PUTS(ctx, buffer, ")" ); |
372 | |
373 | GRN_TEXT_PUTS(ctx, buffer, " n_characters:" ); |
374 | grn_text_itoa(ctx, buffer, string_->n_characters); |
375 | |
376 | GRN_TEXT_PUTS(ctx, buffer, " encoding:" ); |
377 | grn_inspect_encoding(ctx, buffer, string_->encoding); |
378 | |
379 | GRN_TEXT_PUTS(ctx, buffer, " flags:" ); |
380 | if (string_->flags & GRN_STRING_REMOVE_BLANK) { |
381 | GRN_TEXT_PUTS(ctx, buffer, "REMOVE_BLANK|" ); |
382 | } |
383 | if (string_->flags & GRN_STRING_WITH_TYPES) { |
384 | GRN_TEXT_PUTS(ctx, buffer, "WITH_TYPES|" ); |
385 | } |
386 | if (string_->flags & GRN_STRING_WITH_CHECKS) { |
387 | GRN_TEXT_PUTS(ctx, buffer, "WITH_CHECKS|" ); |
388 | } |
389 | if (string_->flags & GRN_STRING_REMOVE_TOKENIZED_DELIMITER) { |
390 | GRN_TEXT_PUTS(ctx, buffer, "REMOVE_TOKENIZED_DELIMITER|" ); |
391 | } |
392 | if (GRN_TEXT_VALUE(buffer)[GRN_TEXT_LEN(buffer) - 1] == '|') { |
393 | grn_bulk_truncate(ctx, buffer, GRN_TEXT_LEN(buffer) - 1); |
394 | } |
395 | |
396 | GRN_TEXT_PUTS(ctx, buffer, ">" ); |
397 | |
398 | return GRN_SUCCESS; |
399 | } |
400 | |
401 | grn_rc |
402 | grn_string_close(grn_ctx *ctx, grn_obj *string) |
403 | { |
404 | grn_rc rc; |
405 | grn_string *string_ = (grn_string *)string; |
406 | if (string_) { |
407 | if (string_->normalized) { GRN_FREE(string_->normalized); } |
408 | if (string_->ctypes) { GRN_FREE(string_->ctypes); } |
409 | if (string_->checks) { GRN_FREE(string_->checks); } |
410 | GRN_FREE(string); |
411 | rc = GRN_SUCCESS; |
412 | } else { |
413 | rc = GRN_INVALID_ARGUMENT; |
414 | } |
415 | return rc; |
416 | } |
417 | |