1/* -*- c-basic-offset: 2 -*- */
2/*
3 Copyright(C) 2009-2012 Brazil
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License version 2.1 as published by the Free Software Foundation.
8
9 This library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
13
14 You should have received a copy of the GNU Lesser General Public
15 License along with this library; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17*/
18
19#include "grn.h"
20#include <string.h>
21#include "grn_string.h"
22#include "grn_normalizer.h"
23#include "grn_str.h"
24#include "grn_util.h"
25
26#include <groonga/tokenizer.h>
27
28static grn_string *
29grn_fake_string_open(grn_ctx *ctx, grn_string *string)
30{
31 /* TODO: support GRN_STRING_REMOVE_BLANK flag and ctypes */
32 grn_string *nstr = string;
33 const char *str;
34 unsigned int str_len;
35
36 str = nstr->original;
37 str_len = nstr->original_length_in_bytes;
38
39 if (!(nstr->normalized = GRN_MALLOC(str_len + 1))) {
40 ERR(GRN_NO_MEMORY_AVAILABLE,
41 "[strinig][fake] failed to allocate normalized text space");
42 grn_string_close(ctx, (grn_obj *)nstr);
43 return NULL;
44 }
45
46 if (nstr->flags & GRN_STRING_REMOVE_TOKENIZED_DELIMITER &&
47 ctx->encoding == GRN_ENC_UTF8) {
48 int char_length;
49 const char *source_current = str;
50 const char *source_end = str + str_len;
51 char *destination = nstr->normalized;
52 unsigned int destination_length = 0;
53 while ((char_length = grn_charlen(ctx, source_current, source_end)) > 0) {
54 if (!grn_tokenizer_is_tokenized_delimiter(ctx,
55 source_current, char_length,
56 ctx->encoding)) {
57 grn_memcpy(destination, source_current, char_length);
58 destination += char_length;
59 destination_length += char_length;
60 }
61 source_current += char_length;
62 }
63 nstr->normalized[destination_length] = '\0';
64 nstr->normalized_length_in_bytes = destination_length;
65 } else {
66 grn_memcpy(nstr->normalized, str, str_len);
67 nstr->normalized[str_len] = '\0';
68 nstr->normalized_length_in_bytes = str_len;
69 }
70
71 if (nstr->flags & GRN_STRING_WITH_CHECKS) {
72 int16_t f = 0;
73 unsigned char c;
74 size_t i;
75 if (!(nstr->checks = (int16_t *) GRN_MALLOC(sizeof(int16_t) * str_len))) {
76 grn_string_close(ctx, (grn_obj *)nstr);
77 ERR(GRN_NO_MEMORY_AVAILABLE,
78 "[strinig][fake] failed to allocate checks space");
79 return NULL;
80 }
81 switch (nstr->encoding) {
82 case GRN_ENC_EUC_JP:
83 for (i = 0; i < str_len; i++) {
84 if (!f) {
85 c = (unsigned char) str[i];
86 f = ((c >= 0xa1U && c <= 0xfeU) || c == 0x8eU ? 2 : (c == 0x8fU ? 3 : 1)
87 );
88 nstr->checks[i] = f;
89 } else {
90 nstr->checks[i] = 0;
91 }
92 f--;
93 }
94 break;
95 case GRN_ENC_SJIS:
96 for (i = 0; i < str_len; i++) {
97 if (!f) {
98 c = (unsigned char) str[i];
99 f = (c >= 0x81U && ((c <= 0x9fU) || (c >= 0xe0U && c <= 0xfcU)) ? 2 : 1);
100 nstr->checks[i] = f;
101 } else {
102 nstr->checks[i] = 0;
103 }
104 f--;
105 }
106 break;
107 case GRN_ENC_UTF8:
108 for (i = 0; i < str_len; i++) {
109 if (!f) {
110 c = (unsigned char) str[i];
111 f = (c & 0x80U ? (c & 0x20U ? (c & 0x10U ? 4 : 3)
112 : 2)
113 : 1);
114 nstr->checks[i] = f;
115 } else {
116 nstr->checks[i] = 0;
117 }
118 f--;
119 }
120 break;
121 default:
122 for (i = 0; i < str_len; i++) {
123 nstr->checks[i] = 1;
124 }
125 break;
126 }
127 }
128 return nstr;
129}
130
131grn_obj *
132grn_string_open_(grn_ctx *ctx, const char *str, unsigned int str_len,
133 grn_obj *normalizer, int flags, grn_encoding encoding)
134{
135 grn_string *string;
136 grn_obj *obj;
137 grn_bool is_normalizer_auto;
138
139 if (!str || !str_len) {
140 return NULL;
141 }
142
143 is_normalizer_auto = (normalizer == GRN_NORMALIZER_AUTO);
144 if (is_normalizer_auto) {
145 normalizer = grn_ctx_get(ctx, GRN_NORMALIZER_AUTO_NAME, -1);
146 if (!normalizer) {
147 ERR(GRN_INVALID_ARGUMENT,
148 "[string][open] NormalizerAuto normalizer isn't available");
149 return NULL;
150 }
151 }
152
153 string = GRN_MALLOCN(grn_string, 1);
154 if (!string) {
155 if (is_normalizer_auto) {
156 grn_obj_unlink(ctx, normalizer);
157 }
158 GRN_LOG(ctx, GRN_LOG_ALERT,
159 "[string][open] failed to allocate memory");
160 return NULL;
161 }
162
163 obj = (grn_obj *)string;
164 GRN_OBJ_INIT(obj, GRN_STRING, GRN_OBJ_ALLOCATED, GRN_ID_NIL);
165 string->original = str;
166 string->original_length_in_bytes = str_len;
167 string->normalized = NULL;
168 string->normalized_length_in_bytes = 0;
169 string->n_characters = 0;
170 string->checks = NULL;
171 string->ctypes = NULL;
172 string->encoding = encoding;
173 string->flags = flags;
174
175 if (!normalizer) {
176 return (grn_obj *)grn_fake_string_open(ctx, string);
177 }
178
179 grn_normalizer_normalize(ctx, normalizer, (grn_obj *)string);
180 if (ctx->rc) {
181 grn_obj_close(ctx, obj);
182 obj = NULL;
183 }
184
185 if (is_normalizer_auto) {
186 grn_obj_unlink(ctx, normalizer);
187 }
188
189 return obj;
190}
191
192grn_obj *
193grn_string_open(grn_ctx *ctx, const char *str, unsigned int str_len,
194 grn_obj *normalizer, int flags)
195{
196 return grn_string_open_(ctx, str, str_len, normalizer, flags, ctx->encoding);
197}
198
199grn_rc
200grn_string_get_original(grn_ctx *ctx, grn_obj *string,
201 const char **original,
202 unsigned int *length_in_bytes)
203{
204 grn_rc rc;
205 grn_string *string_ = (grn_string *)string;
206 GRN_API_ENTER;
207 if (string_) {
208 if (original) { *original = string_->original; }
209 if (length_in_bytes) {
210 *length_in_bytes = string_->original_length_in_bytes;
211 }
212 rc = GRN_SUCCESS;
213 } else {
214 rc = GRN_INVALID_ARGUMENT;
215 }
216 GRN_API_RETURN(rc);
217}
218
219int
220grn_string_get_flags(grn_ctx *ctx, grn_obj *string)
221{
222 int flags = 0;
223 grn_string *string_ = (grn_string *)string;
224 GRN_API_ENTER;
225 if (string_) {
226 flags = string_->flags;
227 }
228 GRN_API_RETURN(flags);
229}
230
231grn_rc
232grn_string_get_normalized(grn_ctx *ctx, grn_obj *string,
233 const char **normalized,
234 unsigned int *length_in_bytes,
235 unsigned int *n_characters)
236{
237 grn_rc rc;
238 grn_string *string_ = (grn_string *)string;
239 GRN_API_ENTER;
240 if (string_) {
241 if (normalized) { *normalized = string_->normalized; }
242 if (length_in_bytes) {
243 *length_in_bytes = string_->normalized_length_in_bytes;
244 }
245 if (n_characters) { *n_characters = string_->n_characters; }
246 rc = GRN_SUCCESS;
247 } else {
248 if (normalized) { *normalized = NULL; }
249 if (length_in_bytes) { *length_in_bytes = 0; }
250 if (n_characters) { *n_characters = 0; }
251 rc = GRN_INVALID_ARGUMENT;
252 }
253 GRN_API_RETURN(rc);
254}
255
256grn_rc
257grn_string_set_normalized(grn_ctx *ctx, grn_obj *string,
258 char *normalized, unsigned int length_in_bytes,
259 unsigned int n_characters)
260{
261 grn_rc rc;
262 grn_string *string_ = (grn_string *)string;
263 GRN_API_ENTER;
264 if (string_) {
265 if (string_->normalized) { GRN_FREE(string_->normalized); }
266 string_->normalized = normalized;
267 string_->normalized_length_in_bytes = length_in_bytes;
268 string_->n_characters = n_characters;
269 rc = GRN_SUCCESS;
270 } else {
271 rc = GRN_INVALID_ARGUMENT;
272 }
273 GRN_API_RETURN(rc);
274}
275
276const short *
277grn_string_get_checks(grn_ctx *ctx, grn_obj *string)
278{
279 int16_t *checks = NULL;
280 grn_string *string_ = (grn_string *)string;
281 GRN_API_ENTER;
282 if (string_) {
283 checks = string_->checks;
284 } else {
285 checks = NULL;
286 }
287 GRN_API_RETURN(checks);
288}
289
290grn_rc
291grn_string_set_checks(grn_ctx *ctx, grn_obj *string, short *checks)
292{
293 grn_rc rc;
294 grn_string *string_ = (grn_string *)string;
295 GRN_API_ENTER;
296 if (string_) {
297 if (string_->checks) { GRN_FREE(string_->checks); }
298 string_->checks = checks;
299 rc = GRN_SUCCESS;
300 } else {
301 rc = GRN_INVALID_ARGUMENT;
302 }
303 GRN_API_RETURN(rc);
304}
305
306const unsigned char *
307grn_string_get_types(grn_ctx *ctx, grn_obj *string)
308{
309 unsigned char *types = NULL;
310 grn_string *string_ = (grn_string *)string;
311 GRN_API_ENTER;
312 if (string_) {
313 types = string_->ctypes;
314 } else {
315 types = NULL;
316 }
317 GRN_API_RETURN(types);
318}
319
320grn_rc
321grn_string_set_types(grn_ctx *ctx, grn_obj *string, unsigned char *types)
322{
323 grn_rc rc;
324 grn_string *string_ = (grn_string *)string;
325 GRN_API_ENTER;
326 if (string_) {
327 if (string_->ctypes) { GRN_FREE(string_->ctypes); }
328 string_->ctypes = types;
329 rc = GRN_SUCCESS;
330 } else {
331 rc = GRN_INVALID_ARGUMENT;
332 }
333 GRN_API_RETURN(rc);
334}
335
336grn_encoding
337grn_string_get_encoding(grn_ctx *ctx, grn_obj *string)
338{
339 grn_encoding encoding = GRN_ENC_NONE;
340 grn_string *string_ = (grn_string *)string;
341 GRN_API_ENTER;
342 if (string_) {
343 encoding = string_->encoding;
344 }
345 GRN_API_RETURN(encoding);
346}
347
348grn_rc
349grn_string_inspect(grn_ctx *ctx, grn_obj *buffer, grn_obj *string)
350{
351 grn_string *string_ = (grn_string *)string;
352
353 GRN_TEXT_PUTS(ctx, buffer, "#<string:");
354
355 GRN_TEXT_PUTS(ctx, buffer, " original:<");
356 GRN_TEXT_PUT(ctx, buffer,
357 string_->original,
358 string_->original_length_in_bytes);
359 GRN_TEXT_PUTS(ctx, buffer, ">");
360 GRN_TEXT_PUTS(ctx, buffer, "(");
361 grn_text_itoa(ctx, buffer, string_->original_length_in_bytes);
362 GRN_TEXT_PUTS(ctx, buffer, ")");
363
364 GRN_TEXT_PUTS(ctx, buffer, " normalized:<");
365 GRN_TEXT_PUT(ctx, buffer,
366 string_->normalized,
367 string_->normalized_length_in_bytes);
368 GRN_TEXT_PUTS(ctx, buffer, ">");
369 GRN_TEXT_PUTS(ctx, buffer, "(");
370 grn_text_itoa(ctx, buffer, string_->normalized_length_in_bytes);
371 GRN_TEXT_PUTS(ctx, buffer, ")");
372
373 GRN_TEXT_PUTS(ctx, buffer, " n_characters:");
374 grn_text_itoa(ctx, buffer, string_->n_characters);
375
376 GRN_TEXT_PUTS(ctx, buffer, " encoding:");
377 grn_inspect_encoding(ctx, buffer, string_->encoding);
378
379 GRN_TEXT_PUTS(ctx, buffer, " flags:");
380 if (string_->flags & GRN_STRING_REMOVE_BLANK) {
381 GRN_TEXT_PUTS(ctx, buffer, "REMOVE_BLANK|");
382 }
383 if (string_->flags & GRN_STRING_WITH_TYPES) {
384 GRN_TEXT_PUTS(ctx, buffer, "WITH_TYPES|");
385 }
386 if (string_->flags & GRN_STRING_WITH_CHECKS) {
387 GRN_TEXT_PUTS(ctx, buffer, "WITH_CHECKS|");
388 }
389 if (string_->flags & GRN_STRING_REMOVE_TOKENIZED_DELIMITER) {
390 GRN_TEXT_PUTS(ctx, buffer, "REMOVE_TOKENIZED_DELIMITER|");
391 }
392 if (GRN_TEXT_VALUE(buffer)[GRN_TEXT_LEN(buffer) - 1] == '|') {
393 grn_bulk_truncate(ctx, buffer, GRN_TEXT_LEN(buffer) - 1);
394 }
395
396 GRN_TEXT_PUTS(ctx, buffer, ">");
397
398 return GRN_SUCCESS;
399}
400
401grn_rc
402grn_string_close(grn_ctx *ctx, grn_obj *string)
403{
404 grn_rc rc;
405 grn_string *string_ = (grn_string *)string;
406 if (string_) {
407 if (string_->normalized) { GRN_FREE(string_->normalized); }
408 if (string_->ctypes) { GRN_FREE(string_->ctypes); }
409 if (string_->checks) { GRN_FREE(string_->checks); }
410 GRN_FREE(string);
411 rc = GRN_SUCCESS;
412 } else {
413 rc = GRN_INVALID_ARGUMENT;
414 }
415 return rc;
416}
417