| 1 | /* -*- c-basic-offset: 2 -*- */ |
| 2 | /* |
| 3 | Copyright(C) 2009-2012 Brazil |
| 4 | |
| 5 | This library is free software; you can redistribute it and/or |
| 6 | modify it under the terms of the GNU Lesser General Public |
| 7 | License version 2.1 as published by the Free Software Foundation. |
| 8 | |
| 9 | This library is distributed in the hope that it will be useful, |
| 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 12 | Lesser General Public License for more details. |
| 13 | |
| 14 | You should have received a copy of the GNU Lesser General Public |
| 15 | License along with this library; if not, write to the Free Software |
| 16 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 17 | */ |
| 18 | |
| 19 | #include "grn.h" |
| 20 | #include <string.h> |
| 21 | #include "grn_string.h" |
| 22 | #include "grn_normalizer.h" |
| 23 | #include "grn_str.h" |
| 24 | #include "grn_util.h" |
| 25 | |
| 26 | #include <groonga/tokenizer.h> |
| 27 | |
| 28 | static grn_string * |
| 29 | grn_fake_string_open(grn_ctx *ctx, grn_string *string) |
| 30 | { |
| 31 | /* TODO: support GRN_STRING_REMOVE_BLANK flag and ctypes */ |
| 32 | grn_string *nstr = string; |
| 33 | const char *str; |
| 34 | unsigned int str_len; |
| 35 | |
| 36 | str = nstr->original; |
| 37 | str_len = nstr->original_length_in_bytes; |
| 38 | |
| 39 | if (!(nstr->normalized = GRN_MALLOC(str_len + 1))) { |
| 40 | ERR(GRN_NO_MEMORY_AVAILABLE, |
| 41 | "[strinig][fake] failed to allocate normalized text space" ); |
| 42 | grn_string_close(ctx, (grn_obj *)nstr); |
| 43 | return NULL; |
| 44 | } |
| 45 | |
| 46 | if (nstr->flags & GRN_STRING_REMOVE_TOKENIZED_DELIMITER && |
| 47 | ctx->encoding == GRN_ENC_UTF8) { |
| 48 | int char_length; |
| 49 | const char *source_current = str; |
| 50 | const char *source_end = str + str_len; |
| 51 | char *destination = nstr->normalized; |
| 52 | unsigned int destination_length = 0; |
| 53 | while ((char_length = grn_charlen(ctx, source_current, source_end)) > 0) { |
| 54 | if (!grn_tokenizer_is_tokenized_delimiter(ctx, |
| 55 | source_current, char_length, |
| 56 | ctx->encoding)) { |
| 57 | grn_memcpy(destination, source_current, char_length); |
| 58 | destination += char_length; |
| 59 | destination_length += char_length; |
| 60 | } |
| 61 | source_current += char_length; |
| 62 | } |
| 63 | nstr->normalized[destination_length] = '\0'; |
| 64 | nstr->normalized_length_in_bytes = destination_length; |
| 65 | } else { |
| 66 | grn_memcpy(nstr->normalized, str, str_len); |
| 67 | nstr->normalized[str_len] = '\0'; |
| 68 | nstr->normalized_length_in_bytes = str_len; |
| 69 | } |
| 70 | |
| 71 | if (nstr->flags & GRN_STRING_WITH_CHECKS) { |
| 72 | int16_t f = 0; |
| 73 | unsigned char c; |
| 74 | size_t i; |
| 75 | if (!(nstr->checks = (int16_t *) GRN_MALLOC(sizeof(int16_t) * str_len))) { |
| 76 | grn_string_close(ctx, (grn_obj *)nstr); |
| 77 | ERR(GRN_NO_MEMORY_AVAILABLE, |
| 78 | "[strinig][fake] failed to allocate checks space" ); |
| 79 | return NULL; |
| 80 | } |
| 81 | switch (nstr->encoding) { |
| 82 | case GRN_ENC_EUC_JP: |
| 83 | for (i = 0; i < str_len; i++) { |
| 84 | if (!f) { |
| 85 | c = (unsigned char) str[i]; |
| 86 | f = ((c >= 0xa1U && c <= 0xfeU) || c == 0x8eU ? 2 : (c == 0x8fU ? 3 : 1) |
| 87 | ); |
| 88 | nstr->checks[i] = f; |
| 89 | } else { |
| 90 | nstr->checks[i] = 0; |
| 91 | } |
| 92 | f--; |
| 93 | } |
| 94 | break; |
| 95 | case GRN_ENC_SJIS: |
| 96 | for (i = 0; i < str_len; i++) { |
| 97 | if (!f) { |
| 98 | c = (unsigned char) str[i]; |
| 99 | f = (c >= 0x81U && ((c <= 0x9fU) || (c >= 0xe0U && c <= 0xfcU)) ? 2 : 1); |
| 100 | nstr->checks[i] = f; |
| 101 | } else { |
| 102 | nstr->checks[i] = 0; |
| 103 | } |
| 104 | f--; |
| 105 | } |
| 106 | break; |
| 107 | case GRN_ENC_UTF8: |
| 108 | for (i = 0; i < str_len; i++) { |
| 109 | if (!f) { |
| 110 | c = (unsigned char) str[i]; |
| 111 | f = (c & 0x80U ? (c & 0x20U ? (c & 0x10U ? 4 : 3) |
| 112 | : 2) |
| 113 | : 1); |
| 114 | nstr->checks[i] = f; |
| 115 | } else { |
| 116 | nstr->checks[i] = 0; |
| 117 | } |
| 118 | f--; |
| 119 | } |
| 120 | break; |
| 121 | default: |
| 122 | for (i = 0; i < str_len; i++) { |
| 123 | nstr->checks[i] = 1; |
| 124 | } |
| 125 | break; |
| 126 | } |
| 127 | } |
| 128 | return nstr; |
| 129 | } |
| 130 | |
| 131 | grn_obj * |
| 132 | grn_string_open_(grn_ctx *ctx, const char *str, unsigned int str_len, |
| 133 | grn_obj *normalizer, int flags, grn_encoding encoding) |
| 134 | { |
| 135 | grn_string *string; |
| 136 | grn_obj *obj; |
| 137 | grn_bool is_normalizer_auto; |
| 138 | |
| 139 | if (!str || !str_len) { |
| 140 | return NULL; |
| 141 | } |
| 142 | |
| 143 | is_normalizer_auto = (normalizer == GRN_NORMALIZER_AUTO); |
| 144 | if (is_normalizer_auto) { |
| 145 | normalizer = grn_ctx_get(ctx, GRN_NORMALIZER_AUTO_NAME, -1); |
| 146 | if (!normalizer) { |
| 147 | ERR(GRN_INVALID_ARGUMENT, |
| 148 | "[string][open] NormalizerAuto normalizer isn't available" ); |
| 149 | return NULL; |
| 150 | } |
| 151 | } |
| 152 | |
| 153 | string = GRN_MALLOCN(grn_string, 1); |
| 154 | if (!string) { |
| 155 | if (is_normalizer_auto) { |
| 156 | grn_obj_unlink(ctx, normalizer); |
| 157 | } |
| 158 | GRN_LOG(ctx, GRN_LOG_ALERT, |
| 159 | "[string][open] failed to allocate memory" ); |
| 160 | return NULL; |
| 161 | } |
| 162 | |
| 163 | obj = (grn_obj *)string; |
| 164 | GRN_OBJ_INIT(obj, GRN_STRING, GRN_OBJ_ALLOCATED, GRN_ID_NIL); |
| 165 | string->original = str; |
| 166 | string->original_length_in_bytes = str_len; |
| 167 | string->normalized = NULL; |
| 168 | string->normalized_length_in_bytes = 0; |
| 169 | string->n_characters = 0; |
| 170 | string->checks = NULL; |
| 171 | string->ctypes = NULL; |
| 172 | string->encoding = encoding; |
| 173 | string->flags = flags; |
| 174 | |
| 175 | if (!normalizer) { |
| 176 | return (grn_obj *)grn_fake_string_open(ctx, string); |
| 177 | } |
| 178 | |
| 179 | grn_normalizer_normalize(ctx, normalizer, (grn_obj *)string); |
| 180 | if (ctx->rc) { |
| 181 | grn_obj_close(ctx, obj); |
| 182 | obj = NULL; |
| 183 | } |
| 184 | |
| 185 | if (is_normalizer_auto) { |
| 186 | grn_obj_unlink(ctx, normalizer); |
| 187 | } |
| 188 | |
| 189 | return obj; |
| 190 | } |
| 191 | |
| 192 | grn_obj * |
| 193 | grn_string_open(grn_ctx *ctx, const char *str, unsigned int str_len, |
| 194 | grn_obj *normalizer, int flags) |
| 195 | { |
| 196 | return grn_string_open_(ctx, str, str_len, normalizer, flags, ctx->encoding); |
| 197 | } |
| 198 | |
| 199 | grn_rc |
| 200 | grn_string_get_original(grn_ctx *ctx, grn_obj *string, |
| 201 | const char **original, |
| 202 | unsigned int *length_in_bytes) |
| 203 | { |
| 204 | grn_rc rc; |
| 205 | grn_string *string_ = (grn_string *)string; |
| 206 | GRN_API_ENTER; |
| 207 | if (string_) { |
| 208 | if (original) { *original = string_->original; } |
| 209 | if (length_in_bytes) { |
| 210 | *length_in_bytes = string_->original_length_in_bytes; |
| 211 | } |
| 212 | rc = GRN_SUCCESS; |
| 213 | } else { |
| 214 | rc = GRN_INVALID_ARGUMENT; |
| 215 | } |
| 216 | GRN_API_RETURN(rc); |
| 217 | } |
| 218 | |
| 219 | int |
| 220 | grn_string_get_flags(grn_ctx *ctx, grn_obj *string) |
| 221 | { |
| 222 | int flags = 0; |
| 223 | grn_string *string_ = (grn_string *)string; |
| 224 | GRN_API_ENTER; |
| 225 | if (string_) { |
| 226 | flags = string_->flags; |
| 227 | } |
| 228 | GRN_API_RETURN(flags); |
| 229 | } |
| 230 | |
| 231 | grn_rc |
| 232 | grn_string_get_normalized(grn_ctx *ctx, grn_obj *string, |
| 233 | const char **normalized, |
| 234 | unsigned int *length_in_bytes, |
| 235 | unsigned int *n_characters) |
| 236 | { |
| 237 | grn_rc rc; |
| 238 | grn_string *string_ = (grn_string *)string; |
| 239 | GRN_API_ENTER; |
| 240 | if (string_) { |
| 241 | if (normalized) { *normalized = string_->normalized; } |
| 242 | if (length_in_bytes) { |
| 243 | *length_in_bytes = string_->normalized_length_in_bytes; |
| 244 | } |
| 245 | if (n_characters) { *n_characters = string_->n_characters; } |
| 246 | rc = GRN_SUCCESS; |
| 247 | } else { |
| 248 | if (normalized) { *normalized = NULL; } |
| 249 | if (length_in_bytes) { *length_in_bytes = 0; } |
| 250 | if (n_characters) { *n_characters = 0; } |
| 251 | rc = GRN_INVALID_ARGUMENT; |
| 252 | } |
| 253 | GRN_API_RETURN(rc); |
| 254 | } |
| 255 | |
| 256 | grn_rc |
| 257 | grn_string_set_normalized(grn_ctx *ctx, grn_obj *string, |
| 258 | char *normalized, unsigned int length_in_bytes, |
| 259 | unsigned int n_characters) |
| 260 | { |
| 261 | grn_rc rc; |
| 262 | grn_string *string_ = (grn_string *)string; |
| 263 | GRN_API_ENTER; |
| 264 | if (string_) { |
| 265 | if (string_->normalized) { GRN_FREE(string_->normalized); } |
| 266 | string_->normalized = normalized; |
| 267 | string_->normalized_length_in_bytes = length_in_bytes; |
| 268 | string_->n_characters = n_characters; |
| 269 | rc = GRN_SUCCESS; |
| 270 | } else { |
| 271 | rc = GRN_INVALID_ARGUMENT; |
| 272 | } |
| 273 | GRN_API_RETURN(rc); |
| 274 | } |
| 275 | |
| 276 | const short * |
| 277 | grn_string_get_checks(grn_ctx *ctx, grn_obj *string) |
| 278 | { |
| 279 | int16_t *checks = NULL; |
| 280 | grn_string *string_ = (grn_string *)string; |
| 281 | GRN_API_ENTER; |
| 282 | if (string_) { |
| 283 | checks = string_->checks; |
| 284 | } else { |
| 285 | checks = NULL; |
| 286 | } |
| 287 | GRN_API_RETURN(checks); |
| 288 | } |
| 289 | |
| 290 | grn_rc |
| 291 | grn_string_set_checks(grn_ctx *ctx, grn_obj *string, short *checks) |
| 292 | { |
| 293 | grn_rc rc; |
| 294 | grn_string *string_ = (grn_string *)string; |
| 295 | GRN_API_ENTER; |
| 296 | if (string_) { |
| 297 | if (string_->checks) { GRN_FREE(string_->checks); } |
| 298 | string_->checks = checks; |
| 299 | rc = GRN_SUCCESS; |
| 300 | } else { |
| 301 | rc = GRN_INVALID_ARGUMENT; |
| 302 | } |
| 303 | GRN_API_RETURN(rc); |
| 304 | } |
| 305 | |
| 306 | const unsigned char * |
| 307 | grn_string_get_types(grn_ctx *ctx, grn_obj *string) |
| 308 | { |
| 309 | unsigned char *types = NULL; |
| 310 | grn_string *string_ = (grn_string *)string; |
| 311 | GRN_API_ENTER; |
| 312 | if (string_) { |
| 313 | types = string_->ctypes; |
| 314 | } else { |
| 315 | types = NULL; |
| 316 | } |
| 317 | GRN_API_RETURN(types); |
| 318 | } |
| 319 | |
| 320 | grn_rc |
| 321 | grn_string_set_types(grn_ctx *ctx, grn_obj *string, unsigned char *types) |
| 322 | { |
| 323 | grn_rc rc; |
| 324 | grn_string *string_ = (grn_string *)string; |
| 325 | GRN_API_ENTER; |
| 326 | if (string_) { |
| 327 | if (string_->ctypes) { GRN_FREE(string_->ctypes); } |
| 328 | string_->ctypes = types; |
| 329 | rc = GRN_SUCCESS; |
| 330 | } else { |
| 331 | rc = GRN_INVALID_ARGUMENT; |
| 332 | } |
| 333 | GRN_API_RETURN(rc); |
| 334 | } |
| 335 | |
| 336 | grn_encoding |
| 337 | grn_string_get_encoding(grn_ctx *ctx, grn_obj *string) |
| 338 | { |
| 339 | grn_encoding encoding = GRN_ENC_NONE; |
| 340 | grn_string *string_ = (grn_string *)string; |
| 341 | GRN_API_ENTER; |
| 342 | if (string_) { |
| 343 | encoding = string_->encoding; |
| 344 | } |
| 345 | GRN_API_RETURN(encoding); |
| 346 | } |
| 347 | |
| 348 | grn_rc |
| 349 | grn_string_inspect(grn_ctx *ctx, grn_obj *buffer, grn_obj *string) |
| 350 | { |
| 351 | grn_string *string_ = (grn_string *)string; |
| 352 | |
| 353 | GRN_TEXT_PUTS(ctx, buffer, "#<string:" ); |
| 354 | |
| 355 | GRN_TEXT_PUTS(ctx, buffer, " original:<" ); |
| 356 | GRN_TEXT_PUT(ctx, buffer, |
| 357 | string_->original, |
| 358 | string_->original_length_in_bytes); |
| 359 | GRN_TEXT_PUTS(ctx, buffer, ">" ); |
| 360 | GRN_TEXT_PUTS(ctx, buffer, "(" ); |
| 361 | grn_text_itoa(ctx, buffer, string_->original_length_in_bytes); |
| 362 | GRN_TEXT_PUTS(ctx, buffer, ")" ); |
| 363 | |
| 364 | GRN_TEXT_PUTS(ctx, buffer, " normalized:<" ); |
| 365 | GRN_TEXT_PUT(ctx, buffer, |
| 366 | string_->normalized, |
| 367 | string_->normalized_length_in_bytes); |
| 368 | GRN_TEXT_PUTS(ctx, buffer, ">" ); |
| 369 | GRN_TEXT_PUTS(ctx, buffer, "(" ); |
| 370 | grn_text_itoa(ctx, buffer, string_->normalized_length_in_bytes); |
| 371 | GRN_TEXT_PUTS(ctx, buffer, ")" ); |
| 372 | |
| 373 | GRN_TEXT_PUTS(ctx, buffer, " n_characters:" ); |
| 374 | grn_text_itoa(ctx, buffer, string_->n_characters); |
| 375 | |
| 376 | GRN_TEXT_PUTS(ctx, buffer, " encoding:" ); |
| 377 | grn_inspect_encoding(ctx, buffer, string_->encoding); |
| 378 | |
| 379 | GRN_TEXT_PUTS(ctx, buffer, " flags:" ); |
| 380 | if (string_->flags & GRN_STRING_REMOVE_BLANK) { |
| 381 | GRN_TEXT_PUTS(ctx, buffer, "REMOVE_BLANK|" ); |
| 382 | } |
| 383 | if (string_->flags & GRN_STRING_WITH_TYPES) { |
| 384 | GRN_TEXT_PUTS(ctx, buffer, "WITH_TYPES|" ); |
| 385 | } |
| 386 | if (string_->flags & GRN_STRING_WITH_CHECKS) { |
| 387 | GRN_TEXT_PUTS(ctx, buffer, "WITH_CHECKS|" ); |
| 388 | } |
| 389 | if (string_->flags & GRN_STRING_REMOVE_TOKENIZED_DELIMITER) { |
| 390 | GRN_TEXT_PUTS(ctx, buffer, "REMOVE_TOKENIZED_DELIMITER|" ); |
| 391 | } |
| 392 | if (GRN_TEXT_VALUE(buffer)[GRN_TEXT_LEN(buffer) - 1] == '|') { |
| 393 | grn_bulk_truncate(ctx, buffer, GRN_TEXT_LEN(buffer) - 1); |
| 394 | } |
| 395 | |
| 396 | GRN_TEXT_PUTS(ctx, buffer, ">" ); |
| 397 | |
| 398 | return GRN_SUCCESS; |
| 399 | } |
| 400 | |
| 401 | grn_rc |
| 402 | grn_string_close(grn_ctx *ctx, grn_obj *string) |
| 403 | { |
| 404 | grn_rc rc; |
| 405 | grn_string *string_ = (grn_string *)string; |
| 406 | if (string_) { |
| 407 | if (string_->normalized) { GRN_FREE(string_->normalized); } |
| 408 | if (string_->ctypes) { GRN_FREE(string_->ctypes); } |
| 409 | if (string_->checks) { GRN_FREE(string_->checks); } |
| 410 | GRN_FREE(string); |
| 411 | rc = GRN_SUCCESS; |
| 412 | } else { |
| 413 | rc = GRN_INVALID_ARGUMENT; |
| 414 | } |
| 415 | return rc; |
| 416 | } |
| 417 | |