| 1 | /* -*- c-basic-offset: 2 -*- */ | 
| 2 | /* | 
| 3 |   Copyright(C) 2012 Brazil | 
| 4 |  | 
| 5 |   This library is free software; you can redistribute it and/or | 
| 6 |   modify it under the terms of the GNU Lesser General Public | 
| 7 |   License version 2.1 as published by the Free Software Foundation. | 
| 8 |  | 
| 9 |   This library is distributed in the hope that it will be useful, | 
| 10 |   but WITHOUT ANY WARRANTY; without even the implied warranty of | 
| 11 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | 
| 12 |   Lesser General Public License for more details. | 
| 13 |  | 
| 14 |   You should have received a copy of the GNU Lesser General Public | 
| 15 |   License along with this library; if not, write to the Free Software | 
| 16 |   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA | 
| 17 | */ | 
| 18 |  | 
| 19 | #include <string.h> | 
| 20 |  | 
| 21 | #include "grn_normalizer.h" | 
| 22 | #include "grn_string.h" | 
| 23 | #include "grn_nfkc.h" | 
| 24 | #include <groonga/normalizer.h> | 
| 25 | #include <groonga/tokenizer.h> | 
| 26 |  | 
| 27 | grn_rc | 
| 28 | grn_normalizer_register(grn_ctx *ctx, | 
| 29 |                         const char *name_ptr, | 
| 30 |                         int name_length, | 
| 31 |                         grn_proc_func *init, | 
| 32 |                         grn_proc_func *next, | 
| 33 |                         grn_proc_func *fin) | 
| 34 | { | 
| 35 |   grn_expr_var vars[] = { | 
| 36 |     { NULL, 0 } | 
| 37 |   }; | 
| 38 |   GRN_PTR_INIT(&vars[0].value, 0, GRN_ID_NIL); | 
| 39 |  | 
| 40 |   if (name_length < 0) { | 
| 41 |     name_length = strlen(name_ptr); | 
| 42 |   } | 
| 43 |  | 
| 44 |   { | 
| 45 |     grn_obj * const normalizer = grn_proc_create(ctx, | 
| 46 |                                                  name_ptr, name_length, | 
| 47 |                                                  GRN_PROC_NORMALIZER, | 
| 48 |                                                  init, next, fin, | 
| 49 |                                                  sizeof(*vars) / sizeof(vars), | 
| 50 |                                                  vars); | 
| 51 |     if (!normalizer) { | 
| 52 |       GRN_PLUGIN_ERROR(ctx, GRN_NORMALIZER_ERROR, | 
| 53 |                        "[normalizer] failed to register normalizer: <%.*s>" , | 
| 54 |                        name_length, name_ptr); | 
| 55 |       return ctx->rc; | 
| 56 |     } | 
| 57 |   } | 
| 58 |   return GRN_SUCCESS; | 
| 59 | } | 
| 60 |  | 
| 61 | grn_rc | 
| 62 | grn_normalizer_init(void) | 
| 63 | { | 
| 64 |   return GRN_SUCCESS; | 
| 65 | } | 
| 66 |  | 
| 67 | grn_rc | 
| 68 | grn_normalizer_fin(void) | 
| 69 | { | 
| 70 |   return GRN_SUCCESS; | 
| 71 | } | 
| 72 |  | 
/*
 * Two-byte symbol -> ASCII replacement table (92 entries).
 *
 * eucjp_normalize() indexes it with (second byte - 0xa4) for lead byte 0xa1;
 * sjis_normalize() indexes it with (second byte - 0x43) for second bytes up
 * to 0x7e and (second byte - 0x44) from 0x7f on, for lead byte 0x81.
 * A non-zero entry is the single ASCII byte written to the output; 0 means
 * the two-byte character is copied through unchanged.
 */
static unsigned char symbol[] = {
  /* 0x00 */ ',', '.', 0, ':', ';', '?', '!', 0,
             0, 0, '`', 0, '^', '~', '_', 0,
  /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,
             0, '-', '-', '/', '\\', 0, 0, '|',
  /* 0x20 */ 0, 0, 0, '\'', 0, '"', '(', ')',
             0, 0, '[', ']', '{', '}', 0, 0,
  /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,
             '+', '-', 0, 0, 0, '=', 0, '<',
  /* 0x40 */ '>', 0, 0, 0, 0, 0, 0, 0,
             0, 0, 0, 0, '$', 0, 0, '%',
  /* 0x50 */ '#', '&', '*', '@', 0, 0, 0, 0,
             0, 0, 0, 0
};
| 80 |  | 
| 81 | inline static grn_obj * | 
| 82 | eucjp_normalize(grn_ctx *ctx, grn_string *nstr) | 
| 83 | { | 
| 84 |   static uint16_t hankana[] = { | 
| 85 |     0xa1a1, 0xa1a3, 0xa1d6, 0xa1d7, 0xa1a2, 0xa1a6, 0xa5f2, 0xa5a1, 0xa5a3, | 
| 86 |     0xa5a5, 0xa5a7, 0xa5a9, 0xa5e3, 0xa5e5, 0xa5e7, 0xa5c3, 0xa1bc, 0xa5a2, | 
| 87 |     0xa5a4, 0xa5a6, 0xa5a8, 0xa5aa, 0xa5ab, 0xa5ad, 0xa5af, 0xa5b1, 0xa5b3, | 
| 88 |     0xa5b5, 0xa5b7, 0xa5b9, 0xa5bb, 0xa5bd, 0xa5bf, 0xa5c1, 0xa5c4, 0xa5c6, | 
| 89 |     0xa5c8, 0xa5ca, 0xa5cb, 0xa5cc, 0xa5cd, 0xa5ce, 0xa5cf, 0xa5d2, 0xa5d5, | 
| 90 |     0xa5d8, 0xa5db, 0xa5de, 0xa5df, 0xa5e0, 0xa5e1, 0xa5e2, 0xa5e4, 0xa5e6, | 
| 91 |     0xa5e8, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5ef, 0xa5f3, 0xa1ab, | 
| 92 |     0xa1eb | 
| 93 |   }; | 
| 94 |   static unsigned char dakuten[] = { | 
| 95 |     0xf4, 0, 0, 0, 0, 0xac, 0, 0xae, 0, 0xb0, 0, 0xb2, 0, 0xb4, 0, 0xb6, 0, | 
| 96 |     0xb8, 0, 0xba, 0, 0xbc, 0, 0xbe, 0, 0xc0, 0, 0xc2, 0, 0, 0xc5, 0, 0xc7, | 
| 97 |     0, 0xc9, 0, 0, 0, 0, 0, 0, 0xd0, 0, 0, 0xd3, 0, 0, 0xd6, 0, 0, 0xd9, 0, | 
| 98 |     0, 0xdc | 
| 99 |   }; | 
| 100 |   static unsigned char handaku[] = { | 
| 101 |     0xd1, 0, 0, 0xd4, 0, 0, 0xd7, 0, 0, 0xda, 0, 0, 0xdd | 
| 102 |   }; | 
| 103 |   int16_t *ch; | 
| 104 |   const unsigned char *s, *s_, *e; | 
| 105 |   unsigned char *d, *d0, *d_, b; | 
| 106 |   uint_least8_t *cp, *ctypes, ctype; | 
| 107 |   size_t size = nstr->original_length_in_bytes, length = 0; | 
| 108 |   int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK; | 
| 109 |   if (!(nstr->normalized = GRN_MALLOC(size * 2 + 1))) { | 
| 110 |     ERR(GRN_NO_MEMORY_AVAILABLE, | 
| 111 |         "[string][eucjp] failed to allocate normalized text space" ); | 
| 112 |     return NULL; | 
| 113 |   } | 
| 114 |   d0 = (unsigned char *) nstr->normalized; | 
| 115 |   if (nstr->flags & GRN_STRING_WITH_CHECKS) { | 
| 116 |     if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) { | 
| 117 |       GRN_FREE(nstr->normalized); | 
| 118 |       nstr->normalized = NULL; | 
| 119 |       ERR(GRN_NO_MEMORY_AVAILABLE, | 
| 120 |           "[string][eucjp] failed to allocate checks space" ); | 
| 121 |       return NULL; | 
| 122 |     } | 
| 123 |   } | 
| 124 |   ch = nstr->checks; | 
| 125 |   if (nstr->flags & GRN_STRING_WITH_TYPES) { | 
| 126 |     if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { | 
| 127 |       GRN_FREE(nstr->checks); | 
| 128 |       GRN_FREE(nstr->normalized); | 
| 129 |       nstr->checks = NULL; | 
| 130 |       nstr->normalized = NULL; | 
| 131 |       ERR(GRN_NO_MEMORY_AVAILABLE, | 
| 132 |           "[string][eucjp] failed to allocate character types space" ); | 
| 133 |       return NULL; | 
| 134 |     } | 
| 135 |   } | 
| 136 |   cp = ctypes = nstr->ctypes; | 
| 137 |   e = (unsigned char *)nstr->original + size; | 
| 138 |   for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) { | 
| 139 |     if ((*s & 0x80)) { | 
| 140 |       if (((s + 1) < e) && (*(s + 1) & 0x80)) { | 
| 141 |         unsigned char c1 = *s++, c2 = *s, c3 = 0; | 
| 142 |         switch (c1 >> 4) { | 
| 143 |         case 0x08 : | 
| 144 |           if (c1 == 0x8e && 0xa0 <= c2 && c2 <= 0xdf) { | 
| 145 |             uint16_t c = hankana[c2 - 0xa0]; | 
| 146 |             switch (c) { | 
| 147 |             case 0xa1ab : | 
| 148 |               if (d > d0 + 1 && d[-2] == 0xa5 | 
| 149 |                   && 0xa6 <= d[-1] && d[-1] <= 0xdb && (b = dakuten[d[-1] - 0xa6])) { | 
| 150 |                 *(d - 1) = b; | 
| 151 |                 if (ch) { ch[-1] += 2; s_ += 2; } | 
| 152 |                 continue; | 
| 153 |               } else { | 
| 154 |                 *d++ = c >> 8; *d = c & 0xff; | 
| 155 |               } | 
| 156 |               break; | 
| 157 |             case 0xa1eb : | 
| 158 |               if (d > d0 + 1 && d[-2] == 0xa5 | 
| 159 |                   && 0xcf <= d[-1] && d[-1] <= 0xdb && (b = handaku[d[-1] - 0xcf])) { | 
| 160 |                 *(d - 1) = b; | 
| 161 |                 if (ch) { ch[-1] += 2; s_ += 2; } | 
| 162 |                 continue; | 
| 163 |               } else { | 
| 164 |                 *d++ = c >> 8; *d = c & 0xff; | 
| 165 |               } | 
| 166 |               break; | 
| 167 |             default : | 
| 168 |               *d++ = c >> 8; *d = c & 0xff; | 
| 169 |               break; | 
| 170 |             } | 
| 171 |             ctype = GRN_CHAR_KATAKANA; | 
| 172 |           } else { | 
| 173 |             *d++ = c1; *d = c2; | 
| 174 |             ctype = GRN_CHAR_OTHERS; | 
| 175 |           } | 
| 176 |           break; | 
| 177 |         case 0x09 : | 
| 178 |           *d++ = c1; *d = c2; | 
| 179 |           ctype = GRN_CHAR_OTHERS; | 
| 180 |           break; | 
| 181 |         case 0x0a : | 
| 182 |           switch (c1 & 0x0f) { | 
| 183 |           case 1 : | 
| 184 |             switch (c2) { | 
| 185 |             case 0xbc : | 
| 186 |               *d++ = c1; *d = c2; | 
| 187 |               ctype = GRN_CHAR_KATAKANA; | 
| 188 |               break; | 
| 189 |             case 0xb9 : | 
| 190 |               *d++ = c1; *d = c2; | 
| 191 |               ctype = GRN_CHAR_KANJI; | 
| 192 |               break; | 
| 193 |             case 0xa1 : | 
| 194 |               if (removeblankp) { | 
| 195 |                 if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } | 
| 196 |                 continue; | 
| 197 |               } else { | 
| 198 |                 *d = ' '; | 
| 199 |                 ctype = GRN_CHAR_BLANK|GRN_CHAR_SYMBOL; | 
| 200 |               } | 
| 201 |               break; | 
| 202 |             default : | 
| 203 |               if (c2 >= 0xa4 && (c3 = symbol[c2 - 0xa4])) { | 
| 204 |                 *d = c3; | 
| 205 |                 ctype = GRN_CHAR_SYMBOL; | 
| 206 |               } else { | 
| 207 |                 *d++ = c1; *d = c2; | 
| 208 |                 ctype = GRN_CHAR_OTHERS; | 
| 209 |               } | 
| 210 |               break; | 
| 211 |             } | 
| 212 |             break; | 
| 213 |           case 2 : | 
| 214 |             *d++ = c1; *d = c2; | 
| 215 |             ctype = GRN_CHAR_SYMBOL; | 
| 216 |             break; | 
| 217 |           case 3 : | 
| 218 |             c3 = c2 - 0x80; | 
| 219 |             if ('a' <= c3 && c3 <= 'z') { | 
| 220 |               ctype = GRN_CHAR_ALPHA; | 
| 221 |               *d = c3; | 
| 222 |             } else if ('A' <= c3 && c3 <= 'Z') { | 
| 223 |               ctype = GRN_CHAR_ALPHA; | 
| 224 |               *d = c3 + 0x20; | 
| 225 |             } else if ('0' <= c3 && c3 <= '9') { | 
| 226 |               ctype = GRN_CHAR_DIGIT; | 
| 227 |               *d = c3; | 
| 228 |             } else { | 
| 229 |               ctype = GRN_CHAR_OTHERS; | 
| 230 |               *d++ = c1; *d = c2; | 
| 231 |             } | 
| 232 |             break; | 
| 233 |           case 4 : | 
| 234 |             *d++ = c1; *d = c2; | 
| 235 |             ctype = GRN_CHAR_HIRAGANA; | 
| 236 |             break; | 
| 237 |           case 5 : | 
| 238 |             *d++ = c1; *d = c2; | 
| 239 |             ctype = GRN_CHAR_KATAKANA; | 
| 240 |             break; | 
| 241 |           case 6 : | 
| 242 |           case 7 : | 
| 243 |           case 8 : | 
| 244 |             *d++ = c1; *d = c2; | 
| 245 |             ctype = GRN_CHAR_SYMBOL; | 
| 246 |             break; | 
| 247 |           default : | 
| 248 |             *d++ = c1; *d = c2; | 
| 249 |             ctype = GRN_CHAR_OTHERS; | 
| 250 |             break; | 
| 251 |           } | 
| 252 |           break; | 
| 253 |         default : | 
| 254 |           *d++ = c1; *d = c2; | 
| 255 |           ctype = GRN_CHAR_KANJI; | 
| 256 |           break; | 
| 257 |         } | 
| 258 |       } else { | 
| 259 |         /* skip invalid character */ | 
| 260 |         continue; | 
| 261 |       } | 
| 262 |     } else { | 
| 263 |       unsigned char c = *s; | 
| 264 |       switch (c >> 4) { | 
| 265 |       case 0 : | 
| 266 |       case 1 : | 
| 267 |         /* skip unprintable ascii */ | 
| 268 |         if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } | 
| 269 |         continue; | 
| 270 |       case 2 : | 
| 271 |         if (c == 0x20) { | 
| 272 |           if (removeblankp) { | 
| 273 |             if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } | 
| 274 |             continue; | 
| 275 |           } else { | 
| 276 |             *d = ' '; | 
| 277 |             ctype = GRN_CHAR_BLANK|GRN_CHAR_SYMBOL; | 
| 278 |           } | 
| 279 |         } else { | 
| 280 |           *d = c; | 
| 281 |           ctype = GRN_CHAR_SYMBOL; | 
| 282 |         } | 
| 283 |         break; | 
| 284 |       case 3 : | 
| 285 |         *d = c; | 
| 286 |         ctype = (c <= 0x39) ? GRN_CHAR_DIGIT : GRN_CHAR_SYMBOL; | 
| 287 |         break; | 
| 288 |       case 4 : | 
| 289 |         *d = ('A' <= c) ? c + 0x20 : c; | 
| 290 |         ctype = (c == 0x40) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA; | 
| 291 |         break; | 
| 292 |       case 5 : | 
| 293 |         *d = (c <= 'Z') ? c + 0x20 : c; | 
| 294 |         ctype = (c <= 0x5a) ? GRN_CHAR_ALPHA : GRN_CHAR_SYMBOL; | 
| 295 |         break; | 
| 296 |       case 6 : | 
| 297 |         *d = c; | 
| 298 |         ctype = (c == 0x60) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA; | 
| 299 |         break; | 
| 300 |       case 7 : | 
| 301 |         *d = c; | 
| 302 |         ctype = (c <= 0x7a) ? GRN_CHAR_ALPHA : (c == 0x7f ? GRN_CHAR_OTHERS : GRN_CHAR_SYMBOL); | 
| 303 |         break; | 
| 304 |       default : | 
| 305 |         *d = c; | 
| 306 |         ctype = GRN_CHAR_OTHERS; | 
| 307 |         break; | 
| 308 |       } | 
| 309 |     } | 
| 310 |     d++; | 
| 311 |     length++; | 
| 312 |     if (cp) { *cp++ = ctype; } | 
| 313 |     if (ch) { | 
| 314 |       *ch++ = (int16_t)(s + 1 - s_); | 
| 315 |       s_ = s + 1; | 
| 316 |       while (++d_ < d) { *ch++ = 0; } | 
| 317 |     } | 
| 318 |   } | 
| 319 |   if (cp) { *cp = GRN_CHAR_NULL; } | 
| 320 |   *d = '\0'; | 
| 321 |   nstr->n_characters = length; | 
| 322 |   nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized); | 
| 323 |   return NULL; | 
| 324 | } | 
| 325 |  | 
| 326 | inline static grn_obj * | 
| 327 | sjis_normalize(grn_ctx *ctx, grn_string *nstr) | 
| 328 | { | 
| 329 |   static uint16_t hankana[] = { | 
| 330 |     0x8140, 0x8142, 0x8175, 0x8176, 0x8141, 0x8145, 0x8392, 0x8340, 0x8342, | 
| 331 |     0x8344, 0x8346, 0x8348, 0x8383, 0x8385, 0x8387, 0x8362, 0x815b, 0x8341, | 
| 332 |     0x8343, 0x8345, 0x8347, 0x8349, 0x834a, 0x834c, 0x834e, 0x8350, 0x8352, | 
| 333 |     0x8354, 0x8356, 0x8358, 0x835a, 0x835c, 0x835e, 0x8360, 0x8363, 0x8365, | 
| 334 |     0x8367, 0x8369, 0x836a, 0x836b, 0x836c, 0x836d, 0x836e, 0x8371, 0x8374, | 
| 335 |     0x8377, 0x837a, 0x837d, 0x837e, 0x8380, 0x8381, 0x8382, 0x8384, 0x8386, | 
| 336 |     0x8388, 0x8389, 0x838a, 0x838b, 0x838c, 0x838d, 0x838f, 0x8393, 0x814a, | 
| 337 |     0x814b | 
| 338 |   }; | 
| 339 |   static unsigned char dakuten[] = { | 
| 340 |     0x94, 0, 0, 0, 0, 0x4b, 0, 0x4d, 0, 0x4f, 0, 0x51, 0, 0x53, 0, 0x55, 0, | 
| 341 |     0x57, 0, 0x59, 0, 0x5b, 0, 0x5d, 0, 0x5f, 0, 0x61, 0, 0, 0x64, 0, 0x66, | 
| 342 |     0, 0x68, 0, 0, 0, 0, 0, 0, 0x6f, 0, 0, 0x72, 0, 0, 0x75, 0, 0, 0x78, 0, | 
| 343 |     0, 0x7b | 
| 344 |   }; | 
| 345 |   static unsigned char handaku[] = { | 
| 346 |     0x70, 0, 0, 0x73, 0, 0, 0x76, 0, 0, 0x79, 0, 0, 0x7c | 
| 347 |   }; | 
| 348 |   int16_t *ch; | 
| 349 |   const unsigned char *s, *s_; | 
| 350 |   unsigned char *d, *d0, *d_, b, *e; | 
| 351 |   uint_least8_t *cp, *ctypes, ctype; | 
| 352 |   size_t size = nstr->original_length_in_bytes, length = 0; | 
| 353 |   int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK; | 
| 354 |   if (!(nstr->normalized = GRN_MALLOC(size * 2 + 1))) { | 
| 355 |     ERR(GRN_NO_MEMORY_AVAILABLE, | 
| 356 |         "[string][sjis] failed to allocate normalized text space" ); | 
| 357 |     return NULL; | 
| 358 |   } | 
| 359 |   d0 = (unsigned char *) nstr->normalized; | 
| 360 |   if (nstr->flags & GRN_STRING_WITH_CHECKS) { | 
| 361 |     if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) { | 
| 362 |       GRN_FREE(nstr->normalized); | 
| 363 |       nstr->normalized = NULL; | 
| 364 |       ERR(GRN_NO_MEMORY_AVAILABLE, | 
| 365 |           "[string][sjis] failed to allocate checks space" ); | 
| 366 |       return NULL; | 
| 367 |     } | 
| 368 |   } | 
| 369 |   ch = nstr->checks; | 
| 370 |   if (nstr->flags & GRN_STRING_WITH_TYPES) { | 
| 371 |     if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { | 
| 372 |       GRN_FREE(nstr->checks); | 
| 373 |       GRN_FREE(nstr->normalized); | 
| 374 |       nstr->checks = NULL; | 
| 375 |       nstr->normalized = NULL; | 
| 376 |       ERR(GRN_NO_MEMORY_AVAILABLE, | 
| 377 |           "[string][sjis] failed to allocate character types space" ); | 
| 378 |       return NULL; | 
| 379 |     } | 
| 380 |   } | 
| 381 |   cp = ctypes = nstr->ctypes; | 
| 382 |   e = (unsigned char *)nstr->original + size; | 
| 383 |   for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) { | 
| 384 |     if ((*s & 0x80)) { | 
| 385 |       if (0xa0 <= *s && *s <= 0xdf) { | 
| 386 |         uint16_t c = hankana[*s - 0xa0]; | 
| 387 |         switch (c) { | 
| 388 |         case 0x814a : | 
| 389 |           if (d > d0 + 1 && d[-2] == 0x83 | 
| 390 |               && 0x45 <= d[-1] && d[-1] <= 0x7a && (b = dakuten[d[-1] - 0x45])) { | 
| 391 |             *(d - 1) = b; | 
| 392 |             if (ch) { ch[-1]++; s_++; } | 
| 393 |             continue; | 
| 394 |           } else { | 
| 395 |             *d++ = c >> 8; *d = c & 0xff; | 
| 396 |           } | 
| 397 |           break; | 
| 398 |         case 0x814b : | 
| 399 |           if (d > d0 + 1 && d[-2] == 0x83 | 
| 400 |               && 0x6e <= d[-1] && d[-1] <= 0x7a && (b = handaku[d[-1] - 0x6e])) { | 
| 401 |             *(d - 1) = b; | 
| 402 |             if (ch) { ch[-1]++; s_++; } | 
| 403 |             continue; | 
| 404 |           } else { | 
| 405 |             *d++ = c >> 8; *d = c & 0xff; | 
| 406 |           } | 
| 407 |           break; | 
| 408 |         default : | 
| 409 |           *d++ = c >> 8; *d = c & 0xff; | 
| 410 |           break; | 
| 411 |         } | 
| 412 |         ctype = GRN_CHAR_KATAKANA; | 
| 413 |       } else { | 
| 414 |         if ((s + 1) < e && 0x40 <= *(s + 1) && *(s + 1) <= 0xfc) { | 
| 415 |           unsigned char c1 = *s++, c2 = *s, c3 = 0; | 
| 416 |           if (0x81 <= c1 && c1 <= 0x87) { | 
| 417 |             switch (c1 & 0x0f) { | 
| 418 |             case 1 : | 
| 419 |               switch (c2) { | 
| 420 |               case 0x5b : | 
| 421 |                 *d++ = c1; *d = c2; | 
| 422 |                 ctype = GRN_CHAR_KATAKANA; | 
| 423 |                 break; | 
| 424 |               case 0x58 : | 
| 425 |                 *d++ = c1; *d = c2; | 
| 426 |                 ctype = GRN_CHAR_KANJI; | 
| 427 |                 break; | 
| 428 |               case 0x40 : | 
| 429 |                 if (removeblankp) { | 
| 430 |                   if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } | 
| 431 |                   continue; | 
| 432 |                 } else { | 
| 433 |                   *d = ' '; | 
| 434 |                   ctype = GRN_CHAR_BLANK|GRN_CHAR_SYMBOL; | 
| 435 |                 } | 
| 436 |                 break; | 
| 437 |               default : | 
| 438 |                 if (0x43 <= c2 && c2 <= 0x7e && (c3 = symbol[c2 - 0x43])) { | 
| 439 |                   *d = c3; | 
| 440 |                   ctype = GRN_CHAR_SYMBOL; | 
| 441 |                 } else if (0x7f <= c2 && c2 <= 0x97 && (c3 = symbol[c2 - 0x44])) { | 
| 442 |                   *d = c3; | 
| 443 |                   ctype = GRN_CHAR_SYMBOL; | 
| 444 |                 } else { | 
| 445 |                   *d++ = c1; *d = c2; | 
| 446 |                   ctype = GRN_CHAR_OTHERS; | 
| 447 |                 } | 
| 448 |                 break; | 
| 449 |               } | 
| 450 |               break; | 
| 451 |             case 2 : | 
| 452 |               c3 = c2 - 0x1f; | 
| 453 |               if (0x4f <= c2 && c2 <= 0x58) { | 
| 454 |                 ctype = GRN_CHAR_DIGIT; | 
| 455 |                 *d = c2 - 0x1f; | 
| 456 |               } else if (0x60 <= c2 && c2 <= 0x79) { | 
| 457 |                 ctype = GRN_CHAR_ALPHA; | 
| 458 |                 *d = c2 + 0x01; | 
| 459 |               } else if (0x81 <= c2 && c2 <= 0x9a) { | 
| 460 |                 ctype = GRN_CHAR_ALPHA; | 
| 461 |                 *d = c2 - 0x20; | 
| 462 |               } else if (0x9f <= c2 && c2 <= 0xf1) { | 
| 463 |                 *d++ = c1; *d = c2; | 
| 464 |                 ctype = GRN_CHAR_HIRAGANA; | 
| 465 |               } else { | 
| 466 |                 *d++ = c1; *d = c2; | 
| 467 |                 ctype = GRN_CHAR_OTHERS; | 
| 468 |               } | 
| 469 |               break; | 
| 470 |             case 3 : | 
| 471 |               if (0x40 <= c2 && c2 <= 0x96) { | 
| 472 |                 *d++ = c1; *d = c2; | 
| 473 |                 ctype = GRN_CHAR_KATAKANA; | 
| 474 |               } else { | 
| 475 |                 *d++ = c1; *d = c2; | 
| 476 |                 ctype = GRN_CHAR_SYMBOL; | 
| 477 |               } | 
| 478 |               break; | 
| 479 |             case 4 : | 
| 480 |             case 7 : | 
| 481 |               *d++ = c1; *d = c2; | 
| 482 |               ctype = GRN_CHAR_SYMBOL; | 
| 483 |               break; | 
| 484 |             default : | 
| 485 |               *d++ = c1; *d = c2; | 
| 486 |               ctype = GRN_CHAR_OTHERS; | 
| 487 |               break; | 
| 488 |             } | 
| 489 |           } else { | 
| 490 |             *d++ = c1; *d = c2; | 
| 491 |             ctype = GRN_CHAR_KANJI; | 
| 492 |           } | 
| 493 |         } else { | 
| 494 |           /* skip invalid character */ | 
| 495 |           continue; | 
| 496 |         } | 
| 497 |       } | 
| 498 |     } else { | 
| 499 |       unsigned char c = *s; | 
| 500 |       switch (c >> 4) { | 
| 501 |       case 0 : | 
| 502 |       case 1 : | 
| 503 |         /* skip unprintable ascii */ | 
| 504 |         if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } | 
| 505 |         continue; | 
| 506 |       case 2 : | 
| 507 |         if (c == 0x20) { | 
| 508 |           if (removeblankp) { | 
| 509 |             if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } | 
| 510 |             continue; | 
| 511 |           } else { | 
| 512 |             *d = ' '; | 
| 513 |             ctype = GRN_CHAR_BLANK|GRN_CHAR_SYMBOL; | 
| 514 |           } | 
| 515 |         } else { | 
| 516 |           *d = c; | 
| 517 |           ctype = GRN_CHAR_SYMBOL; | 
| 518 |         } | 
| 519 |         break; | 
| 520 |       case 3 : | 
| 521 |         *d = c; | 
| 522 |         ctype = (c <= 0x39) ? GRN_CHAR_DIGIT : GRN_CHAR_SYMBOL; | 
| 523 |         break; | 
| 524 |       case 4 : | 
| 525 |         *d = ('A' <= c) ? c + 0x20 : c; | 
| 526 |         ctype = (c == 0x40) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA; | 
| 527 |         break; | 
| 528 |       case 5 : | 
| 529 |         *d = (c <= 'Z') ? c + 0x20 : c; | 
| 530 |         ctype = (c <= 0x5a) ? GRN_CHAR_ALPHA : GRN_CHAR_SYMBOL; | 
| 531 |         break; | 
| 532 |       case 6 : | 
| 533 |         *d = c; | 
| 534 |         ctype = (c == 0x60) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA; | 
| 535 |         break; | 
| 536 |       case 7 : | 
| 537 |         *d = c; | 
| 538 |         ctype = (c <= 0x7a) ? GRN_CHAR_ALPHA : (c == 0x7f ? GRN_CHAR_OTHERS : GRN_CHAR_SYMBOL); | 
| 539 |         break; | 
| 540 |       default : | 
| 541 |         *d = c; | 
| 542 |         ctype = GRN_CHAR_OTHERS; | 
| 543 |         break; | 
| 544 |       } | 
| 545 |     } | 
| 546 |     d++; | 
| 547 |     length++; | 
| 548 |     if (cp) { *cp++ = ctype; } | 
| 549 |     if (ch) { | 
| 550 |       *ch++ = (int16_t)(s + 1 - s_); | 
| 551 |       s_ = s + 1; | 
| 552 |       while (++d_ < d) { *ch++ = 0; } | 
| 553 |     } | 
| 554 |   } | 
| 555 |   if (cp) { *cp = GRN_CHAR_NULL; } | 
| 556 |   *d = '\0'; | 
| 557 |   nstr->n_characters = length; | 
| 558 |   nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized); | 
| 559 |   return NULL; | 
| 560 | } | 
| 561 |  | 
| 562 | #ifdef GRN_WITH_NFKC | 
| 563 | static inline int | 
| 564 | grn_str_charlen_utf8(grn_ctx *ctx, const unsigned char *str, const unsigned char *end) | 
| 565 | { | 
| 566 |   /* MEMO: This function allows non-null-terminated string as str. */ | 
| 567 |   /*       But requires the end of string. */ | 
| 568 |   const unsigned char *p = str; | 
| 569 |   if (end <= p || !*p) { return 0; } | 
| 570 |   if (*p & 0x80) { | 
| 571 |     int b, w; | 
| 572 |     int size; | 
| 573 |     int i; | 
| 574 |     for (b = 0x40, w = 0; b && (*p & b); b >>= 1, w++); | 
| 575 |     if (!w) { | 
| 576 |       GRN_LOG(ctx, GRN_LOG_WARNING, | 
| 577 |               "invalid utf8 string: the first bit is 0x80: <%.*s>: <%.*s>" , | 
| 578 |               (int)(end - p), p, | 
| 579 |               (int)(end - str), str); | 
| 580 |       return 0; | 
| 581 |     } | 
| 582 |     size = w + 1; | 
| 583 |     for (i = 1; i < size; i++) { | 
| 584 |       if (++p >= end) { | 
| 585 |         GRN_LOG(ctx, GRN_LOG_WARNING, | 
| 586 |                 "invalid utf8 string: too short: "  | 
| 587 |                 "%d byte is required but %d byte is given: <%.*s>" , | 
| 588 |                 size, i, | 
| 589 |                 (int)(end - str), str); | 
| 590 |         return 0; | 
| 591 |       } | 
| 592 |       if (!*p) { | 
| 593 |         GRN_LOG(ctx, GRN_LOG_WARNING, | 
| 594 |                 "invalid utf8 string: NULL character is found: <%.*s>" , | 
| 595 |                 (int)(end - str), str); | 
| 596 |         return 0; | 
| 597 |       } | 
| 598 |       if ((*p & 0xc0) != 0x80) { | 
| 599 |         GRN_LOG(ctx, GRN_LOG_WARNING, | 
| 600 |                 "invalid utf8 string: 0x80 is not allowed: <%.*s>: <%.*s>" , | 
| 601 |                 (int)(end - p), p, | 
| 602 |                 (int)(end - str), str); | 
| 603 |         return 0; | 
| 604 |       } | 
| 605 |     } | 
| 606 |     return size; | 
| 607 |   } else { | 
| 608 |     return 1; | 
| 609 |   } | 
| 610 |   return 0; | 
| 611 | } | 
| 612 |  | 
| 613 | inline static grn_obj * | 
| 614 | utf8_normalize(grn_ctx *ctx, grn_string *nstr) | 
| 615 | { | 
| 616 |   int16_t *ch; | 
| 617 |   const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e; | 
| 618 |   unsigned char *d, *d_, *de; | 
| 619 |   uint_least8_t *cp; | 
| 620 |   size_t length = 0, ls, lp, size = nstr->original_length_in_bytes, ds = size * 3; | 
| 621 |   int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK; | 
| 622 |   grn_bool remove_tokenized_delimiter_p = | 
| 623 |     nstr->flags & GRN_STRING_REMOVE_TOKENIZED_DELIMITER; | 
| 624 |   if (!(nstr->normalized = GRN_MALLOC(ds + 1))) { | 
| 625 |     ERR(GRN_NO_MEMORY_AVAILABLE, | 
| 626 |         "[string][utf8] failed to allocate normalized text space" ); | 
| 627 |     return NULL; | 
| 628 |   } | 
| 629 |   if (nstr->flags & GRN_STRING_WITH_CHECKS) { | 
| 630 |     if (!(nstr->checks = GRN_MALLOC(ds * sizeof(int16_t) + 1))) { | 
| 631 |       GRN_FREE(nstr->normalized); | 
| 632 |       nstr->normalized = NULL; | 
| 633 |       ERR(GRN_NO_MEMORY_AVAILABLE, | 
| 634 |           "[string][utf8] failed to allocate checks space" ); | 
| 635 |       return NULL; | 
| 636 |     } | 
| 637 |   } | 
| 638 |   ch = nstr->checks; | 
| 639 |   if (nstr->flags & GRN_STRING_WITH_TYPES) { | 
| 640 |     if (!(nstr->ctypes = GRN_MALLOC(ds + 1))) { | 
| 641 |       if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; } | 
| 642 |       GRN_FREE(nstr->normalized); nstr->normalized = NULL; | 
| 643 |       ERR(GRN_NO_MEMORY_AVAILABLE, | 
| 644 |           "[string][utf8] failed to allocate character types space" ); | 
| 645 |       return NULL; | 
| 646 |     } | 
| 647 |   } | 
| 648 |   cp = nstr->ctypes; | 
| 649 |   d = (unsigned char *)nstr->normalized; | 
| 650 |   de = d + ds; | 
| 651 |   d_ = NULL; | 
| 652 |   e = (unsigned char *)nstr->original + size; | 
| 653 |   for (s = s_ = (unsigned char *)nstr->original; ; s += ls) { | 
| 654 |     if (!(ls = grn_str_charlen_utf8(ctx, s, e))) { | 
| 655 |       break; | 
| 656 |     } | 
| 657 |     if (remove_tokenized_delimiter_p && | 
| 658 |         grn_tokenizer_is_tokenized_delimiter(ctx, (const char *)s, ls, | 
| 659 |                                              GRN_ENC_UTF8)) { | 
| 660 |       continue; | 
| 661 |     } | 
| 662 |     if ((p = (unsigned char *)grn_nfkc_decompose(s))) { | 
| 663 |       pe = p + strlen((char *)p); | 
| 664 |     } else { | 
| 665 |       p = s; | 
| 666 |       pe = p + ls; | 
| 667 |     } | 
| 668 |     if (d_ && (p2 = (unsigned char *)grn_nfkc_compose(d_, p))) { | 
| 669 |       p = p2; | 
| 670 |       pe = p + strlen((char *)p); | 
| 671 |       if (cp) { cp--; } | 
| 672 |       if (ch) { | 
| 673 |         ch -= (d - d_); | 
| 674 |         if (ch[0] >= 0) { | 
| 675 |           s_ = s__; | 
| 676 |         } | 
| 677 |       } | 
| 678 |       d = d_; | 
| 679 |       length--; | 
| 680 |     } | 
| 681 |     for (; ; p += lp) { | 
| 682 |       if (!(lp = grn_str_charlen_utf8(ctx, p, pe))) { | 
| 683 |         break; | 
| 684 |       } | 
| 685 |       if ((*p == ' ' && removeblankp) || *p < 0x20  /* skip unprintable ascii */ ) { | 
| 686 |         if (cp > nstr->ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } | 
| 687 |       } else { | 
| 688 |         if (de <= d + lp) { | 
| 689 |           unsigned char *normalized; | 
| 690 |           ds += (ds >> 1) + lp; | 
| 691 |           if (!(normalized = GRN_REALLOC(nstr->normalized, ds + 1))) { | 
| 692 |             if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; } | 
| 693 |             if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; } | 
| 694 |             GRN_FREE(nstr->normalized); nstr->normalized = NULL; | 
| 695 |             ERR(GRN_NO_MEMORY_AVAILABLE, | 
| 696 |                 "[string][utf8] failed to expand normalized text space" ); | 
| 697 |             return NULL; | 
| 698 |           } | 
| 699 |           de = normalized + ds; | 
| 700 |           d = normalized + (d - (unsigned char *)nstr->normalized); | 
| 701 |           nstr->normalized = (char *)normalized; | 
| 702 |           if (ch) { | 
| 703 |             int16_t *checks; | 
| 704 |             if (!(checks = GRN_REALLOC(nstr->checks, ds * sizeof(int16_t) + 1))) { | 
| 705 |               if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; } | 
| 706 |               GRN_FREE(nstr->checks); nstr->checks = NULL; | 
| 707 |               GRN_FREE(nstr->normalized); nstr->normalized = NULL; | 
| 708 |               ERR(GRN_NO_MEMORY_AVAILABLE, | 
| 709 |                   "[string][utf8] failed to expand checks space" ); | 
| 710 |               return NULL; | 
| 711 |             } | 
| 712 |             ch = checks + (ch - nstr->checks); | 
| 713 |             nstr->checks = checks; | 
| 714 |           } | 
| 715 |           if (cp) { | 
| 716 |             uint_least8_t *ctypes; | 
| 717 |             if (!(ctypes = GRN_REALLOC(nstr->ctypes, ds + 1))) { | 
| 718 |               GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; | 
| 719 |               if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; } | 
| 720 |               GRN_FREE(nstr->normalized); nstr->normalized = NULL; | 
| 721 |               ERR(GRN_NO_MEMORY_AVAILABLE, | 
| 722 |                   "[string][utf8] failed to expand character types space" ); | 
| 723 |               return NULL; | 
| 724 |             } | 
| 725 |             cp = ctypes + (cp - nstr->ctypes); | 
| 726 |             nstr->ctypes = ctypes; | 
| 727 |           } | 
| 728 |         } | 
| 729 |         grn_memcpy(d, p, lp); | 
| 730 |         d_ = d; | 
| 731 |         d += lp; | 
| 732 |         length++; | 
| 733 |         if (cp) { *cp++ = grn_nfkc_char_type(p); } | 
| 734 |         if (ch) { | 
| 735 |           size_t i; | 
| 736 |           if (s_ == s + ls) { | 
| 737 |             *ch++ = -1; | 
| 738 |           } else { | 
| 739 |             *ch++ = (int16_t)(s + ls - s_); | 
| 740 |             s__ = s_; | 
| 741 |             s_ = s + ls; | 
| 742 |           } | 
| 743 |           for (i = lp; i > 1; i--) { *ch++ = 0; } | 
| 744 |         } | 
| 745 |       } | 
| 746 |     } | 
| 747 |   } | 
| 748 |   if (cp) { *cp = GRN_CHAR_NULL; } | 
| 749 |   *d = '\0'; | 
| 750 |   nstr->n_characters = length; | 
| 751 |   nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized); | 
| 752 |   return NULL; | 
| 753 | } | 
| 754 | #endif /* GRN_WITH_NFKC */ | 
| 755 |  | 
| 756 | inline static grn_obj * | 
| 757 | ascii_normalize(grn_ctx *ctx, grn_string *nstr) | 
| 758 | { | 
| 759 |   int16_t *ch; | 
| 760 |   const unsigned char *s, *s_, *e; | 
| 761 |   unsigned char *d, *d0, *d_; | 
| 762 |   uint_least8_t *cp, *ctypes, ctype; | 
| 763 |   size_t size = nstr->original_length_in_bytes, length = 0; | 
| 764 |   int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK; | 
| 765 |   if (!(nstr->normalized = GRN_MALLOC(size + 1))) { | 
| 766 |     ERR(GRN_NO_MEMORY_AVAILABLE, | 
| 767 |         "[string][ascii] failed to allocate normalized text space" ); | 
| 768 |     return NULL; | 
| 769 |   } | 
| 770 |   d0 = (unsigned char *) nstr->normalized; | 
| 771 |   if (nstr->flags & GRN_STRING_WITH_CHECKS) { | 
| 772 |     if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) { | 
| 773 |       GRN_FREE(nstr->normalized); | 
| 774 |       nstr->normalized = NULL; | 
| 775 |       ERR(GRN_NO_MEMORY_AVAILABLE, | 
| 776 |           "[string][ascii] failed to allocate checks space" ); | 
| 777 |       return NULL; | 
| 778 |     } | 
| 779 |   } | 
| 780 |   ch = nstr->checks; | 
| 781 |   if (nstr->flags & GRN_STRING_WITH_TYPES) { | 
| 782 |     if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { | 
| 783 |       GRN_FREE(nstr->checks); | 
| 784 |       GRN_FREE(nstr->normalized); | 
| 785 |       nstr->checks = NULL; | 
| 786 |       nstr->normalized = NULL; | 
| 787 |       ERR(GRN_NO_MEMORY_AVAILABLE, | 
| 788 |           "[string][ascii] failed to allocate character types space" ); | 
| 789 |       return NULL; | 
| 790 |     } | 
| 791 |   } | 
| 792 |   cp = ctypes = nstr->ctypes; | 
| 793 |   e = (unsigned char *)nstr->original + size; | 
| 794 |   for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) { | 
| 795 |     unsigned char c = *s; | 
| 796 |     switch (c >> 4) { | 
| 797 |     case 0 : | 
| 798 |     case 1 : | 
| 799 |       /* skip unprintable ascii */ | 
| 800 |       if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } | 
| 801 |       continue; | 
| 802 |     case 2 : | 
| 803 |       if (c == 0x20) { | 
| 804 |         if (removeblankp) { | 
| 805 |           if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } | 
| 806 |           continue; | 
| 807 |         } else { | 
| 808 |           *d = ' '; | 
| 809 |           ctype = GRN_CHAR_BLANK|GRN_CHAR_SYMBOL; | 
| 810 |         } | 
| 811 |       } else { | 
| 812 |         *d = c; | 
| 813 |         ctype = GRN_CHAR_SYMBOL; | 
| 814 |       } | 
| 815 |       break; | 
| 816 |     case 3 : | 
| 817 |       *d = c; | 
| 818 |       ctype = (c <= 0x39) ? GRN_CHAR_DIGIT : GRN_CHAR_SYMBOL; | 
| 819 |       break; | 
| 820 |     case 4 : | 
| 821 |       *d = ('A' <= c) ? c + 0x20 : c; | 
| 822 |       ctype = (c == 0x40) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA; | 
| 823 |       break; | 
| 824 |     case 5 : | 
| 825 |       *d = (c <= 'Z') ? c + 0x20 : c; | 
| 826 |       ctype = (c <= 0x5a) ? GRN_CHAR_ALPHA : GRN_CHAR_SYMBOL; | 
| 827 |       break; | 
| 828 |     case 6 : | 
| 829 |       *d = c; | 
| 830 |       ctype = (c == 0x60) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA; | 
| 831 |       break; | 
| 832 |     case 7 : | 
| 833 |       *d = c; | 
| 834 |       ctype = (c <= 0x7a) ? GRN_CHAR_ALPHA : (c == 0x7f ? GRN_CHAR_OTHERS : GRN_CHAR_SYMBOL); | 
| 835 |       break; | 
| 836 |     default : | 
| 837 |       *d = c; | 
| 838 |       ctype = GRN_CHAR_OTHERS; | 
| 839 |       break; | 
| 840 |     } | 
| 841 |     d++; | 
| 842 |     length++; | 
| 843 |     if (cp) { *cp++ = ctype; } | 
| 844 |     if (ch) { | 
| 845 |       *ch++ = (int16_t)(s + 1 - s_); | 
| 846 |       s_ = s + 1; | 
| 847 |       while (++d_ < d) { *ch++ = 0; } | 
| 848 |     } | 
| 849 |   } | 
| 850 |   if (cp) { *cp = GRN_CHAR_NULL; } | 
| 851 |   *d = '\0'; | 
| 852 |   nstr->n_characters = length; | 
| 853 |   nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized); | 
| 854 |   return NULL; | 
| 855 | } | 
| 856 |  | 
| 857 | /* use cp1252 as latin1 */ | 
| 858 | inline static grn_obj * | 
| 859 | latin1_normalize(grn_ctx *ctx, grn_string *nstr) | 
| 860 | { | 
| 861 |   int16_t *ch; | 
| 862 |   const unsigned char *s, *s_, *e; | 
| 863 |   unsigned char *d, *d0, *d_; | 
| 864 |   uint_least8_t *cp, *ctypes, ctype; | 
| 865 |   size_t size = nstr->original_length_in_bytes, length = 0; | 
| 866 |   int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK; | 
| 867 |   if (!(nstr->normalized = GRN_MALLOC(size + 1))) { | 
| 868 |     ERR(GRN_NO_MEMORY_AVAILABLE, | 
| 869 |         "[string][latin1] failed to allocate normalized text space" ); | 
| 870 |     return NULL; | 
| 871 |   } | 
| 872 |   d0 = (unsigned char *) nstr->normalized; | 
| 873 |   if (nstr->flags & GRN_STRING_WITH_CHECKS) { | 
| 874 |     if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) { | 
| 875 |       GRN_FREE(nstr->normalized); | 
| 876 |       nstr->normalized = NULL; | 
| 877 |       ERR(GRN_NO_MEMORY_AVAILABLE, | 
| 878 |           "[string][latin1] failed to allocate checks space" ); | 
| 879 |       return NULL; | 
| 880 |     } | 
| 881 |   } | 
| 882 |   ch = nstr->checks; | 
| 883 |   if (nstr->flags & GRN_STRING_WITH_TYPES) { | 
| 884 |     if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { | 
| 885 |       GRN_FREE(nstr->checks); | 
| 886 |       GRN_FREE(nstr->normalized); | 
| 887 |       nstr->checks = NULL; | 
| 888 |       nstr->normalized = NULL; | 
| 889 |       ERR(GRN_NO_MEMORY_AVAILABLE, | 
| 890 |           "[normalizer][latin1] failed to allocate character types space" ); | 
| 891 |       return NULL; | 
| 892 |     } | 
| 893 |   } | 
| 894 |   cp = ctypes = nstr->ctypes; | 
| 895 |   e = (unsigned char *)nstr->original + size; | 
| 896 |   for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) { | 
| 897 |     unsigned char c = *s; | 
| 898 |     switch (c >> 4) { | 
| 899 |     case 0 : | 
| 900 |     case 1 : | 
| 901 |       /* skip unprintable ascii */ | 
| 902 |       if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } | 
| 903 |       continue; | 
| 904 |     case 2 : | 
| 905 |       if (c == 0x20) { | 
| 906 |         if (removeblankp) { | 
| 907 |           if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } | 
| 908 |           continue; | 
| 909 |         } else { | 
| 910 |           *d = ' '; | 
| 911 |           ctype = GRN_CHAR_BLANK|GRN_CHAR_SYMBOL; | 
| 912 |         } | 
| 913 |       } else { | 
| 914 |         *d = c; | 
| 915 |         ctype = GRN_CHAR_SYMBOL; | 
| 916 |       } | 
| 917 |       break; | 
| 918 |     case 3 : | 
| 919 |       *d = c; | 
| 920 |       ctype = (c <= 0x39) ? GRN_CHAR_DIGIT : GRN_CHAR_SYMBOL; | 
| 921 |       break; | 
| 922 |     case 4 : | 
| 923 |       *d = ('A' <= c) ? c + 0x20 : c; | 
| 924 |       ctype = (c == 0x40) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA; | 
| 925 |       break; | 
| 926 |     case 5 : | 
| 927 |       *d = (c <= 'Z') ? c + 0x20 : c; | 
| 928 |       ctype = (c <= 0x5a) ? GRN_CHAR_ALPHA : GRN_CHAR_SYMBOL; | 
| 929 |       break; | 
| 930 |     case 6 : | 
| 931 |       *d = c; | 
| 932 |       ctype = (c == 0x60) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA; | 
| 933 |       break; | 
| 934 |     case 7 : | 
| 935 |       *d = c; | 
| 936 |       ctype = (c <= 0x7a) ? GRN_CHAR_ALPHA : (c == 0x7f ? GRN_CHAR_OTHERS : GRN_CHAR_SYMBOL); | 
| 937 |       break; | 
| 938 |     case 8 : | 
| 939 |       if (c == 0x8a || c == 0x8c || c == 0x8e) { | 
| 940 |         *d = c + 0x10; | 
| 941 |         ctype = GRN_CHAR_ALPHA; | 
| 942 |       } else { | 
| 943 |         *d = c; | 
| 944 |         ctype = GRN_CHAR_SYMBOL; | 
| 945 |       } | 
| 946 |       break; | 
| 947 |     case 9 : | 
| 948 |       if (c == 0x9a || c == 0x9c || c == 0x9e || c == 0x9f) { | 
| 949 |         *d = (c == 0x9f) ? c + 0x60 : c; | 
| 950 |         ctype = GRN_CHAR_ALPHA; | 
| 951 |       } else { | 
| 952 |         *d = c; | 
| 953 |         ctype = GRN_CHAR_SYMBOL; | 
| 954 |       } | 
| 955 |       break; | 
| 956 |     case 0x0c : | 
| 957 |       *d = c + 0x20; | 
| 958 |       ctype = GRN_CHAR_ALPHA; | 
| 959 |       break; | 
| 960 |     case 0x0d : | 
| 961 |       *d = (c == 0xd7 || c == 0xdf) ? c : c + 0x20; | 
| 962 |       ctype = (c == 0xd7) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA; | 
| 963 |       break; | 
| 964 |     case 0x0e : | 
| 965 |       *d = c; | 
| 966 |       ctype = GRN_CHAR_ALPHA; | 
| 967 |       break; | 
| 968 |     case 0x0f : | 
| 969 |       *d = c; | 
| 970 |       ctype = (c == 0xf7) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA; | 
| 971 |       break; | 
| 972 |     default : | 
| 973 |       *d = c; | 
| 974 |       ctype = GRN_CHAR_OTHERS; | 
| 975 |       break; | 
| 976 |     } | 
| 977 |     d++; | 
| 978 |     length++; | 
| 979 |     if (cp) { *cp++ = ctype; } | 
| 980 |     if (ch) { | 
| 981 |       *ch++ = (int16_t)(s + 1 - s_); | 
| 982 |       s_ = s + 1; | 
| 983 |       while (++d_ < d) { *ch++ = 0; } | 
| 984 |     } | 
| 985 |   } | 
| 986 |   if (cp) { *cp = GRN_CHAR_NULL; } | 
| 987 |   *d = '\0'; | 
| 988 |   nstr->n_characters = length; | 
| 989 |   nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized); | 
| 990 |   return NULL; | 
| 991 | } | 
| 992 |  | 
| 993 | inline static grn_obj * | 
| 994 | koi8r_normalize(grn_ctx *ctx, grn_string *nstr) | 
| 995 | { | 
| 996 |   int16_t *ch; | 
| 997 |   const unsigned char *s, *s_, *e; | 
| 998 |   unsigned char *d, *d0, *d_; | 
| 999 |   uint_least8_t *cp, *ctypes, ctype; | 
| 1000 |   size_t size = nstr->original_length_in_bytes, length = 0; | 
| 1001 |   int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK; | 
| 1002 |   if (!(nstr->normalized = GRN_MALLOC(size + 1))) { | 
| 1003 |     ERR(GRN_NO_MEMORY_AVAILABLE, | 
| 1004 |         "[string][koi8r] failed to allocate normalized text space" ); | 
| 1005 |     return NULL; | 
| 1006 |   } | 
| 1007 |   d0 = (unsigned char *) nstr->normalized; | 
| 1008 |   if (nstr->flags & GRN_STRING_WITH_CHECKS) { | 
| 1009 |     if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) { | 
| 1010 |       GRN_FREE(nstr->normalized); | 
| 1011 |       nstr->normalized = NULL; | 
| 1012 |       ERR(GRN_NO_MEMORY_AVAILABLE, | 
| 1013 |           "[string][koi8r] failed to allocate checks space" ); | 
| 1014 |       return NULL; | 
| 1015 |     } | 
| 1016 |   } | 
| 1017 |   ch = nstr->checks; | 
| 1018 |   if (nstr->flags & GRN_STRING_WITH_TYPES) { | 
| 1019 |     if (!(nstr->ctypes = GRN_MALLOC(size + 1))) { | 
| 1020 |       GRN_FREE(nstr->checks); | 
| 1021 |       GRN_FREE(nstr->normalized); | 
| 1022 |       nstr->checks = NULL; | 
| 1023 |       nstr->normalized = NULL; | 
| 1024 |       ERR(GRN_NO_MEMORY_AVAILABLE, | 
| 1025 |           "[string][koi8r] failed to allocate character types space" ); | 
| 1026 |       return NULL; | 
| 1027 |     } | 
| 1028 |   } | 
| 1029 |   cp = ctypes = nstr->ctypes; | 
| 1030 |   e = (unsigned char *)nstr->original + size; | 
| 1031 |   for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) { | 
| 1032 |     unsigned char c = *s; | 
| 1033 |     switch (c >> 4) { | 
| 1034 |     case 0 : | 
| 1035 |     case 1 : | 
| 1036 |       /* skip unprintable ascii */ | 
| 1037 |       if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } | 
| 1038 |       continue; | 
| 1039 |     case 2 : | 
| 1040 |       if (c == 0x20) { | 
| 1041 |         if (removeblankp) { | 
| 1042 |           if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; } | 
| 1043 |           continue; | 
| 1044 |         } else { | 
| 1045 |           *d = ' '; | 
| 1046 |           ctype = GRN_CHAR_BLANK|GRN_CHAR_SYMBOL; | 
| 1047 |         } | 
| 1048 |       } else { | 
| 1049 |         *d = c; | 
| 1050 |         ctype = GRN_CHAR_SYMBOL; | 
| 1051 |       } | 
| 1052 |       break; | 
| 1053 |     case 3 : | 
| 1054 |       *d = c; | 
| 1055 |       ctype = (c <= 0x39) ? GRN_CHAR_DIGIT : GRN_CHAR_SYMBOL; | 
| 1056 |       break; | 
| 1057 |     case 4 : | 
| 1058 |       *d = ('A' <= c) ? c + 0x20 : c; | 
| 1059 |       ctype = (c == 0x40) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA; | 
| 1060 |       break; | 
| 1061 |     case 5 : | 
| 1062 |       *d = (c <= 'Z') ? c + 0x20 : c; | 
| 1063 |       ctype = (c <= 0x5a) ? GRN_CHAR_ALPHA : GRN_CHAR_SYMBOL; | 
| 1064 |       break; | 
| 1065 |     case 6 : | 
| 1066 |       *d = c; | 
| 1067 |       ctype = (c == 0x60) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA; | 
| 1068 |       break; | 
| 1069 |     case 7 : | 
| 1070 |       *d = c; | 
| 1071 |       ctype = (c <= 0x7a) ? GRN_CHAR_ALPHA : (c == 0x7f ? GRN_CHAR_OTHERS : GRN_CHAR_SYMBOL); | 
| 1072 |       break; | 
| 1073 |     case 0x0a : | 
| 1074 |       *d = c; | 
| 1075 |       ctype = (c == 0xa3) ? GRN_CHAR_ALPHA : GRN_CHAR_OTHERS; | 
| 1076 |       break; | 
| 1077 |     case 0x0b : | 
| 1078 |       if (c == 0xb3) { | 
| 1079 |         *d = c - 0x10; | 
| 1080 |         ctype = GRN_CHAR_ALPHA; | 
| 1081 |       } else { | 
| 1082 |         *d = c; | 
| 1083 |         ctype = GRN_CHAR_OTHERS; | 
| 1084 |       } | 
| 1085 |       break; | 
| 1086 |     case 0x0c : | 
| 1087 |     case 0x0d : | 
| 1088 |       *d = c; | 
| 1089 |       ctype = GRN_CHAR_ALPHA; | 
| 1090 |       break; | 
| 1091 |     case 0x0e : | 
| 1092 |     case 0x0f : | 
| 1093 |       *d = c - 0x20; | 
| 1094 |       ctype = GRN_CHAR_ALPHA; | 
| 1095 |       break; | 
| 1096 |     default : | 
| 1097 |       *d = c; | 
| 1098 |       ctype = GRN_CHAR_OTHERS; | 
| 1099 |       break; | 
| 1100 |     } | 
| 1101 |     d++; | 
| 1102 |     length++; | 
| 1103 |     if (cp) { *cp++ = ctype; } | 
| 1104 |     if (ch) { | 
| 1105 |       *ch++ = (int16_t)(s + 1 - s_); | 
| 1106 |       s_ = s + 1; | 
| 1107 |       while (++d_ < d) { *ch++ = 0; } | 
| 1108 |     } | 
| 1109 |   } | 
| 1110 |   if (cp) { *cp = GRN_CHAR_NULL; } | 
| 1111 |   *d = '\0'; | 
| 1112 |   nstr->n_characters = length; | 
| 1113 |   nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized); | 
| 1114 |   return NULL; | 
| 1115 | } | 
| 1116 |  | 
| 1117 | static grn_obj * | 
| 1118 | auto_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) | 
| 1119 | { | 
| 1120 |   grn_string *string = (grn_string *)(args[0]); | 
| 1121 |   switch (string->encoding) { | 
| 1122 |   case GRN_ENC_EUC_JP : | 
| 1123 |     eucjp_normalize(ctx, string); | 
| 1124 |     break; | 
| 1125 |   case GRN_ENC_UTF8 : | 
| 1126 | #ifdef GRN_WITH_NFKC | 
| 1127 |     utf8_normalize(ctx, string); | 
| 1128 | #else /* GRN_WITH_NFKC */ | 
| 1129 |     ascii_normalize(ctx, string); | 
| 1130 | #endif /* GRN_WITH_NFKC */ | 
| 1131 |     break; | 
| 1132 |   case GRN_ENC_SJIS : | 
| 1133 |     sjis_normalize(ctx, string); | 
| 1134 |     break; | 
| 1135 |   case GRN_ENC_LATIN1 : | 
| 1136 |     latin1_normalize(ctx, string); | 
| 1137 |     break; | 
| 1138 |   case GRN_ENC_KOI8R : | 
| 1139 |     koi8r_normalize(ctx, string); | 
| 1140 |     break; | 
| 1141 |   default : | 
| 1142 |     ascii_normalize(ctx, string); | 
| 1143 |     break; | 
| 1144 |   } | 
| 1145 |   return NULL; | 
| 1146 | } | 
| 1147 |  | 
| 1148 | #ifdef GRN_WITH_NFKC | 
| 1149 | static grn_obj * | 
| 1150 | nfkc51_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) | 
| 1151 | { | 
| 1152 |   grn_string *string = (grn_string *)(args[0]); | 
| 1153 |   utf8_normalize(ctx, string); | 
| 1154 |   return NULL; | 
| 1155 | } | 
| 1156 | #endif /* GRN_WITH_NFKC */ | 
| 1157 |  | 
| 1158 | grn_rc | 
| 1159 | grn_normalizer_normalize(grn_ctx *ctx, grn_obj *normalizer, grn_obj *string) | 
| 1160 | { | 
| 1161 |   grn_rc rc; | 
| 1162 |   int nargs = 0; | 
| 1163 |  | 
| 1164 |   grn_ctx_push(ctx, string); | 
| 1165 |   nargs++; | 
| 1166 |   rc = grn_proc_call(ctx, normalizer, nargs, NULL); | 
| 1167 |   grn_ctx_pop(ctx); | 
| 1168 |  | 
| 1169 |   return rc; | 
| 1170 | } | 
| 1171 |  | 
| 1172 | grn_rc | 
| 1173 | grn_db_init_builtin_normalizers(grn_ctx *ctx) | 
| 1174 | { | 
| 1175 |   const char *normalizer_nfkc51_name = "NormalizerNFKC51" ; | 
| 1176 |  | 
| 1177 |   grn_normalizer_register(ctx, GRN_NORMALIZER_AUTO_NAME, -1, | 
| 1178 |                           NULL, auto_next, NULL); | 
| 1179 |  | 
| 1180 | #ifdef GRN_WITH_NFKC | 
| 1181 |   grn_normalizer_register(ctx, normalizer_nfkc51_name, -1, | 
| 1182 |                           NULL, nfkc51_next, NULL); | 
| 1183 | #else /* GRN_WITH_NFKC */ | 
| 1184 |   grn_normalizer_register(ctx, normalizer_nfkc51_name, -1, | 
| 1185 |                           NULL, NULL, NULL); | 
| 1186 | #endif /* GRN_WITH_NFKC */ | 
| 1187 | /* | 
| 1188 |   grn_normalizer_register(ctx, "NormalizerUCA", -1, | 
| 1189 |                           NULL, uca_next, NULL); | 
| 1190 | */ | 
| 1191 |  | 
| 1192 |   return GRN_SUCCESS; | 
| 1193 | } | 
| 1194 |  |