1/* -*- c-basic-offset: 2 -*- */
2/*
3 Copyright(C) 2012 Brazil
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License version 2.1 as published by the Free Software Foundation.
8
9 This library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
13
14 You should have received a copy of the GNU Lesser General Public
15 License along with this library; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17*/
18
19#include <string.h>
20
21#include "grn_normalizer.h"
22#include "grn_string.h"
23#include "grn_nfkc.h"
24#include <groonga/normalizer.h>
25#include <groonga/tokenizer.h>
26
27grn_rc
28grn_normalizer_register(grn_ctx *ctx,
29 const char *name_ptr,
30 int name_length,
31 grn_proc_func *init,
32 grn_proc_func *next,
33 grn_proc_func *fin)
34{
35 grn_expr_var vars[] = {
36 { NULL, 0 }
37 };
38 GRN_PTR_INIT(&vars[0].value, 0, GRN_ID_NIL);
39
40 if (name_length < 0) {
41 name_length = strlen(name_ptr);
42 }
43
44 {
45 grn_obj * const normalizer = grn_proc_create(ctx,
46 name_ptr, name_length,
47 GRN_PROC_NORMALIZER,
48 init, next, fin,
49 sizeof(*vars) / sizeof(vars),
50 vars);
51 if (!normalizer) {
52 GRN_PLUGIN_ERROR(ctx, GRN_NORMALIZER_ERROR,
53 "[normalizer] failed to register normalizer: <%.*s>",
54 name_length, name_ptr);
55 return ctx->rc;
56 }
57 }
58 return GRN_SUCCESS;
59}
60
61grn_rc
62grn_normalizer_init(void)
63{
64 return GRN_SUCCESS;
65}
66
67grn_rc
68grn_normalizer_fin(void)
69{
70 return GRN_SUCCESS;
71}
72
/*
 * ASCII replacements for two-byte symbol characters.
 *
 * The table is indexed by the distance of the trail byte from the first
 * mapped code point:
 *   - eucjp_normalize() uses symbol[c2 - 0xa4] for lead byte 0xa1;
 *   - sjis_normalize() uses symbol[c2 - 0x43] (0x43 <= c2 <= 0x7e) or
 *     symbol[c2 - 0x44] (0x7f <= c2 <= 0x97) for lead byte 0x81.
 * An entry of 0 means "no ASCII replacement": the caller keeps the original
 * two-byte character.  The table is only ever read, hence const.
 */
static const unsigned char symbol[] = {
  /*  0 */ ',', '.', 0, ':', ';', '?', '!', 0, 0, 0, '`', 0, '^', '~', '_', 0, 0, 0,
  /* 18 */ 0, 0, 0, 0, 0, 0, 0, '-', '-', '/', '\\', 0, 0, '|', 0, 0, 0, '\'', 0,
  /* 37 */ '"', '(', ')', 0, 0, '[', ']', '{', '}', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 56 */ '+', '-', 0, 0, 0, '=', 0, '<', '>', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 76 */ '$', 0, 0, '%', '#', '&', '*', '@', 0, 0, 0, 0, 0, 0, 0, 0
};
80
81inline static grn_obj *
82eucjp_normalize(grn_ctx *ctx, grn_string *nstr)
83{
84 static uint16_t hankana[] = {
85 0xa1a1, 0xa1a3, 0xa1d6, 0xa1d7, 0xa1a2, 0xa1a6, 0xa5f2, 0xa5a1, 0xa5a3,
86 0xa5a5, 0xa5a7, 0xa5a9, 0xa5e3, 0xa5e5, 0xa5e7, 0xa5c3, 0xa1bc, 0xa5a2,
87 0xa5a4, 0xa5a6, 0xa5a8, 0xa5aa, 0xa5ab, 0xa5ad, 0xa5af, 0xa5b1, 0xa5b3,
88 0xa5b5, 0xa5b7, 0xa5b9, 0xa5bb, 0xa5bd, 0xa5bf, 0xa5c1, 0xa5c4, 0xa5c6,
89 0xa5c8, 0xa5ca, 0xa5cb, 0xa5cc, 0xa5cd, 0xa5ce, 0xa5cf, 0xa5d2, 0xa5d5,
90 0xa5d8, 0xa5db, 0xa5de, 0xa5df, 0xa5e0, 0xa5e1, 0xa5e2, 0xa5e4, 0xa5e6,
91 0xa5e8, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5ef, 0xa5f3, 0xa1ab,
92 0xa1eb
93 };
94 static unsigned char dakuten[] = {
95 0xf4, 0, 0, 0, 0, 0xac, 0, 0xae, 0, 0xb0, 0, 0xb2, 0, 0xb4, 0, 0xb6, 0,
96 0xb8, 0, 0xba, 0, 0xbc, 0, 0xbe, 0, 0xc0, 0, 0xc2, 0, 0, 0xc5, 0, 0xc7,
97 0, 0xc9, 0, 0, 0, 0, 0, 0, 0xd0, 0, 0, 0xd3, 0, 0, 0xd6, 0, 0, 0xd9, 0,
98 0, 0xdc
99 };
100 static unsigned char handaku[] = {
101 0xd1, 0, 0, 0xd4, 0, 0, 0xd7, 0, 0, 0xda, 0, 0, 0xdd
102 };
103 int16_t *ch;
104 const unsigned char *s, *s_, *e;
105 unsigned char *d, *d0, *d_, b;
106 uint_least8_t *cp, *ctypes, ctype;
107 size_t size = nstr->original_length_in_bytes, length = 0;
108 int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
109 if (!(nstr->normalized = GRN_MALLOC(size * 2 + 1))) {
110 ERR(GRN_NO_MEMORY_AVAILABLE,
111 "[string][eucjp] failed to allocate normalized text space");
112 return NULL;
113 }
114 d0 = (unsigned char *) nstr->normalized;
115 if (nstr->flags & GRN_STRING_WITH_CHECKS) {
116 if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) {
117 GRN_FREE(nstr->normalized);
118 nstr->normalized = NULL;
119 ERR(GRN_NO_MEMORY_AVAILABLE,
120 "[string][eucjp] failed to allocate checks space");
121 return NULL;
122 }
123 }
124 ch = nstr->checks;
125 if (nstr->flags & GRN_STRING_WITH_TYPES) {
126 if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
127 GRN_FREE(nstr->checks);
128 GRN_FREE(nstr->normalized);
129 nstr->checks = NULL;
130 nstr->normalized = NULL;
131 ERR(GRN_NO_MEMORY_AVAILABLE,
132 "[string][eucjp] failed to allocate character types space");
133 return NULL;
134 }
135 }
136 cp = ctypes = nstr->ctypes;
137 e = (unsigned char *)nstr->original + size;
138 for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
139 if ((*s & 0x80)) {
140 if (((s + 1) < e) && (*(s + 1) & 0x80)) {
141 unsigned char c1 = *s++, c2 = *s, c3 = 0;
142 switch (c1 >> 4) {
143 case 0x08 :
144 if (c1 == 0x8e && 0xa0 <= c2 && c2 <= 0xdf) {
145 uint16_t c = hankana[c2 - 0xa0];
146 switch (c) {
147 case 0xa1ab :
148 if (d > d0 + 1 && d[-2] == 0xa5
149 && 0xa6 <= d[-1] && d[-1] <= 0xdb && (b = dakuten[d[-1] - 0xa6])) {
150 *(d - 1) = b;
151 if (ch) { ch[-1] += 2; s_ += 2; }
152 continue;
153 } else {
154 *d++ = c >> 8; *d = c & 0xff;
155 }
156 break;
157 case 0xa1eb :
158 if (d > d0 + 1 && d[-2] == 0xa5
159 && 0xcf <= d[-1] && d[-1] <= 0xdb && (b = handaku[d[-1] - 0xcf])) {
160 *(d - 1) = b;
161 if (ch) { ch[-1] += 2; s_ += 2; }
162 continue;
163 } else {
164 *d++ = c >> 8; *d = c & 0xff;
165 }
166 break;
167 default :
168 *d++ = c >> 8; *d = c & 0xff;
169 break;
170 }
171 ctype = GRN_CHAR_KATAKANA;
172 } else {
173 *d++ = c1; *d = c2;
174 ctype = GRN_CHAR_OTHERS;
175 }
176 break;
177 case 0x09 :
178 *d++ = c1; *d = c2;
179 ctype = GRN_CHAR_OTHERS;
180 break;
181 case 0x0a :
182 switch (c1 & 0x0f) {
183 case 1 :
184 switch (c2) {
185 case 0xbc :
186 *d++ = c1; *d = c2;
187 ctype = GRN_CHAR_KATAKANA;
188 break;
189 case 0xb9 :
190 *d++ = c1; *d = c2;
191 ctype = GRN_CHAR_KANJI;
192 break;
193 case 0xa1 :
194 if (removeblankp) {
195 if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
196 continue;
197 } else {
198 *d = ' ';
199 ctype = GRN_CHAR_BLANK|GRN_CHAR_SYMBOL;
200 }
201 break;
202 default :
203 if (c2 >= 0xa4 && (c3 = symbol[c2 - 0xa4])) {
204 *d = c3;
205 ctype = GRN_CHAR_SYMBOL;
206 } else {
207 *d++ = c1; *d = c2;
208 ctype = GRN_CHAR_OTHERS;
209 }
210 break;
211 }
212 break;
213 case 2 :
214 *d++ = c1; *d = c2;
215 ctype = GRN_CHAR_SYMBOL;
216 break;
217 case 3 :
218 c3 = c2 - 0x80;
219 if ('a' <= c3 && c3 <= 'z') {
220 ctype = GRN_CHAR_ALPHA;
221 *d = c3;
222 } else if ('A' <= c3 && c3 <= 'Z') {
223 ctype = GRN_CHAR_ALPHA;
224 *d = c3 + 0x20;
225 } else if ('0' <= c3 && c3 <= '9') {
226 ctype = GRN_CHAR_DIGIT;
227 *d = c3;
228 } else {
229 ctype = GRN_CHAR_OTHERS;
230 *d++ = c1; *d = c2;
231 }
232 break;
233 case 4 :
234 *d++ = c1; *d = c2;
235 ctype = GRN_CHAR_HIRAGANA;
236 break;
237 case 5 :
238 *d++ = c1; *d = c2;
239 ctype = GRN_CHAR_KATAKANA;
240 break;
241 case 6 :
242 case 7 :
243 case 8 :
244 *d++ = c1; *d = c2;
245 ctype = GRN_CHAR_SYMBOL;
246 break;
247 default :
248 *d++ = c1; *d = c2;
249 ctype = GRN_CHAR_OTHERS;
250 break;
251 }
252 break;
253 default :
254 *d++ = c1; *d = c2;
255 ctype = GRN_CHAR_KANJI;
256 break;
257 }
258 } else {
259 /* skip invalid character */
260 continue;
261 }
262 } else {
263 unsigned char c = *s;
264 switch (c >> 4) {
265 case 0 :
266 case 1 :
267 /* skip unprintable ascii */
268 if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
269 continue;
270 case 2 :
271 if (c == 0x20) {
272 if (removeblankp) {
273 if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
274 continue;
275 } else {
276 *d = ' ';
277 ctype = GRN_CHAR_BLANK|GRN_CHAR_SYMBOL;
278 }
279 } else {
280 *d = c;
281 ctype = GRN_CHAR_SYMBOL;
282 }
283 break;
284 case 3 :
285 *d = c;
286 ctype = (c <= 0x39) ? GRN_CHAR_DIGIT : GRN_CHAR_SYMBOL;
287 break;
288 case 4 :
289 *d = ('A' <= c) ? c + 0x20 : c;
290 ctype = (c == 0x40) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
291 break;
292 case 5 :
293 *d = (c <= 'Z') ? c + 0x20 : c;
294 ctype = (c <= 0x5a) ? GRN_CHAR_ALPHA : GRN_CHAR_SYMBOL;
295 break;
296 case 6 :
297 *d = c;
298 ctype = (c == 0x60) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
299 break;
300 case 7 :
301 *d = c;
302 ctype = (c <= 0x7a) ? GRN_CHAR_ALPHA : (c == 0x7f ? GRN_CHAR_OTHERS : GRN_CHAR_SYMBOL);
303 break;
304 default :
305 *d = c;
306 ctype = GRN_CHAR_OTHERS;
307 break;
308 }
309 }
310 d++;
311 length++;
312 if (cp) { *cp++ = ctype; }
313 if (ch) {
314 *ch++ = (int16_t)(s + 1 - s_);
315 s_ = s + 1;
316 while (++d_ < d) { *ch++ = 0; }
317 }
318 }
319 if (cp) { *cp = GRN_CHAR_NULL; }
320 *d = '\0';
321 nstr->n_characters = length;
322 nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
323 return NULL;
324}
325
326inline static grn_obj *
327sjis_normalize(grn_ctx *ctx, grn_string *nstr)
328{
329 static uint16_t hankana[] = {
330 0x8140, 0x8142, 0x8175, 0x8176, 0x8141, 0x8145, 0x8392, 0x8340, 0x8342,
331 0x8344, 0x8346, 0x8348, 0x8383, 0x8385, 0x8387, 0x8362, 0x815b, 0x8341,
332 0x8343, 0x8345, 0x8347, 0x8349, 0x834a, 0x834c, 0x834e, 0x8350, 0x8352,
333 0x8354, 0x8356, 0x8358, 0x835a, 0x835c, 0x835e, 0x8360, 0x8363, 0x8365,
334 0x8367, 0x8369, 0x836a, 0x836b, 0x836c, 0x836d, 0x836e, 0x8371, 0x8374,
335 0x8377, 0x837a, 0x837d, 0x837e, 0x8380, 0x8381, 0x8382, 0x8384, 0x8386,
336 0x8388, 0x8389, 0x838a, 0x838b, 0x838c, 0x838d, 0x838f, 0x8393, 0x814a,
337 0x814b
338 };
339 static unsigned char dakuten[] = {
340 0x94, 0, 0, 0, 0, 0x4b, 0, 0x4d, 0, 0x4f, 0, 0x51, 0, 0x53, 0, 0x55, 0,
341 0x57, 0, 0x59, 0, 0x5b, 0, 0x5d, 0, 0x5f, 0, 0x61, 0, 0, 0x64, 0, 0x66,
342 0, 0x68, 0, 0, 0, 0, 0, 0, 0x6f, 0, 0, 0x72, 0, 0, 0x75, 0, 0, 0x78, 0,
343 0, 0x7b
344 };
345 static unsigned char handaku[] = {
346 0x70, 0, 0, 0x73, 0, 0, 0x76, 0, 0, 0x79, 0, 0, 0x7c
347 };
348 int16_t *ch;
349 const unsigned char *s, *s_;
350 unsigned char *d, *d0, *d_, b, *e;
351 uint_least8_t *cp, *ctypes, ctype;
352 size_t size = nstr->original_length_in_bytes, length = 0;
353 int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
354 if (!(nstr->normalized = GRN_MALLOC(size * 2 + 1))) {
355 ERR(GRN_NO_MEMORY_AVAILABLE,
356 "[string][sjis] failed to allocate normalized text space");
357 return NULL;
358 }
359 d0 = (unsigned char *) nstr->normalized;
360 if (nstr->flags & GRN_STRING_WITH_CHECKS) {
361 if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) {
362 GRN_FREE(nstr->normalized);
363 nstr->normalized = NULL;
364 ERR(GRN_NO_MEMORY_AVAILABLE,
365 "[string][sjis] failed to allocate checks space");
366 return NULL;
367 }
368 }
369 ch = nstr->checks;
370 if (nstr->flags & GRN_STRING_WITH_TYPES) {
371 if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
372 GRN_FREE(nstr->checks);
373 GRN_FREE(nstr->normalized);
374 nstr->checks = NULL;
375 nstr->normalized = NULL;
376 ERR(GRN_NO_MEMORY_AVAILABLE,
377 "[string][sjis] failed to allocate character types space");
378 return NULL;
379 }
380 }
381 cp = ctypes = nstr->ctypes;
382 e = (unsigned char *)nstr->original + size;
383 for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
384 if ((*s & 0x80)) {
385 if (0xa0 <= *s && *s <= 0xdf) {
386 uint16_t c = hankana[*s - 0xa0];
387 switch (c) {
388 case 0x814a :
389 if (d > d0 + 1 && d[-2] == 0x83
390 && 0x45 <= d[-1] && d[-1] <= 0x7a && (b = dakuten[d[-1] - 0x45])) {
391 *(d - 1) = b;
392 if (ch) { ch[-1]++; s_++; }
393 continue;
394 } else {
395 *d++ = c >> 8; *d = c & 0xff;
396 }
397 break;
398 case 0x814b :
399 if (d > d0 + 1 && d[-2] == 0x83
400 && 0x6e <= d[-1] && d[-1] <= 0x7a && (b = handaku[d[-1] - 0x6e])) {
401 *(d - 1) = b;
402 if (ch) { ch[-1]++; s_++; }
403 continue;
404 } else {
405 *d++ = c >> 8; *d = c & 0xff;
406 }
407 break;
408 default :
409 *d++ = c >> 8; *d = c & 0xff;
410 break;
411 }
412 ctype = GRN_CHAR_KATAKANA;
413 } else {
414 if ((s + 1) < e && 0x40 <= *(s + 1) && *(s + 1) <= 0xfc) {
415 unsigned char c1 = *s++, c2 = *s, c3 = 0;
416 if (0x81 <= c1 && c1 <= 0x87) {
417 switch (c1 & 0x0f) {
418 case 1 :
419 switch (c2) {
420 case 0x5b :
421 *d++ = c1; *d = c2;
422 ctype = GRN_CHAR_KATAKANA;
423 break;
424 case 0x58 :
425 *d++ = c1; *d = c2;
426 ctype = GRN_CHAR_KANJI;
427 break;
428 case 0x40 :
429 if (removeblankp) {
430 if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
431 continue;
432 } else {
433 *d = ' ';
434 ctype = GRN_CHAR_BLANK|GRN_CHAR_SYMBOL;
435 }
436 break;
437 default :
438 if (0x43 <= c2 && c2 <= 0x7e && (c3 = symbol[c2 - 0x43])) {
439 *d = c3;
440 ctype = GRN_CHAR_SYMBOL;
441 } else if (0x7f <= c2 && c2 <= 0x97 && (c3 = symbol[c2 - 0x44])) {
442 *d = c3;
443 ctype = GRN_CHAR_SYMBOL;
444 } else {
445 *d++ = c1; *d = c2;
446 ctype = GRN_CHAR_OTHERS;
447 }
448 break;
449 }
450 break;
451 case 2 :
452 c3 = c2 - 0x1f;
453 if (0x4f <= c2 && c2 <= 0x58) {
454 ctype = GRN_CHAR_DIGIT;
455 *d = c2 - 0x1f;
456 } else if (0x60 <= c2 && c2 <= 0x79) {
457 ctype = GRN_CHAR_ALPHA;
458 *d = c2 + 0x01;
459 } else if (0x81 <= c2 && c2 <= 0x9a) {
460 ctype = GRN_CHAR_ALPHA;
461 *d = c2 - 0x20;
462 } else if (0x9f <= c2 && c2 <= 0xf1) {
463 *d++ = c1; *d = c2;
464 ctype = GRN_CHAR_HIRAGANA;
465 } else {
466 *d++ = c1; *d = c2;
467 ctype = GRN_CHAR_OTHERS;
468 }
469 break;
470 case 3 :
471 if (0x40 <= c2 && c2 <= 0x96) {
472 *d++ = c1; *d = c2;
473 ctype = GRN_CHAR_KATAKANA;
474 } else {
475 *d++ = c1; *d = c2;
476 ctype = GRN_CHAR_SYMBOL;
477 }
478 break;
479 case 4 :
480 case 7 :
481 *d++ = c1; *d = c2;
482 ctype = GRN_CHAR_SYMBOL;
483 break;
484 default :
485 *d++ = c1; *d = c2;
486 ctype = GRN_CHAR_OTHERS;
487 break;
488 }
489 } else {
490 *d++ = c1; *d = c2;
491 ctype = GRN_CHAR_KANJI;
492 }
493 } else {
494 /* skip invalid character */
495 continue;
496 }
497 }
498 } else {
499 unsigned char c = *s;
500 switch (c >> 4) {
501 case 0 :
502 case 1 :
503 /* skip unprintable ascii */
504 if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
505 continue;
506 case 2 :
507 if (c == 0x20) {
508 if (removeblankp) {
509 if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
510 continue;
511 } else {
512 *d = ' ';
513 ctype = GRN_CHAR_BLANK|GRN_CHAR_SYMBOL;
514 }
515 } else {
516 *d = c;
517 ctype = GRN_CHAR_SYMBOL;
518 }
519 break;
520 case 3 :
521 *d = c;
522 ctype = (c <= 0x39) ? GRN_CHAR_DIGIT : GRN_CHAR_SYMBOL;
523 break;
524 case 4 :
525 *d = ('A' <= c) ? c + 0x20 : c;
526 ctype = (c == 0x40) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
527 break;
528 case 5 :
529 *d = (c <= 'Z') ? c + 0x20 : c;
530 ctype = (c <= 0x5a) ? GRN_CHAR_ALPHA : GRN_CHAR_SYMBOL;
531 break;
532 case 6 :
533 *d = c;
534 ctype = (c == 0x60) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
535 break;
536 case 7 :
537 *d = c;
538 ctype = (c <= 0x7a) ? GRN_CHAR_ALPHA : (c == 0x7f ? GRN_CHAR_OTHERS : GRN_CHAR_SYMBOL);
539 break;
540 default :
541 *d = c;
542 ctype = GRN_CHAR_OTHERS;
543 break;
544 }
545 }
546 d++;
547 length++;
548 if (cp) { *cp++ = ctype; }
549 if (ch) {
550 *ch++ = (int16_t)(s + 1 - s_);
551 s_ = s + 1;
552 while (++d_ < d) { *ch++ = 0; }
553 }
554 }
555 if (cp) { *cp = GRN_CHAR_NULL; }
556 *d = '\0';
557 nstr->n_characters = length;
558 nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
559 return NULL;
560}
561
562#ifdef GRN_WITH_NFKC
563static inline int
564grn_str_charlen_utf8(grn_ctx *ctx, const unsigned char *str, const unsigned char *end)
565{
566 /* MEMO: This function allows non-null-terminated string as str. */
567 /* But requires the end of string. */
568 const unsigned char *p = str;
569 if (end <= p || !*p) { return 0; }
570 if (*p & 0x80) {
571 int b, w;
572 int size;
573 int i;
574 for (b = 0x40, w = 0; b && (*p & b); b >>= 1, w++);
575 if (!w) {
576 GRN_LOG(ctx, GRN_LOG_WARNING,
577 "invalid utf8 string: the first bit is 0x80: <%.*s>: <%.*s>",
578 (int)(end - p), p,
579 (int)(end - str), str);
580 return 0;
581 }
582 size = w + 1;
583 for (i = 1; i < size; i++) {
584 if (++p >= end) {
585 GRN_LOG(ctx, GRN_LOG_WARNING,
586 "invalid utf8 string: too short: "
587 "%d byte is required but %d byte is given: <%.*s>",
588 size, i,
589 (int)(end - str), str);
590 return 0;
591 }
592 if (!*p) {
593 GRN_LOG(ctx, GRN_LOG_WARNING,
594 "invalid utf8 string: NULL character is found: <%.*s>",
595 (int)(end - str), str);
596 return 0;
597 }
598 if ((*p & 0xc0) != 0x80) {
599 GRN_LOG(ctx, GRN_LOG_WARNING,
600 "invalid utf8 string: 0x80 is not allowed: <%.*s>: <%.*s>",
601 (int)(end - p), p,
602 (int)(end - str), str);
603 return 0;
604 }
605 }
606 return size;
607 } else {
608 return 1;
609 }
610 return 0;
611}
612
613inline static grn_obj *
614utf8_normalize(grn_ctx *ctx, grn_string *nstr)
615{
616 int16_t *ch;
617 const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e;
618 unsigned char *d, *d_, *de;
619 uint_least8_t *cp;
620 size_t length = 0, ls, lp, size = nstr->original_length_in_bytes, ds = size * 3;
621 int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
622 grn_bool remove_tokenized_delimiter_p =
623 nstr->flags & GRN_STRING_REMOVE_TOKENIZED_DELIMITER;
624 if (!(nstr->normalized = GRN_MALLOC(ds + 1))) {
625 ERR(GRN_NO_MEMORY_AVAILABLE,
626 "[string][utf8] failed to allocate normalized text space");
627 return NULL;
628 }
629 if (nstr->flags & GRN_STRING_WITH_CHECKS) {
630 if (!(nstr->checks = GRN_MALLOC(ds * sizeof(int16_t) + 1))) {
631 GRN_FREE(nstr->normalized);
632 nstr->normalized = NULL;
633 ERR(GRN_NO_MEMORY_AVAILABLE,
634 "[string][utf8] failed to allocate checks space");
635 return NULL;
636 }
637 }
638 ch = nstr->checks;
639 if (nstr->flags & GRN_STRING_WITH_TYPES) {
640 if (!(nstr->ctypes = GRN_MALLOC(ds + 1))) {
641 if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
642 GRN_FREE(nstr->normalized); nstr->normalized = NULL;
643 ERR(GRN_NO_MEMORY_AVAILABLE,
644 "[string][utf8] failed to allocate character types space");
645 return NULL;
646 }
647 }
648 cp = nstr->ctypes;
649 d = (unsigned char *)nstr->normalized;
650 de = d + ds;
651 d_ = NULL;
652 e = (unsigned char *)nstr->original + size;
653 for (s = s_ = (unsigned char *)nstr->original; ; s += ls) {
654 if (!(ls = grn_str_charlen_utf8(ctx, s, e))) {
655 break;
656 }
657 if (remove_tokenized_delimiter_p &&
658 grn_tokenizer_is_tokenized_delimiter(ctx, (const char *)s, ls,
659 GRN_ENC_UTF8)) {
660 continue;
661 }
662 if ((p = (unsigned char *)grn_nfkc_decompose(s))) {
663 pe = p + strlen((char *)p);
664 } else {
665 p = s;
666 pe = p + ls;
667 }
668 if (d_ && (p2 = (unsigned char *)grn_nfkc_compose(d_, p))) {
669 p = p2;
670 pe = p + strlen((char *)p);
671 if (cp) { cp--; }
672 if (ch) {
673 ch -= (d - d_);
674 if (ch[0] >= 0) {
675 s_ = s__;
676 }
677 }
678 d = d_;
679 length--;
680 }
681 for (; ; p += lp) {
682 if (!(lp = grn_str_charlen_utf8(ctx, p, pe))) {
683 break;
684 }
685 if ((*p == ' ' && removeblankp) || *p < 0x20 /* skip unprintable ascii */ ) {
686 if (cp > nstr->ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
687 } else {
688 if (de <= d + lp) {
689 unsigned char *normalized;
690 ds += (ds >> 1) + lp;
691 if (!(normalized = GRN_REALLOC(nstr->normalized, ds + 1))) {
692 if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
693 if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
694 GRN_FREE(nstr->normalized); nstr->normalized = NULL;
695 ERR(GRN_NO_MEMORY_AVAILABLE,
696 "[string][utf8] failed to expand normalized text space");
697 return NULL;
698 }
699 de = normalized + ds;
700 d = normalized + (d - (unsigned char *)nstr->normalized);
701 nstr->normalized = (char *)normalized;
702 if (ch) {
703 int16_t *checks;
704 if (!(checks = GRN_REALLOC(nstr->checks, ds * sizeof(int16_t) + 1))) {
705 if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
706 GRN_FREE(nstr->checks); nstr->checks = NULL;
707 GRN_FREE(nstr->normalized); nstr->normalized = NULL;
708 ERR(GRN_NO_MEMORY_AVAILABLE,
709 "[string][utf8] failed to expand checks space");
710 return NULL;
711 }
712 ch = checks + (ch - nstr->checks);
713 nstr->checks = checks;
714 }
715 if (cp) {
716 uint_least8_t *ctypes;
717 if (!(ctypes = GRN_REALLOC(nstr->ctypes, ds + 1))) {
718 GRN_FREE(nstr->ctypes); nstr->ctypes = NULL;
719 if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
720 GRN_FREE(nstr->normalized); nstr->normalized = NULL;
721 ERR(GRN_NO_MEMORY_AVAILABLE,
722 "[string][utf8] failed to expand character types space");
723 return NULL;
724 }
725 cp = ctypes + (cp - nstr->ctypes);
726 nstr->ctypes = ctypes;
727 }
728 }
729 grn_memcpy(d, p, lp);
730 d_ = d;
731 d += lp;
732 length++;
733 if (cp) { *cp++ = grn_nfkc_char_type(p); }
734 if (ch) {
735 size_t i;
736 if (s_ == s + ls) {
737 *ch++ = -1;
738 } else {
739 *ch++ = (int16_t)(s + ls - s_);
740 s__ = s_;
741 s_ = s + ls;
742 }
743 for (i = lp; i > 1; i--) { *ch++ = 0; }
744 }
745 }
746 }
747 }
748 if (cp) { *cp = GRN_CHAR_NULL; }
749 *d = '\0';
750 nstr->n_characters = length;
751 nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
752 return NULL;
753}
754#endif /* GRN_WITH_NFKC */
755
756inline static grn_obj *
757ascii_normalize(grn_ctx *ctx, grn_string *nstr)
758{
759 int16_t *ch;
760 const unsigned char *s, *s_, *e;
761 unsigned char *d, *d0, *d_;
762 uint_least8_t *cp, *ctypes, ctype;
763 size_t size = nstr->original_length_in_bytes, length = 0;
764 int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
765 if (!(nstr->normalized = GRN_MALLOC(size + 1))) {
766 ERR(GRN_NO_MEMORY_AVAILABLE,
767 "[string][ascii] failed to allocate normalized text space");
768 return NULL;
769 }
770 d0 = (unsigned char *) nstr->normalized;
771 if (nstr->flags & GRN_STRING_WITH_CHECKS) {
772 if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) {
773 GRN_FREE(nstr->normalized);
774 nstr->normalized = NULL;
775 ERR(GRN_NO_MEMORY_AVAILABLE,
776 "[string][ascii] failed to allocate checks space");
777 return NULL;
778 }
779 }
780 ch = nstr->checks;
781 if (nstr->flags & GRN_STRING_WITH_TYPES) {
782 if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
783 GRN_FREE(nstr->checks);
784 GRN_FREE(nstr->normalized);
785 nstr->checks = NULL;
786 nstr->normalized = NULL;
787 ERR(GRN_NO_MEMORY_AVAILABLE,
788 "[string][ascii] failed to allocate character types space");
789 return NULL;
790 }
791 }
792 cp = ctypes = nstr->ctypes;
793 e = (unsigned char *)nstr->original + size;
794 for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
795 unsigned char c = *s;
796 switch (c >> 4) {
797 case 0 :
798 case 1 :
799 /* skip unprintable ascii */
800 if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
801 continue;
802 case 2 :
803 if (c == 0x20) {
804 if (removeblankp) {
805 if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
806 continue;
807 } else {
808 *d = ' ';
809 ctype = GRN_CHAR_BLANK|GRN_CHAR_SYMBOL;
810 }
811 } else {
812 *d = c;
813 ctype = GRN_CHAR_SYMBOL;
814 }
815 break;
816 case 3 :
817 *d = c;
818 ctype = (c <= 0x39) ? GRN_CHAR_DIGIT : GRN_CHAR_SYMBOL;
819 break;
820 case 4 :
821 *d = ('A' <= c) ? c + 0x20 : c;
822 ctype = (c == 0x40) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
823 break;
824 case 5 :
825 *d = (c <= 'Z') ? c + 0x20 : c;
826 ctype = (c <= 0x5a) ? GRN_CHAR_ALPHA : GRN_CHAR_SYMBOL;
827 break;
828 case 6 :
829 *d = c;
830 ctype = (c == 0x60) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
831 break;
832 case 7 :
833 *d = c;
834 ctype = (c <= 0x7a) ? GRN_CHAR_ALPHA : (c == 0x7f ? GRN_CHAR_OTHERS : GRN_CHAR_SYMBOL);
835 break;
836 default :
837 *d = c;
838 ctype = GRN_CHAR_OTHERS;
839 break;
840 }
841 d++;
842 length++;
843 if (cp) { *cp++ = ctype; }
844 if (ch) {
845 *ch++ = (int16_t)(s + 1 - s_);
846 s_ = s + 1;
847 while (++d_ < d) { *ch++ = 0; }
848 }
849 }
850 if (cp) { *cp = GRN_CHAR_NULL; }
851 *d = '\0';
852 nstr->n_characters = length;
853 nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
854 return NULL;
855}
856
857/* use cp1252 as latin1 */
858inline static grn_obj *
859latin1_normalize(grn_ctx *ctx, grn_string *nstr)
860{
861 int16_t *ch;
862 const unsigned char *s, *s_, *e;
863 unsigned char *d, *d0, *d_;
864 uint_least8_t *cp, *ctypes, ctype;
865 size_t size = nstr->original_length_in_bytes, length = 0;
866 int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
867 if (!(nstr->normalized = GRN_MALLOC(size + 1))) {
868 ERR(GRN_NO_MEMORY_AVAILABLE,
869 "[string][latin1] failed to allocate normalized text space");
870 return NULL;
871 }
872 d0 = (unsigned char *) nstr->normalized;
873 if (nstr->flags & GRN_STRING_WITH_CHECKS) {
874 if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) {
875 GRN_FREE(nstr->normalized);
876 nstr->normalized = NULL;
877 ERR(GRN_NO_MEMORY_AVAILABLE,
878 "[string][latin1] failed to allocate checks space");
879 return NULL;
880 }
881 }
882 ch = nstr->checks;
883 if (nstr->flags & GRN_STRING_WITH_TYPES) {
884 if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
885 GRN_FREE(nstr->checks);
886 GRN_FREE(nstr->normalized);
887 nstr->checks = NULL;
888 nstr->normalized = NULL;
889 ERR(GRN_NO_MEMORY_AVAILABLE,
890 "[normalizer][latin1] failed to allocate character types space");
891 return NULL;
892 }
893 }
894 cp = ctypes = nstr->ctypes;
895 e = (unsigned char *)nstr->original + size;
896 for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
897 unsigned char c = *s;
898 switch (c >> 4) {
899 case 0 :
900 case 1 :
901 /* skip unprintable ascii */
902 if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
903 continue;
904 case 2 :
905 if (c == 0x20) {
906 if (removeblankp) {
907 if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
908 continue;
909 } else {
910 *d = ' ';
911 ctype = GRN_CHAR_BLANK|GRN_CHAR_SYMBOL;
912 }
913 } else {
914 *d = c;
915 ctype = GRN_CHAR_SYMBOL;
916 }
917 break;
918 case 3 :
919 *d = c;
920 ctype = (c <= 0x39) ? GRN_CHAR_DIGIT : GRN_CHAR_SYMBOL;
921 break;
922 case 4 :
923 *d = ('A' <= c) ? c + 0x20 : c;
924 ctype = (c == 0x40) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
925 break;
926 case 5 :
927 *d = (c <= 'Z') ? c + 0x20 : c;
928 ctype = (c <= 0x5a) ? GRN_CHAR_ALPHA : GRN_CHAR_SYMBOL;
929 break;
930 case 6 :
931 *d = c;
932 ctype = (c == 0x60) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
933 break;
934 case 7 :
935 *d = c;
936 ctype = (c <= 0x7a) ? GRN_CHAR_ALPHA : (c == 0x7f ? GRN_CHAR_OTHERS : GRN_CHAR_SYMBOL);
937 break;
938 case 8 :
939 if (c == 0x8a || c == 0x8c || c == 0x8e) {
940 *d = c + 0x10;
941 ctype = GRN_CHAR_ALPHA;
942 } else {
943 *d = c;
944 ctype = GRN_CHAR_SYMBOL;
945 }
946 break;
947 case 9 :
948 if (c == 0x9a || c == 0x9c || c == 0x9e || c == 0x9f) {
949 *d = (c == 0x9f) ? c + 0x60 : c;
950 ctype = GRN_CHAR_ALPHA;
951 } else {
952 *d = c;
953 ctype = GRN_CHAR_SYMBOL;
954 }
955 break;
956 case 0x0c :
957 *d = c + 0x20;
958 ctype = GRN_CHAR_ALPHA;
959 break;
960 case 0x0d :
961 *d = (c == 0xd7 || c == 0xdf) ? c : c + 0x20;
962 ctype = (c == 0xd7) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
963 break;
964 case 0x0e :
965 *d = c;
966 ctype = GRN_CHAR_ALPHA;
967 break;
968 case 0x0f :
969 *d = c;
970 ctype = (c == 0xf7) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
971 break;
972 default :
973 *d = c;
974 ctype = GRN_CHAR_OTHERS;
975 break;
976 }
977 d++;
978 length++;
979 if (cp) { *cp++ = ctype; }
980 if (ch) {
981 *ch++ = (int16_t)(s + 1 - s_);
982 s_ = s + 1;
983 while (++d_ < d) { *ch++ = 0; }
984 }
985 }
986 if (cp) { *cp = GRN_CHAR_NULL; }
987 *d = '\0';
988 nstr->n_characters = length;
989 nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
990 return NULL;
991}
992
993inline static grn_obj *
994koi8r_normalize(grn_ctx *ctx, grn_string *nstr)
995{
996 int16_t *ch;
997 const unsigned char *s, *s_, *e;
998 unsigned char *d, *d0, *d_;
999 uint_least8_t *cp, *ctypes, ctype;
1000 size_t size = nstr->original_length_in_bytes, length = 0;
1001 int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
1002 if (!(nstr->normalized = GRN_MALLOC(size + 1))) {
1003 ERR(GRN_NO_MEMORY_AVAILABLE,
1004 "[string][koi8r] failed to allocate normalized text space");
1005 return NULL;
1006 }
1007 d0 = (unsigned char *) nstr->normalized;
1008 if (nstr->flags & GRN_STRING_WITH_CHECKS) {
1009 if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) {
1010 GRN_FREE(nstr->normalized);
1011 nstr->normalized = NULL;
1012 ERR(GRN_NO_MEMORY_AVAILABLE,
1013 "[string][koi8r] failed to allocate checks space");
1014 return NULL;
1015 }
1016 }
1017 ch = nstr->checks;
1018 if (nstr->flags & GRN_STRING_WITH_TYPES) {
1019 if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
1020 GRN_FREE(nstr->checks);
1021 GRN_FREE(nstr->normalized);
1022 nstr->checks = NULL;
1023 nstr->normalized = NULL;
1024 ERR(GRN_NO_MEMORY_AVAILABLE,
1025 "[string][koi8r] failed to allocate character types space");
1026 return NULL;
1027 }
1028 }
1029 cp = ctypes = nstr->ctypes;
1030 e = (unsigned char *)nstr->original + size;
1031 for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
1032 unsigned char c = *s;
1033 switch (c >> 4) {
1034 case 0 :
1035 case 1 :
1036 /* skip unprintable ascii */
1037 if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
1038 continue;
1039 case 2 :
1040 if (c == 0x20) {
1041 if (removeblankp) {
1042 if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
1043 continue;
1044 } else {
1045 *d = ' ';
1046 ctype = GRN_CHAR_BLANK|GRN_CHAR_SYMBOL;
1047 }
1048 } else {
1049 *d = c;
1050 ctype = GRN_CHAR_SYMBOL;
1051 }
1052 break;
1053 case 3 :
1054 *d = c;
1055 ctype = (c <= 0x39) ? GRN_CHAR_DIGIT : GRN_CHAR_SYMBOL;
1056 break;
1057 case 4 :
1058 *d = ('A' <= c) ? c + 0x20 : c;
1059 ctype = (c == 0x40) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
1060 break;
1061 case 5 :
1062 *d = (c <= 'Z') ? c + 0x20 : c;
1063 ctype = (c <= 0x5a) ? GRN_CHAR_ALPHA : GRN_CHAR_SYMBOL;
1064 break;
1065 case 6 :
1066 *d = c;
1067 ctype = (c == 0x60) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
1068 break;
1069 case 7 :
1070 *d = c;
1071 ctype = (c <= 0x7a) ? GRN_CHAR_ALPHA : (c == 0x7f ? GRN_CHAR_OTHERS : GRN_CHAR_SYMBOL);
1072 break;
1073 case 0x0a :
1074 *d = c;
1075 ctype = (c == 0xa3) ? GRN_CHAR_ALPHA : GRN_CHAR_OTHERS;
1076 break;
1077 case 0x0b :
1078 if (c == 0xb3) {
1079 *d = c - 0x10;
1080 ctype = GRN_CHAR_ALPHA;
1081 } else {
1082 *d = c;
1083 ctype = GRN_CHAR_OTHERS;
1084 }
1085 break;
1086 case 0x0c :
1087 case 0x0d :
1088 *d = c;
1089 ctype = GRN_CHAR_ALPHA;
1090 break;
1091 case 0x0e :
1092 case 0x0f :
1093 *d = c - 0x20;
1094 ctype = GRN_CHAR_ALPHA;
1095 break;
1096 default :
1097 *d = c;
1098 ctype = GRN_CHAR_OTHERS;
1099 break;
1100 }
1101 d++;
1102 length++;
1103 if (cp) { *cp++ = ctype; }
1104 if (ch) {
1105 *ch++ = (int16_t)(s + 1 - s_);
1106 s_ = s + 1;
1107 while (++d_ < d) { *ch++ = 0; }
1108 }
1109 }
1110 if (cp) { *cp = GRN_CHAR_NULL; }
1111 *d = '\0';
1112 nstr->n_characters = length;
1113 nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
1114 return NULL;
1115}
1116
1117static grn_obj *
1118auto_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
1119{
1120 grn_string *string = (grn_string *)(args[0]);
1121 switch (string->encoding) {
1122 case GRN_ENC_EUC_JP :
1123 eucjp_normalize(ctx, string);
1124 break;
1125 case GRN_ENC_UTF8 :
1126#ifdef GRN_WITH_NFKC
1127 utf8_normalize(ctx, string);
1128#else /* GRN_WITH_NFKC */
1129 ascii_normalize(ctx, string);
1130#endif /* GRN_WITH_NFKC */
1131 break;
1132 case GRN_ENC_SJIS :
1133 sjis_normalize(ctx, string);
1134 break;
1135 case GRN_ENC_LATIN1 :
1136 latin1_normalize(ctx, string);
1137 break;
1138 case GRN_ENC_KOI8R :
1139 koi8r_normalize(ctx, string);
1140 break;
1141 default :
1142 ascii_normalize(ctx, string);
1143 break;
1144 }
1145 return NULL;
1146}
1147
1148#ifdef GRN_WITH_NFKC
1149static grn_obj *
1150nfkc51_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
1151{
1152 grn_string *string = (grn_string *)(args[0]);
1153 utf8_normalize(ctx, string);
1154 return NULL;
1155}
1156#endif /* GRN_WITH_NFKC */
1157
1158grn_rc
1159grn_normalizer_normalize(grn_ctx *ctx, grn_obj *normalizer, grn_obj *string)
1160{
1161 grn_rc rc;
1162 int nargs = 0;
1163
1164 grn_ctx_push(ctx, string);
1165 nargs++;
1166 rc = grn_proc_call(ctx, normalizer, nargs, NULL);
1167 grn_ctx_pop(ctx);
1168
1169 return rc;
1170}
1171
1172grn_rc
1173grn_db_init_builtin_normalizers(grn_ctx *ctx)
1174{
1175 const char *normalizer_nfkc51_name = "NormalizerNFKC51";
1176
1177 grn_normalizer_register(ctx, GRN_NORMALIZER_AUTO_NAME, -1,
1178 NULL, auto_next, NULL);
1179
1180#ifdef GRN_WITH_NFKC
1181 grn_normalizer_register(ctx, normalizer_nfkc51_name, -1,
1182 NULL, nfkc51_next, NULL);
1183#else /* GRN_WITH_NFKC */
1184 grn_normalizer_register(ctx, normalizer_nfkc51_name, -1,
1185 NULL, NULL, NULL);
1186#endif /* GRN_WITH_NFKC */
1187/*
1188 grn_normalizer_register(ctx, "NormalizerUCA", -1,
1189 NULL, uca_next, NULL);
1190*/
1191
1192 return GRN_SUCCESS;
1193}
1194