1/* Copyright (c) 2000, 2013, Oracle and/or its affiliates.
2 Copyright (c) 2009, 2014, SkySQL Ab.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; version 2 of the License.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
16
17#include "strings_def.h"
18#include <m_ctype.h>
19#include <my_xml.h>
20
21/*
22
  This file implements routines which parse XML based
24 character set and collation description files.
25
26 Unicode collations are encoded according to
27
28 Unicode Technical Standard #35
29 Locale Data Markup Language (LDML)
30 http://www.unicode.org/reports/tr35/
31
32 and converted into ICU string according to
33
34 Collation Customization
35 http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
36
37*/
38
39
/*
  Avoid using my_snprintf
  We cannot use my_snprintf() here, because ctype.o is
  used to build conf_to_src, which must have minimal
  dependencies.

  Any existing my_snprintf macro is dropped first, then the name is
  redefined as a string literal so that an accidental call to
  my_snprintf(...) in this file does not compile.
*/

#undef my_snprintf
#define my_snprintf "We cannot use my_snprintf in this file"
49
50
/*
  Optional stack guard callback; no guard is installed by default.
  Nothing in this part of the file assigns or calls it.
  NOTE(review): presumably installed by the server to check the remaining
  stack before deep recursion - confirm against the callers.
*/
int (*my_string_stack_guard)(int) = NULL;
52
/*
  Copy min(l1, l2) bytes from src into str and NUL-terminate the result.

  @param str  Destination; must have room for min(l1, l2) + 1 bytes
  @param src  Source bytes (need not be NUL-terminated)
  @param l1   Number of bytes available in src
  @param l2   Maximum number of bytes to copy

  @return str
*/
static char *mstr(char *str, const char *src, size_t l1, size_t l2)
{
  size_t ncopy= l1;

  if (l2 <= ncopy)
    ncopy= l2;
  memcpy(str, src, ncopy);
  str[ncopy]= '\0';
  return str;
}
60
/* One entry of the XML path lookup table: a tag path and its state code. */
struct my_cs_file_section_st
{
  int         state;   /* One of the _CS_* codes; 0 in the terminator entry */
  const char *str;     /* Full tag path, e.g. "charsets/charset/name"      */
};
66
/*
  State codes attached to the XML tag paths in the lookup table.
  0 is not used as a code: it means "tag path not found".
*/

/* Character set and simple collation description */
#define _CS_MISC                              1
#define _CS_ID                                2
#define _CS_CSNAME                            3
#define _CS_FAMILY                            4
#define _CS_ORDER                             5
#define _CS_COLNAME                           6
#define _CS_FLAG                              7
#define _CS_CHARSET                           8
#define _CS_COLLATION                         9
#define _CS_UPPERMAP                          10
#define _CS_LOWERMAP                          11
#define _CS_UNIMAP                            12
#define _CS_COLLMAP                           13
#define _CS_CTYPEMAP                          14
#define _CS_PRIMARY_ID                        15
#define _CS_BINARY_ID                         16
#define _CS_CSDESCRIPT                        17


/* Special purpose commands */
#define _CS_UCA_VERSION                       100
#define _CS_CL_SUPPRESS_CONTRACTIONS          101
#define _CS_CL_OPTIMIZE                       102
#define _CS_CL_SHIFT_AFTER_METHOD             103
#define _CS_CL_RULES_IMPORT                   104
#define _CS_CL_RULES_IMPORT_SOURCE            105


/* Collation Settings */
#define _CS_ST_SETTINGS                       200
#define _CS_ST_STRENGTH                       201
#define _CS_ST_ALTERNATE                      202
#define _CS_ST_BACKWARDS                      203
#define _CS_ST_NORMALIZATION                  204
#define _CS_ST_CASE_LEVEL                     205
#define _CS_ST_CASE_FIRST                     206
#define _CS_ST_HIRAGANA_QUATERNARY            207
#define _CS_ST_NUMERIC                        208
#define _CS_ST_VARIABLE_TOP                   209
#define _CS_ST_MATCH_BOUNDARIES               210
#define _CS_ST_MATCH_STYLE                    211


/*
  Rules.
  The five difference codes are consecutive: (code - _CS_DIFF1) is used
  as an index into the format tables.
*/
#define _CS_RULES                             300
#define _CS_RESET                             301
#define _CS_DIFF1                             302
#define _CS_DIFF2                             303
#define _CS_DIFF3                             304
#define _CS_DIFF4                             305
#define _CS_IDENTICAL                         306

/* Rules: Expansions (same consecutive layout, based at _CS_EXP_DIFF1) */
#define _CS_EXP_X                             320
#define _CS_EXP_EXTEND                        321
#define _CS_EXP_DIFF1                         322
#define _CS_EXP_DIFF2                         323
#define _CS_EXP_DIFF3                         324
#define _CS_EXP_DIFF4                         325
#define _CS_EXP_IDENTICAL                     326

/* Rules: Abbreviating Ordering Specifications (based at _CS_A_DIFF1) */
#define _CS_A_DIFF1                           351
#define _CS_A_DIFF2                           352
#define _CS_A_DIFF3                           353
#define _CS_A_DIFF4                           354
#define _CS_A_IDENTICAL                       355

/* Rules: previous context */
#define _CS_CONTEXT                           370

/* Rules: Placing Characters Before Others */
#define _CS_RESET_BEFORE                      380

/* Rules: Logical Reset Positions */
#define _CS_RESET_FIRST_PRIMARY_IGNORABLE     401
#define _CS_RESET_LAST_PRIMARY_IGNORABLE      402
#define _CS_RESET_FIRST_SECONDARY_IGNORABLE   403
#define _CS_RESET_LAST_SECONDARY_IGNORABLE    404
#define _CS_RESET_FIRST_TERTIARY_IGNORABLE    405
#define _CS_RESET_LAST_TERTIARY_IGNORABLE     406
#define _CS_RESET_FIRST_TRAILING              407
#define _CS_RESET_LAST_TRAILING               408
#define _CS_RESET_FIRST_VARIABLE              409
#define _CS_RESET_LAST_VARIABLE               410
#define _CS_RESET_FIRST_NON_IGNORABLE         411
#define _CS_RESET_LAST_NON_IGNORABLE          412
154
155
156
157static const struct my_cs_file_section_st sec[] =
158{
159 {_CS_MISC, "xml"},
160 {_CS_MISC, "xml/version"},
161 {_CS_MISC, "xml/encoding"},
162 {_CS_MISC, "charsets"},
163 {_CS_MISC, "charsets/max-id"},
164 {_CS_MISC, "charsets/copyright"},
165 {_CS_MISC, "charsets/description"},
166 {_CS_CHARSET, "charsets/charset"},
167 {_CS_PRIMARY_ID, "charsets/charset/primary-id"},
168 {_CS_BINARY_ID, "charsets/charset/binary-id"},
169 {_CS_CSNAME, "charsets/charset/name"},
170 {_CS_FAMILY, "charsets/charset/family"},
171 {_CS_CSDESCRIPT, "charsets/charset/description"},
172 {_CS_MISC, "charsets/charset/alias"},
173 {_CS_MISC, "charsets/charset/ctype"},
174 {_CS_CTYPEMAP, "charsets/charset/ctype/map"},
175 {_CS_MISC, "charsets/charset/upper"},
176 {_CS_UPPERMAP, "charsets/charset/upper/map"},
177 {_CS_MISC, "charsets/charset/lower"},
178 {_CS_LOWERMAP, "charsets/charset/lower/map"},
179 {_CS_MISC, "charsets/charset/unicode"},
180 {_CS_UNIMAP, "charsets/charset/unicode/map"},
181 {_CS_COLLATION, "charsets/charset/collation"},
182 {_CS_COLNAME, "charsets/charset/collation/name"},
183 {_CS_ID, "charsets/charset/collation/id"},
184 {_CS_ORDER, "charsets/charset/collation/order"},
185 {_CS_FLAG, "charsets/charset/collation/flag"},
186 {_CS_COLLMAP, "charsets/charset/collation/map"},
187
188 /* Special purpose commands */
189 {_CS_UCA_VERSION, "charsets/charset/collation/version"},
190 {_CS_CL_SUPPRESS_CONTRACTIONS, "charsets/charset/collation/suppress_contractions"},
191 {_CS_CL_OPTIMIZE, "charsets/charset/collation/optimize"},
192 {_CS_CL_SHIFT_AFTER_METHOD, "charsets/charset/collation/shift-after-method"},
193 {_CS_CL_RULES_IMPORT, "charsets/charset/collation/rules/import"},
194 {_CS_CL_RULES_IMPORT_SOURCE, "charsets/charset/collation/rules/import/source"},
195
196 /* Collation Settings */
197 {_CS_ST_SETTINGS, "charsets/charset/collation/settings"},
198 {_CS_ST_STRENGTH, "charsets/charset/collation/settings/strength"},
199 {_CS_ST_ALTERNATE, "charsets/charset/collation/settings/alternate"},
200 {_CS_ST_BACKWARDS, "charsets/charset/collation/settings/backwards"},
201 {_CS_ST_NORMALIZATION, "charsets/charset/collation/settings/normalization"},
202 {_CS_ST_CASE_LEVEL, "charsets/charset/collation/settings/caseLevel"},
203 {_CS_ST_CASE_FIRST, "charsets/charset/collation/settings/caseFirst"},
204 {_CS_ST_HIRAGANA_QUATERNARY, "charsets/charset/collation/settings/hiraganaQuaternary"},
205 {_CS_ST_NUMERIC, "charsets/charset/collation/settings/numeric"},
206 {_CS_ST_VARIABLE_TOP, "charsets/charset/collation/settings/variableTop"},
207 {_CS_ST_MATCH_BOUNDARIES, "charsets/charset/collation/settings/match-boundaries"},
208 {_CS_ST_MATCH_STYLE, "charsets/charset/collation/settings/match-style"},
209
210 /* Rules */
211 {_CS_RULES, "charsets/charset/collation/rules"},
212 {_CS_RESET, "charsets/charset/collation/rules/reset"},
213 {_CS_DIFF1, "charsets/charset/collation/rules/p"},
214 {_CS_DIFF2, "charsets/charset/collation/rules/s"},
215 {_CS_DIFF3, "charsets/charset/collation/rules/t"},
216 {_CS_DIFF4, "charsets/charset/collation/rules/q"},
217 {_CS_IDENTICAL, "charsets/charset/collation/rules/i"},
218
219 /* Rules: expansions */
220 {_CS_EXP_X, "charsets/charset/collation/rules/x"},
221 {_CS_EXP_EXTEND, "charsets/charset/collation/rules/x/extend"},
222 {_CS_EXP_DIFF1, "charsets/charset/collation/rules/x/p"},
223 {_CS_EXP_DIFF2, "charsets/charset/collation/rules/x/s"},
224 {_CS_EXP_DIFF3, "charsets/charset/collation/rules/x/t"},
225 {_CS_EXP_DIFF4, "charsets/charset/collation/rules/x/q"},
226 {_CS_EXP_IDENTICAL, "charsets/charset/collation/rules/x/i"},
227
228 /* Rules: previous context */
229 {_CS_CONTEXT, "charsets/charset/collation/rules/x/context"},
230
231 /* Rules: Abbreviating Ordering Specifications */
232 {_CS_A_DIFF1, "charsets/charset/collation/rules/pc"},
233 {_CS_A_DIFF2, "charsets/charset/collation/rules/sc"},
234 {_CS_A_DIFF3, "charsets/charset/collation/rules/tc"},
235 {_CS_A_DIFF4, "charsets/charset/collation/rules/qc"},
236 {_CS_A_IDENTICAL, "charsets/charset/collation/rules/ic"},
237
238 /* Rules: Placing Characters Before Others*/
239 {_CS_RESET_BEFORE, "charsets/charset/collation/rules/reset/before"},
240
241 /* Rules: Logical Reset Positions */
242 {_CS_RESET_FIRST_NON_IGNORABLE, "charsets/charset/collation/rules/reset/first_non_ignorable"},
243 {_CS_RESET_LAST_NON_IGNORABLE, "charsets/charset/collation/rules/reset/last_non_ignorable"},
244 {_CS_RESET_FIRST_PRIMARY_IGNORABLE, "charsets/charset/collation/rules/reset/first_primary_ignorable"},
245 {_CS_RESET_LAST_PRIMARY_IGNORABLE, "charsets/charset/collation/rules/reset/last_primary_ignorable"},
246 {_CS_RESET_FIRST_SECONDARY_IGNORABLE, "charsets/charset/collation/rules/reset/first_secondary_ignorable"},
247 {_CS_RESET_LAST_SECONDARY_IGNORABLE, "charsets/charset/collation/rules/reset/last_secondary_ignorable"},
248 {_CS_RESET_FIRST_TERTIARY_IGNORABLE, "charsets/charset/collation/rules/reset/first_tertiary_ignorable"},
249 {_CS_RESET_LAST_TERTIARY_IGNORABLE, "charsets/charset/collation/rules/reset/last_tertiary_ignorable"},
250 {_CS_RESET_FIRST_TRAILING, "charsets/charset/collation/rules/reset/first_trailing"},
251 {_CS_RESET_LAST_TRAILING, "charsets/charset/collation/rules/reset/last_trailing"},
252 {_CS_RESET_FIRST_VARIABLE, "charsets/charset/collation/rules/reset/first_variable"},
253 {_CS_RESET_LAST_VARIABLE, "charsets/charset/collation/rules/reset/last_variable"},
254
255 {0, NULL}
256};
257
258static const struct my_cs_file_section_st
259*cs_file_sec(const char *attr, size_t len)
260{
261 const struct my_cs_file_section_st *s;
262 for (s=sec; s->str; s++)
263 {
264 if (!strncmp(attr, s->str, len) && s->str[len] == 0)
265 return s;
266 }
267 return NULL;
268}
269
/* Buffer sizes used by the charset file parser */
#define MY_CS_CSDESCR_SIZE        64          /* Charset description buffer  */
#define MY_CS_TAILORING_SIZE      (32*1024)
#define MY_CS_UCA_VERSION_SIZE    64
#define MY_CS_CONTEXT_SIZE        64          /* Pending rule context buffer */
274
275typedef struct my_cs_file_info
276{
277 char csname[MY_CS_NAME_SIZE];
278 char name[MY_CS_NAME_SIZE];
279 uchar ctype[MY_CS_CTYPE_TABLE_SIZE];
280 uchar to_lower[MY_CS_TO_LOWER_TABLE_SIZE];
281 uchar to_upper[MY_CS_TO_UPPER_TABLE_SIZE];
282 uchar sort_order[MY_CS_SORT_ORDER_TABLE_SIZE];
283 uint16 tab_to_uni[MY_CS_TO_UNI_TABLE_SIZE];
284 char comment[MY_CS_CSDESCR_SIZE];
285 char *tailoring;
286 size_t tailoring_length;
287 size_t tailoring_alloced_length;
288 char context[MY_CS_CONTEXT_SIZE];
289 struct charset_info_st cs;
290 MY_CHARSET_LOADER *loader;
291} MY_CHARSET_FILE;
292
293
294static void
295my_charset_file_reset_charset(MY_CHARSET_FILE *i)
296{
297 memset(&i->cs, 0, sizeof(i->cs));
298}
299
300
301static void
302my_charset_file_reset_collation(MY_CHARSET_FILE *i)
303{
304 i->tailoring_length= 0;
305 i->context[0]= '\0';
306}
307
308
309static void
310my_charset_file_init(MY_CHARSET_FILE *i)
311{
312 my_charset_file_reset_charset(i);
313 my_charset_file_reset_collation(i);
314 i->tailoring= NULL;
315 i->tailoring_alloced_length= 0;
316}
317
318
319static void
320my_charset_file_free(MY_CHARSET_FILE *i)
321{
322 i->loader->free(i->tailoring);
323}
324
325
326static int
327my_charset_file_tailoring_realloc(MY_CHARSET_FILE *i, size_t newlen)
328{
329 if (i->tailoring_alloced_length > newlen ||
330 (i->tailoring= i->loader->realloc(i->tailoring,
331 (i->tailoring_alloced_length=
332 (newlen + 32*1024)))))
333 {
334 return MY_XML_OK;
335 }
336 return MY_XML_ERROR;
337}
338
339
/*
  Parse whitespace-separated hexadecimal numbers into a byte array.

  Parsing stops at the end of the input, or as soon as "size" values
  have been stored, so a[] is never written beyond a[size - 1].

  @param a     Destination array with room for at least "size" elements
  @param size  Number of elements in a[]
  @param str   Input text; the text of each number is handed to strtoul()
  @param len   Length of the input text in bytes

  @return Always 0
*/
static int fill_uchar(uchar *a, uint size, const char *str, size_t len)
{
  uint i= 0;
  const char *s, *b, *e= str + len;

  for (s= str; s < e; i++)
  {
    /* Skip separators, then find the end of the next token */
    for ( ; (s < e) && strchr(" \t\r\n", s[0]); s++) ;
    b= s;
    for ( ; (s < e) && !strchr(" \t\r\n", s[0]); s++) ;
    /* Stop on an empty token (end of input) or when a[] is full */
    if (s == b || i >= size)
      break;
    a[i]= (uchar) strtoul(b, NULL, 16);
  }
  return 0;
}
356
/*
  Parse whitespace-separated hexadecimal numbers into a uint16 array.

  Parsing stops at the end of the input, or as soon as "size" values
  have been stored, so a[] is never written beyond a[size - 1].

  @param a     Destination array with room for at least "size" elements
  @param size  Number of elements in a[]
  @param str   Input text; the text of each number is handed to strtol()
  @param len   Length of the input text in bytes

  @return Always 0
*/
static int fill_uint16(uint16 *a, uint size, const char *str, size_t len)
{
  uint i= 0;
  const char *s, *b, *e= str + len;

  for (s= str; s < e; i++)
  {
    /* Skip separators, then find the end of the next token */
    for ( ; (s < e) && strchr(" \t\r\n", s[0]); s++) ;
    b= s;
    for ( ; (s < e) && !strchr(" \t\r\n", s[0]); s++) ;
    /* Stop on an empty token (end of input) or when a[] is full */
    if (s == b || i >= size)
      break;
    a[i]= (uint16) strtol(b, NULL, 16);
  }
  return 0;
}
373
374
375
376
377static int
378tailoring_append(MY_XML_PARSER *st,
379 const char *fmt, size_t len, const char *attr)
380{
381 struct my_cs_file_info *i= (struct my_cs_file_info *) st->user_data;
382 size_t newlen= i->tailoring_length + len + 64; /* 64 for format */
383 if (MY_XML_OK == my_charset_file_tailoring_realloc(i, newlen))
384 {
385 char *dst= i->tailoring + i->tailoring_length;
386 sprintf(dst, fmt, (int) len, attr);
387 i->tailoring_length+= strlen(dst);
388 return MY_XML_OK;
389 }
390 return MY_XML_ERROR;
391}
392
393
394static int
395tailoring_append2(MY_XML_PARSER *st,
396 const char *fmt,
397 size_t len1, const char *attr1,
398 size_t len2, const char *attr2)
399{
400 struct my_cs_file_info *i= (struct my_cs_file_info *) st->user_data;
401 size_t newlen= i->tailoring_length + len1 + len2 + 64; /* 64 for format */
402 if (MY_XML_OK == my_charset_file_tailoring_realloc(i, newlen))
403 {
404 char *dst= i->tailoring + i->tailoring_length;
405 sprintf(dst, fmt, (int) len1, attr1, (int) len2, attr2);
406 i->tailoring_length+= strlen(dst);
407 return MY_XML_OK;
408 }
409 return MY_XML_ERROR;
410}
411
412
413static size_t
414scan_one_character(const char *s, const char *e, my_wc_t *wc)
415{
416 CHARSET_INFO *cs= &my_charset_utf8_general_ci;
417 if (s >= e)
418 return 0;
419
420 /* Escape sequence: \uXXXX */
421 if (s[0] == '\\' && s + 2 < e && s[1] == 'u' && my_isxdigit(cs, s[2]))
422 {
423 size_t len= 3; /* We have at least one digit */
424 for (s+= 3; s < e && my_isxdigit(cs, s[0]); s++, len++)
425 {
426 }
427 wc[0]= 0;
428 return len;
429 }
430 else if ((int8) s[0] > 0) /* 7-bit character */
431 {
432 wc[0]= 0;
433 return 1;
434 }
435 else /* Non-escaped character */
436 {
437 int rc= cs->cset->mb_wc(cs, wc, (uchar *) s, (uchar *) e);
438 if (rc > 0)
439 return (size_t) rc;
440 }
441 return 0;
442}
443
444
445static int
446tailoring_append_abbreviation(MY_XML_PARSER *st,
447 const char *fmt, size_t len, const char *attr)
448{
449 size_t clen;
450 const char *attrend= attr + len;
451 my_wc_t wc;
452
453 for ( ; (clen= scan_one_character(attr, attrend, &wc)) > 0; attr+= clen)
454 {
455 DBUG_ASSERT(attr < attrend);
456 if (tailoring_append(st, fmt, clen, attr) != MY_XML_OK)
457 return MY_XML_ERROR;
458 }
459 return MY_XML_OK;
460}
461
462
463static int cs_enter(MY_XML_PARSER *st,const char *attr, size_t len)
464{
465 struct my_cs_file_info *i= (struct my_cs_file_info *)st->user_data;
466 const struct my_cs_file_section_st *s= cs_file_sec(attr,len);
467 int state= s ? s->state : 0;
468
469 switch (state) {
470 case 0:
471 i->loader->reporter(WARNING_LEVEL, "Unknown LDML tag: '%.*s'", len, attr);
472 break;
473
474 case _CS_CHARSET:
475 my_charset_file_reset_charset(i);
476 break;
477
478 case _CS_COLLATION:
479 my_charset_file_reset_collation(i);
480 break;
481
482 case _CS_RESET:
483 return tailoring_append(st, " &", 0, NULL);
484
485 default:
486 break;
487 }
488 return MY_XML_OK;
489}
490
491
492static int cs_leave(MY_XML_PARSER *st,const char *attr, size_t len)
493{
494 struct my_cs_file_info *i= (struct my_cs_file_info *)st->user_data;
495 const struct my_cs_file_section_st *s= cs_file_sec(attr,len);
496 int state= s ? s->state : 0;
497 int rc;
498
499 switch(state){
500 case _CS_COLLATION:
501 if (i->tailoring_length)
502 i->cs.tailoring= i->tailoring;
503 rc= i->loader->add_collation ? i->loader->add_collation(&i->cs) : MY_XML_OK;
504 break;
505
506 /* Rules: Logical Reset Positions */
507 case _CS_RESET_FIRST_NON_IGNORABLE:
508 rc= tailoring_append(st, "[first non-ignorable]", 0, NULL);
509 break;
510
511 case _CS_RESET_LAST_NON_IGNORABLE:
512 rc= tailoring_append(st, "[last non-ignorable]", 0, NULL);
513 break;
514
515 case _CS_RESET_FIRST_PRIMARY_IGNORABLE:
516 rc= tailoring_append(st, "[first primary ignorable]", 0, NULL);
517 break;
518
519 case _CS_RESET_LAST_PRIMARY_IGNORABLE:
520 rc= tailoring_append(st, "[last primary ignorable]", 0, NULL);
521 break;
522
523 case _CS_RESET_FIRST_SECONDARY_IGNORABLE:
524 rc= tailoring_append(st, "[first secondary ignorable]", 0, NULL);
525 break;
526
527 case _CS_RESET_LAST_SECONDARY_IGNORABLE:
528 rc= tailoring_append(st, "[last secondary ignorable]", 0, NULL);
529 break;
530
531 case _CS_RESET_FIRST_TERTIARY_IGNORABLE:
532 rc= tailoring_append(st, "[first tertiary ignorable]", 0, NULL);
533 break;
534
535 case _CS_RESET_LAST_TERTIARY_IGNORABLE:
536 rc= tailoring_append(st, "[last tertiary ignorable]", 0, NULL);
537 break;
538
539 case _CS_RESET_FIRST_TRAILING:
540 rc= tailoring_append(st, "[first trailing]", 0, NULL);
541 break;
542
543 case _CS_RESET_LAST_TRAILING:
544 rc= tailoring_append(st, "[last trailing]", 0, NULL);
545 break;
546
547 case _CS_RESET_FIRST_VARIABLE:
548 rc= tailoring_append(st, "[first variable]", 0, NULL);
549 break;
550
551 case _CS_RESET_LAST_VARIABLE:
552 rc= tailoring_append(st, "[last variable]", 0, NULL);
553 break;
554
555 default:
556 rc=MY_XML_OK;
557 }
558 return rc;
559}
560
561
/*
  Formats for the five difference levels, indexed by
  (state - first state of the group): primary, secondary, tertiary,
  quaternary, identical.
*/
static const char *diff_fmt[5]=
{
  "<%.*s",      /* primary    */
  "<<%.*s",     /* secondary  */
  "<<<%.*s",    /* tertiary   */
  "<<<<%.*s",   /* quaternary */
  "=%.*s"       /* identical  */
};
570
571
/*
  Same as diff_fmt, for rules with a previous context: the first
  "%.*s" receives the context, the second one the character itself.
*/
static const char *context_diff_fmt[5]=
{
  "<%.*s|%.*s",      /* primary    */
  "<<%.*s|%.*s",     /* secondary  */
  "<<<%.*s|%.*s",    /* tertiary   */
  "<<<<%.*s|%.*s",   /* quaternary */
  "=%.*s|%.*s"       /* identical  */
};
580
581
582static int cs_value(MY_XML_PARSER *st,const char *attr, size_t len)
583{
584 struct my_cs_file_info *i= (struct my_cs_file_info *)st->user_data;
585 const struct my_cs_file_section_st *s;
586 int state= (int)((s= cs_file_sec(st->attr.start,
587 st->attr.end - st->attr.start)) ?
588 s->state : 0);
589 int rc= MY_XML_OK;
590
591 switch (state) {
592 case _CS_MISC:
593 case _CS_FAMILY:
594 case _CS_ORDER:
595 break;
596 case _CS_ID:
597 i->cs.number= strtol(attr,(char**)NULL,10);
598 break;
599 case _CS_BINARY_ID:
600 i->cs.binary_number= strtol(attr,(char**)NULL,10);
601 break;
602 case _CS_PRIMARY_ID:
603 i->cs.primary_number= strtol(attr,(char**)NULL,10);
604 break;
605 case _CS_COLNAME:
606 i->cs.name=mstr(i->name,attr,len,MY_CS_NAME_SIZE-1);
607 break;
608 case _CS_CSNAME:
609 i->cs.csname=mstr(i->csname,attr,len,MY_CS_NAME_SIZE-1);
610 break;
611 case _CS_CSDESCRIPT:
612 i->cs.comment=mstr(i->comment,attr,len,MY_CS_CSDESCR_SIZE-1);
613 break;
614 case _CS_FLAG:
615 if (!strncmp("primary",attr,len))
616 i->cs.state|= MY_CS_PRIMARY;
617 else if (!strncmp("binary",attr,len))
618 i->cs.state|= MY_CS_BINSORT;
619 else if (!strncmp("compiled",attr,len))
620 i->cs.state|= MY_CS_COMPILED;
621 else if (!strncmp("nopad",attr,len))
622 i->cs.state|= MY_CS_NOPAD;
623 break;
624 case _CS_UPPERMAP:
625 fill_uchar(i->to_upper,MY_CS_TO_UPPER_TABLE_SIZE,attr,len);
626 i->cs.to_upper=i->to_upper;
627 break;
628 case _CS_LOWERMAP:
629 fill_uchar(i->to_lower,MY_CS_TO_LOWER_TABLE_SIZE,attr,len);
630 i->cs.to_lower=i->to_lower;
631 break;
632 case _CS_UNIMAP:
633 fill_uint16(i->tab_to_uni,MY_CS_TO_UNI_TABLE_SIZE,attr,len);
634 i->cs.tab_to_uni=i->tab_to_uni;
635 break;
636 case _CS_COLLMAP:
637 fill_uchar(i->sort_order,MY_CS_SORT_ORDER_TABLE_SIZE,attr,len);
638 i->cs.sort_order=i->sort_order;
639 break;
640 case _CS_CTYPEMAP:
641 fill_uchar(i->ctype,MY_CS_CTYPE_TABLE_SIZE,attr,len);
642 i->cs.ctype=i->ctype;
643 break;
644
645 /* Special purpose commands */
646 case _CS_UCA_VERSION:
647 rc= tailoring_append(st, "[version %.*s]", len, attr);
648 break;
649
650 case _CS_CL_RULES_IMPORT_SOURCE:
651 rc= tailoring_append(st, "[import %.*s]", len, attr);
652 break;
653
654 case _CS_CL_SUPPRESS_CONTRACTIONS:
655 rc= tailoring_append(st, "[suppress contractions %.*s]", len, attr);
656 break;
657
658 case _CS_CL_OPTIMIZE:
659 rc= tailoring_append(st, "[optimize %.*s]", len, attr);
660 break;
661
662 case _CS_CL_SHIFT_AFTER_METHOD:
663 rc= tailoring_append(st, "[shift-after-method %.*s]", len, attr);
664 break;
665
666 /* Collation Settings */
667 case _CS_ST_STRENGTH:
668 /* 1, 2, 3, 4, 5, or primary, secondary, tertiary, quaternary, identical */
669 rc= tailoring_append(st, "[strength %.*s]", len, attr);
670 if (len && attr[0] >= '1' && attr[0] <= '9')
671 i->cs.levels_for_order= attr[0] - '0';
672 break;
673
674 case _CS_ST_ALTERNATE:
675 /* non-ignorable, shifted */
676 rc= tailoring_append(st, "[alternate %.*s]", len, attr);
677 break;
678
679 case _CS_ST_BACKWARDS:
680 /* on, off, 2 */
681 rc= tailoring_append(st, "[backwards %.*s]", len, attr);
682 break;
683
684 case _CS_ST_NORMALIZATION:
685 /*
686 TODO for WL#896: check collations for normalization: vi.xml
687 We want precomposed characters work well at this point.
688 */
689 /* on, off */
690 rc= tailoring_append(st, "[normalization %.*s]", len, attr);
691 break;
692
693 case _CS_ST_CASE_LEVEL:
694 /* on, off */
695 rc= tailoring_append(st, "[caseLevel %.*s]", len, attr);
696 break;
697
698 case _CS_ST_CASE_FIRST:
699 /* upper, lower, off */
700 rc= tailoring_append(st, "[caseFirst %.*s]", len, attr);
701 break;
702
703 case _CS_ST_HIRAGANA_QUATERNARY:
704 /* on, off */
705 rc= tailoring_append(st, "[hiraganaQ %.*s]", len, attr);
706 break;
707
708 case _CS_ST_NUMERIC:
709 /* on, off */
710 rc= tailoring_append(st, "[numeric %.*s]", len, attr);
711 break;
712
713 case _CS_ST_VARIABLE_TOP:
714 /* TODO for WL#896: check value format */
715 rc= tailoring_append(st, "[variableTop %.*s]", len, attr);
716 break;
717
718 case _CS_ST_MATCH_BOUNDARIES:
719 /* none, whole-character, whole-word */
720 rc= tailoring_append(st, "[match-boundaries %.*s]", len, attr);
721 break;
722
723 case _CS_ST_MATCH_STYLE:
724 /* minimal, medial, maximal */
725 rc= tailoring_append(st, "[match-style %.*s]", len, attr);
726 break;
727
728
729 /* Rules */
730 case _CS_RESET:
731 rc= tailoring_append(st, "%.*s", len, attr);
732 break;
733
734 case _CS_DIFF1:
735 case _CS_DIFF2:
736 case _CS_DIFF3:
737 case _CS_DIFF4:
738 case _CS_IDENTICAL:
739 rc= tailoring_append(st, diff_fmt[state - _CS_DIFF1], len, attr);
740 break;
741
742
743 /* Rules: Expansion */
744 case _CS_EXP_EXTEND:
745 rc= tailoring_append(st, " / %.*s", len, attr);
746 break;
747
748 case _CS_EXP_DIFF1:
749 case _CS_EXP_DIFF2:
750 case _CS_EXP_DIFF3:
751 case _CS_EXP_DIFF4:
752 case _CS_EXP_IDENTICAL:
753 if (i->context[0])
754 {
755 rc= tailoring_append2(st, context_diff_fmt[state - _CS_EXP_DIFF1],
756 strlen(i->context), i->context, len, attr);
757 i->context[0]= 0;
758 }
759 else
760 rc= tailoring_append(st, diff_fmt[state - _CS_EXP_DIFF1], len, attr);
761 break;
762
763 /* Rules: Context */
764 case _CS_CONTEXT:
765 if (len < sizeof(i->context))
766 {
767 memcpy(i->context, attr, len);
768 i->context[len]= '\0';
769 }
770 break;
771
772 /* Rules: Abbreviating Ordering Specifications */
773 case _CS_A_DIFF1:
774 case _CS_A_DIFF2:
775 case _CS_A_DIFF3:
776 case _CS_A_DIFF4:
777 case _CS_A_IDENTICAL:
778 rc= tailoring_append_abbreviation(st, diff_fmt[state - _CS_A_DIFF1], len, attr);
779 break;
780
781 /* Rules: Placing Characters Before Others */
782 case _CS_RESET_BEFORE:
783 /*
784 TODO for WL#896: Add this check into text customization parser:
785 It is an error if the strength of the before relation is not identical
786 to the relation after the reset. We'll need this for WL#896.
787 */
788 rc= tailoring_append(st, "[before %.*s]", len, attr);
789 break;
790
791
792 default:
793 break;
794 }
795
796 return rc;
797}
798
799
800my_bool
801my_parse_charset_xml(MY_CHARSET_LOADER *loader, const char *buf, size_t len)
802{
803 MY_XML_PARSER p;
804 struct my_cs_file_info info;
805 my_bool rc;
806
807 my_charset_file_init(&info);
808 my_xml_parser_create(&p);
809 my_xml_set_enter_handler(&p,cs_enter);
810 my_xml_set_value_handler(&p,cs_value);
811 my_xml_set_leave_handler(&p,cs_leave);
812 info.loader= loader;
813 my_xml_set_user_data(&p, (void *) &info);
814 rc= (my_xml_parse(&p,buf,len) == MY_XML_OK) ? FALSE : TRUE;
815 my_xml_parser_free(&p);
816 my_charset_file_free(&info);
817 if (rc != MY_XML_OK)
818 {
819 const char *errstr= my_xml_error_string(&p);
820 if (sizeof(loader->error) > 32 + strlen(errstr))
821 {
822 /* We cannot use my_snprintf() here. See previous comment. */
823 sprintf(loader->error, "at line %d pos %d: %s",
824 my_xml_error_lineno(&p)+1,
825 (int) my_xml_error_pos(&p),
826 my_xml_error_string(&p));
827 }
828 }
829 return rc;
830}
831
832
833uint
834my_string_repertoire_8bit(CHARSET_INFO *cs, const char *str, size_t length)
835{
836 const char *strend;
837 if ((cs->state & MY_CS_NONASCII) && length > 0)
838 return MY_REPERTOIRE_UNICODE30;
839 for (strend= str + length; str < strend; str++)
840 {
841 if (((uchar) *str) > 0x7F)
842 return MY_REPERTOIRE_UNICODE30;
843 }
844 return MY_REPERTOIRE_ASCII;
845}
846
847
848static void
849my_string_metadata_init(MY_STRING_METADATA *metadata)
850{
851 metadata->repertoire= MY_REPERTOIRE_ASCII;
852 metadata->char_length= 0;
853}
854
855
856/**
857 This should probably eventually go as a virtual function into
858 MY_CHARSET_HANDLER or MY_COLLATION_HANDLER.
859*/
860static void
861my_string_metadata_get_mb(MY_STRING_METADATA *metadata,
862 CHARSET_INFO *cs, const char *str, ulong length)
863{
864 const char *strend= str + length;
865 for (my_string_metadata_init(metadata) ;
866 str < strend;
867 metadata->char_length++)
868 {
869 my_wc_t wc;
870 int mblen= cs->cset->mb_wc(cs, &wc, (const uchar *) str,
871 (const uchar *) strend);
872 if (mblen > 0) /* Assigned character */
873 {
874 if (wc > 0x7F)
875 metadata->repertoire|= MY_REPERTOIRE_EXTENDED;
876 str+= mblen;
877 }
878 else if (mblen == MY_CS_ILSEQ) /* Bad byte sequence */
879 {
880 metadata->repertoire|= MY_REPERTOIRE_EXTENDED;
881 str++;
882 }
883 else if (mblen > MY_CS_TOOSMALL) /* Unassigned character */
884 {
885 metadata->repertoire|= MY_REPERTOIRE_EXTENDED;
886 str+= (-mblen);
887 }
888 else /* Incomplete character, premature end-of-line */
889 {
890 metadata->repertoire|= MY_REPERTOIRE_EXTENDED; /* Just in case */
891 break;
892 }
893 }
894}
895
896
897/**
898 Collect string metadata: length in characters and repertoire.
899*/
900void
901my_string_metadata_get(MY_STRING_METADATA *metadata,
902 CHARSET_INFO *cs, const char *str, size_t length)
903{
904 if (cs->mbmaxlen == 1 && !(cs->state & MY_CS_NONASCII))
905 {
906 metadata->char_length= length;
907 metadata->repertoire= my_string_repertoire_8bit(cs, str, (ulong)length);
908 }
909 else
910 {
911 my_string_metadata_get_mb(metadata, cs, str, (ulong)length);
912 }
913}
914
915
916/*
917 Check repertoire: detect pure ascii strings
918*/
919uint
920my_string_repertoire(CHARSET_INFO *cs, const char *str, size_t length)
921{
922 if (cs->mbminlen == 1 && !(cs->state & MY_CS_NONASCII))
923 {
924 return my_string_repertoire_8bit(cs, str, length);
925 }
926 else
927 {
928 const char *strend= str + length;
929 my_wc_t wc;
930 int chlen;
931 for (;
932 (chlen= cs->cset->mb_wc(cs, &wc, (uchar*) str, (uchar*) strend)) > 0;
933 str+= chlen)
934 {
935 if (wc > 0x7F)
936 return MY_REPERTOIRE_UNICODE30;
937 }
938 }
939 return MY_REPERTOIRE_ASCII;
940}
941
942
943/*
944 Returns repertoire for charset
945*/
946uint my_charset_repertoire(CHARSET_INFO *cs)
947{
948 return cs->state & MY_CS_PUREASCII ?
949 MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30;
950}
951
952
953/*
954 Detect whether a character set is ASCII compatible.
955
956 Returns TRUE for:
957
958 - all 8bit character sets whose Unicode mapping of 0x7B is '{'
959 (ignores swe7 which maps 0x7B to "LATIN LETTER A WITH DIAERESIS")
960
961 - all multi-byte character sets having mbminlen == 1
962 (ignores ucs2 whose mbminlen is 2)
963
964 TODO:
965
966 When merging to 5.2, this function should be changed
967 to check a new flag MY_CS_NONASCII,
968
969 return (cs->flag & MY_CS_NONASCII) ? 0 : 1;
970
971 This flag was previously added into 5.2 under terms
972 of WL#3759 "Optimize identifier conversion in client-server protocol"
973 especially to mark character sets not compatible with ASCII.
974
975 We won't backport this flag to 5.0 or 5.1.
976 This function is Ok for 5.0 and 5.1, because we're not going
977 to introduce new tricky character sets between 5.0 and 5.2.
978*/
979my_bool
980my_charset_is_ascii_based(CHARSET_INFO *cs)
981{
982 return
983 (cs->mbmaxlen == 1 && cs->tab_to_uni && cs->tab_to_uni['{'] == '{') ||
984 (cs->mbminlen == 1 && cs->mbmaxlen > 1);
985}
986
987
988/*
989 Convert a string between two character sets.
990 'to' must be large enough to store (form_length * to_cs->mbmaxlen) bytes.
991
992 @param to[OUT] Store result here
993 @param to_length Size of "to" buffer
994 @param to_cs Character set of result string
995 @param from Copy from here
996 @param from_length Length of the "from" string
997 @param from_cs Character set of the "from" string
998 @param errors[OUT] Number of conversion errors
999
1000 @return Number of bytes copied to 'to' string
1001*/
1002
1003uint32
1004my_convert_using_func(char *to, size_t to_length,
1005 CHARSET_INFO *to_cs, my_charset_conv_wc_mb wc_mb,
1006 const char *from, size_t from_length,
1007 CHARSET_INFO *from_cs, my_charset_conv_mb_wc mb_wc,
1008 uint *errors)
1009{
1010 int cnvres;
1011 my_wc_t wc;
1012 const uchar *from_end= (const uchar*) from + from_length;
1013 char *to_start= to;
1014 uchar *to_end= (uchar*) to + to_length;
1015 uint error_count= 0;
1016
1017 while (1)
1018 {
1019 if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0)
1020 from+= cnvres;
1021 else if (cnvres == MY_CS_ILSEQ)
1022 {
1023 error_count++;
1024 from++;
1025 wc= '?';
1026 }
1027 else if (cnvres > MY_CS_TOOSMALL)
1028 {
1029 /*
1030 A correct multibyte sequence detected
1031 But it doesn't have Unicode mapping.
1032 */
1033 error_count++;
1034 from+= (-cnvres);
1035 wc= '?';
1036 }
1037 else
1038 {
1039 if ((uchar *) from >= from_end)
1040 break; /* End of line */
1041 /* Incomplete byte sequence */
1042 error_count++;
1043 from++;
1044 wc= '?';
1045 }
1046
1047outp:
1048 if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
1049 to+= cnvres;
1050 else if (cnvres == MY_CS_ILUNI && wc != '?')
1051 {
1052 error_count++;
1053 wc= '?';
1054 goto outp;
1055 }
1056 else
1057 break;
1058 }
1059 *errors= error_count;
1060 return (uint32) (to - to_start);
1061}
1062
1063
1064/*
1065 Convert a string between two character sets.
1066 Optimized for quick copying of ASCII characters in the range 0x00..0x7F.
1067 'to' must be large enough to store (form_length * to_cs->mbmaxlen) bytes.
1068
1069 @param to[OUT] Store result here
1070 @param to_length Size of "to" buffer
1071 @param to_cs Character set of result string
1072 @param from Copy from here
1073 @param from_length Length of the "from" string
1074 @param from_cs Character set of the "from" string
1075 @param errors[OUT] Number of conversion errors
1076
1077 @return Number of bytes copied to 'to' string
1078*/
1079
1080uint32
1081my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
1082 const char *from, uint32 from_length,
1083 CHARSET_INFO *from_cs, uint *errors)
1084{
1085 uint32 length, length2;
1086 /*
1087 If any of the character sets is not ASCII compatible,
1088 immediately switch to slow mb_wc->wc_mb method.
1089 */
1090 if ((to_cs->state | from_cs->state) & MY_CS_NONASCII)
1091 return my_convert_using_func(to, to_length,
1092 to_cs, to_cs->cset->wc_mb,
1093 from, from_length,
1094 from_cs, from_cs->cset->mb_wc,
1095 errors);
1096
1097 length= length2= MY_MIN(to_length, from_length);
1098
1099#if defined(__i386__) || defined(__x86_64__)
1100 /*
1101 Special loop for i386, it allows to refer to a
1102 non-aligned memory block as UINT32, which makes
1103 it possible to copy four bytes at once. This
1104 gives about 10% performance improvement comparing
1105 to byte-by-byte loop.
1106 */
1107 for ( ; length >= 4; length-= 4, from+= 4, to+= 4)
1108 {
1109 if ((*(uint32*)from) & 0x80808080)
1110 break;
1111 *((uint32*) to)= *((const uint32*) from);
1112 }
1113#endif /* __i386__ */
1114
1115 for (; ; *to++= *from++, length--)
1116 {
1117 if (!length)
1118 {
1119 *errors= 0;
1120 return length2;
1121 }
1122 if (*((unsigned char*) from) > 0x7F) /* A non-ASCII character */
1123 {
1124 uint32 copied_length= length2 - length;
1125 to_length-= copied_length;
1126 from_length-= copied_length;
1127 return copied_length + my_convert_using_func(to, to_length, to_cs,
1128 to_cs->cset->wc_mb,
1129 from, from_length, from_cs,
1130 from_cs->cset->mb_wc,
1131 errors);
1132 }
1133 }
1134
1135 DBUG_ASSERT(FALSE); // Should never get to here
1136 return 0; // Make compiler happy
1137}
1138
1139
1140size_t
1141my_convert_fix(CHARSET_INFO *to_cs, char *to, size_t to_length,
1142 CHARSET_INFO *from_cs, const char *from, size_t from_length,
1143 size_t nchars,
1144 MY_STRCOPY_STATUS *copy_status,
1145 MY_STRCONV_STATUS *conv_status)
1146{
1147 int cnvres;
1148 my_wc_t wc;
1149 my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
1150 my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
1151 const uchar *from_end= (const uchar*) from + from_length;
1152 uchar *to_end= (uchar*) to + to_length;
1153 char *to_start= to;
1154
1155 DBUG_ASSERT(to_cs != &my_charset_bin);
1156 DBUG_ASSERT(from_cs != &my_charset_bin);
1157
1158 copy_status->m_well_formed_error_pos= NULL;
1159 conv_status->m_cannot_convert_error_pos= NULL;
1160
1161 for ( ; nchars; nchars--)
1162 {
1163 const char *from_prev= from;
1164 if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0)
1165 from+= cnvres;
1166 else if (cnvres == MY_CS_ILSEQ)
1167 {
1168 if (!copy_status->m_well_formed_error_pos)
1169 copy_status->m_well_formed_error_pos= from;
1170 from++;
1171 wc= '?';
1172 }
1173 else if (cnvres > MY_CS_TOOSMALL)
1174 {
1175 /*
1176 A correct multibyte sequence detected
1177 But it doesn't have Unicode mapping.
1178 */
1179 if (!conv_status->m_cannot_convert_error_pos)
1180 conv_status->m_cannot_convert_error_pos= from;
1181 from+= (-cnvres);
1182 wc= '?';
1183 }
1184 else
1185 {
1186 if ((uchar *) from >= from_end)
1187 break; // End of line
1188 // Incomplete byte sequence
1189 if (!copy_status->m_well_formed_error_pos)
1190 copy_status->m_well_formed_error_pos= from;
1191 from++;
1192 wc= '?';
1193 }
1194outp:
1195 if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
1196 to+= cnvres;
1197 else if (cnvres == MY_CS_ILUNI && wc != '?')
1198 {
1199 if (!conv_status->m_cannot_convert_error_pos)
1200 conv_status->m_cannot_convert_error_pos= from_prev;
1201 wc= '?';
1202 goto outp;
1203 }
1204 else
1205 {
1206 from= from_prev;
1207 break;
1208 }
1209 }
1210 copy_status->m_source_end_pos= from;
1211 return to - to_start;
1212}
1213