| 1 | /* Copyright (c) 2000, 2013, Oracle and/or its affiliates. | 
| 2 |    Copyright (c) 2009, 2014, SkySQL Ab. | 
| 3 |  | 
| 4 |    This program is free software; you can redistribute it and/or modify | 
| 5 |    it under the terms of the GNU General Public License as published by | 
| 6 |    the Free Software Foundation; version 2 of the License. | 
| 7 |  | 
| 8 |    This program is distributed in the hope that it will be useful, | 
| 9 |    but WITHOUT ANY WARRANTY; without even the implied warranty of | 
| 10 |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
| 11 |    GNU General Public License for more details. | 
| 12 |  | 
| 13 |    You should have received a copy of the GNU General Public License | 
| 14 |    along with this program; if not, write to the Free Software | 
| 15 |    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */ | 
| 16 |  | 
| 17 | #include "strings_def.h" | 
| 18 | #include <m_ctype.h> | 
| 19 | #include <my_xml.h> | 
| 20 |  | 
/*

  This file implements routines that parse XML-based
  character set and collation description files.

  Unicode collations are encoded according to

    Unicode Technical Standard #35
    Locale Data Markup Language (LDML)
    http://www.unicode.org/reports/tr35/

  and converted into an ICU rule string according to

    Collation Customization
    http://oss.software.ibm.com/icu/userguide/Collate_Customization.html

*/
| 38 |  | 
| 39 |  | 
/*
  Avoid using my_snprintf
  We cannot use my_snprintf() here, because ctype.o is
  used to build conf_to_src, which must have minimal
  dependencies.

  Any existing macro definition of the name is dropped and the name is
  redefined as a string literal, so that an accidental call to
  my_snprintf() in this file fails to compile.
*/

#undef my_snprintf
#define my_snprintf "We cannot use my_snprintf in this file"
| 49 |  | 
| 50 |  | 
/*
  Global function pointer taking an int and returning an int.
  It is initialised to NULL, i.e. no guard function is installed.
  NOTE(review): presumably set by the embedding program to check the
  remaining stack space - confirm against the callers.
*/
int (*my_string_stack_guard)(int) = NULL;
| 52 |  | 
/*
  Copy the shorter of l1 and l2 bytes from src into str and
  NUL-terminate the result.

  @param str  destination; must have room for min(l1,l2) + 1 bytes
  @param src  source bytes (need not be NUL-terminated)
  @param l1   number of bytes available in src
  @param l2   maximum number of bytes to copy
  @return     str
*/
static char *mstr(char *str,const char *src,size_t l1,size_t l2)
{
  size_t ncopy= l2;

  if (l1 < l2)
    ncopy= l1;
  memcpy(str, src, ncopy);
  str[ncopy]= '\0';
  return str;
}
| 60 |  | 
/* One entry of the table that maps an XML path to a parser state. */
struct my_cs_file_section_st
{
  int state;         /* one of the _CS_xxx identifiers */
  const char *str;   /* XML path, e.g. "charsets/charset/name" */
};
| 66 |  | 
/*
  Parser state identifiers.

  Each XML path listed in sec[] is mapped to one of these values; the
  enter/value/leave handlers then switch on the value.  0 is not used
  as an identifier because the handlers use it for "unknown section".
*/

/* Character set and collation description */
#define _CS_MISC	1
#define _CS_ID		2
#define _CS_CSNAME	3
#define _CS_FAMILY	4
#define _CS_ORDER	5
#define _CS_COLNAME	6
#define _CS_FLAG	7
#define _CS_CHARSET	8
#define _CS_COLLATION	9
#define _CS_UPPERMAP	10
#define _CS_LOWERMAP	11
#define _CS_UNIMAP	12
#define _CS_COLLMAP	13
#define _CS_CTYPEMAP	14
#define _CS_PRIMARY_ID	15
#define _CS_BINARY_ID	16
#define _CS_CSDESCRIPT	17


/* Special purpose commands */
#define _CS_UCA_VERSION                 100
#define _CS_CL_SUPPRESS_CONTRACTIONS    101
#define _CS_CL_OPTIMIZE                 102
#define _CS_CL_SHIFT_AFTER_METHOD       103
#define _CS_CL_RULES_IMPORT             104
#define _CS_CL_RULES_IMPORT_SOURCE      105


/* Collation Settings */
#define _CS_ST_SETTINGS                 200
#define _CS_ST_STRENGTH                 201
#define _CS_ST_ALTERNATE                202
#define _CS_ST_BACKWARDS                203
#define _CS_ST_NORMALIZATION            204
#define _CS_ST_CASE_LEVEL               205
#define _CS_ST_CASE_FIRST               206
#define _CS_ST_HIRAGANA_QUATERNARY      207
#define _CS_ST_NUMERIC                  208
#define _CS_ST_VARIABLE_TOP             209
#define _CS_ST_MATCH_BOUNDARIES         210
#define _CS_ST_MATCH_STYLE              211


/*
  Rules.
  _CS_DIFF1 .. _CS_IDENTICAL must stay consecutive and in this order:
  cs_value() uses (state - _CS_DIFF1) as an index into diff_fmt[].
*/
#define _CS_RULES                       300
#define _CS_RESET                       301
#define _CS_DIFF1                       302
#define _CS_DIFF2                       303
#define _CS_DIFF3                       304
#define _CS_DIFF4                       305
#define _CS_IDENTICAL                   306

/*
  Rules: Expansions.
  _CS_EXP_DIFF1 .. _CS_EXP_IDENTICAL must stay consecutive: cs_value()
  uses (state - _CS_EXP_DIFF1) as an index into the format tables.
*/
#define _CS_EXP_X                       320
#define _CS_EXP_EXTEND                  321
#define _CS_EXP_DIFF1                   322
#define _CS_EXP_DIFF2                   323
#define _CS_EXP_DIFF3                   324
#define _CS_EXP_DIFF4                   325
#define _CS_EXP_IDENTICAL               326

/*
  Rules: Abbreviating Ordering Specifications.
  Consecutive for the same reason: (state - _CS_A_DIFF1) indexes diff_fmt[].
*/
#define _CS_A_DIFF1                     351
#define _CS_A_DIFF2                     352
#define _CS_A_DIFF3                     353
#define _CS_A_DIFF4                     354
#define _CS_A_IDENTICAL                 355

/* Rules: previous context */
#define _CS_CONTEXT                     370

/* Rules: Placing Characters Before Others*/
#define _CS_RESET_BEFORE 380

/* Rules: Logical Reset Positions */
#define _CS_RESET_FIRST_PRIMARY_IGNORABLE     401
#define _CS_RESET_LAST_PRIMARY_IGNORABLE      402
#define _CS_RESET_FIRST_SECONDARY_IGNORABLE   403
#define _CS_RESET_LAST_SECONDARY_IGNORABLE    404
#define _CS_RESET_FIRST_TERTIARY_IGNORABLE    405
#define _CS_RESET_LAST_TERTIARY_IGNORABLE     406
#define _CS_RESET_FIRST_TRAILING              407
#define _CS_RESET_LAST_TRAILING               408
#define _CS_RESET_FIRST_VARIABLE              409
#define _CS_RESET_LAST_VARIABLE               410
#define _CS_RESET_FIRST_NON_IGNORABLE         411
#define _CS_RESET_LAST_NON_IGNORABLE          412
| 154 |  | 
| 155 |  | 
| 156 |  | 
/*
  Lookup table from XML path to parser state, searched linearly by
  cs_file_sec().  Each path is the '/'-separated chain of tag and
  attribute names from the document root.  The table is terminated by
  an entry whose str is NULL.
*/
static const struct my_cs_file_section_st sec[] =
{
  {_CS_MISC,		"xml" },
  {_CS_MISC,		"xml/version" },
  {_CS_MISC,		"xml/encoding" },
  {_CS_MISC,		"charsets" },
  {_CS_MISC,		"charsets/max-id" },
  {_CS_MISC,		"charsets/copyright" },
  {_CS_MISC,		"charsets/description" },
  {_CS_CHARSET,		"charsets/charset" },
  {_CS_PRIMARY_ID,	"charsets/charset/primary-id" },
  {_CS_BINARY_ID,	"charsets/charset/binary-id" },
  {_CS_CSNAME,		"charsets/charset/name" },
  {_CS_FAMILY,		"charsets/charset/family" },
  {_CS_CSDESCRIPT,	"charsets/charset/description" },
  {_CS_MISC,		"charsets/charset/alias" },
  {_CS_MISC,		"charsets/charset/ctype" },
  {_CS_CTYPEMAP,	"charsets/charset/ctype/map" },
  {_CS_MISC,		"charsets/charset/upper" },
  {_CS_UPPERMAP,	"charsets/charset/upper/map" },
  {_CS_MISC,		"charsets/charset/lower" },
  {_CS_LOWERMAP,	"charsets/charset/lower/map" },
  {_CS_MISC,		"charsets/charset/unicode" },
  {_CS_UNIMAP,		"charsets/charset/unicode/map" },
  {_CS_COLLATION,	"charsets/charset/collation" },
  {_CS_COLNAME,		"charsets/charset/collation/name" },
  {_CS_ID,		"charsets/charset/collation/id" },
  {_CS_ORDER,		"charsets/charset/collation/order" },
  {_CS_FLAG,		"charsets/charset/collation/flag" },
  {_CS_COLLMAP,		"charsets/charset/collation/map" },

  /* Special purpose commands */
  {_CS_UCA_VERSION,              "charsets/charset/collation/version" },
  {_CS_CL_SUPPRESS_CONTRACTIONS, "charsets/charset/collation/suppress_contractions" },
  {_CS_CL_OPTIMIZE,              "charsets/charset/collation/optimize" },
  {_CS_CL_SHIFT_AFTER_METHOD,    "charsets/charset/collation/shift-after-method" },
  {_CS_CL_RULES_IMPORT,          "charsets/charset/collation/rules/import" },
  {_CS_CL_RULES_IMPORT_SOURCE,   "charsets/charset/collation/rules/import/source" },

  /* Collation Settings */
  {_CS_ST_SETTINGS,              "charsets/charset/collation/settings" },
  {_CS_ST_STRENGTH,              "charsets/charset/collation/settings/strength" },
  {_CS_ST_ALTERNATE,             "charsets/charset/collation/settings/alternate" },
  {_CS_ST_BACKWARDS,             "charsets/charset/collation/settings/backwards" },
  {_CS_ST_NORMALIZATION,         "charsets/charset/collation/settings/normalization" },
  {_CS_ST_CASE_LEVEL,            "charsets/charset/collation/settings/caseLevel" },
  {_CS_ST_CASE_FIRST,            "charsets/charset/collation/settings/caseFirst" },
  {_CS_ST_HIRAGANA_QUATERNARY,   "charsets/charset/collation/settings/hiraganaQuaternary" },
  {_CS_ST_NUMERIC,               "charsets/charset/collation/settings/numeric" },
  {_CS_ST_VARIABLE_TOP,          "charsets/charset/collation/settings/variableTop" },
  {_CS_ST_MATCH_BOUNDARIES,      "charsets/charset/collation/settings/match-boundaries" },
  {_CS_ST_MATCH_STYLE,           "charsets/charset/collation/settings/match-style" },

  /* Rules */
  {_CS_RULES,           "charsets/charset/collation/rules" },
  {_CS_RESET,           "charsets/charset/collation/rules/reset" },
  {_CS_DIFF1,           "charsets/charset/collation/rules/p" },
  {_CS_DIFF2,           "charsets/charset/collation/rules/s" },
  {_CS_DIFF3,           "charsets/charset/collation/rules/t" },
  {_CS_DIFF4,           "charsets/charset/collation/rules/q" },
  {_CS_IDENTICAL,       "charsets/charset/collation/rules/i" },

  /* Rules: expansions */
  {_CS_EXP_X,           "charsets/charset/collation/rules/x" },
  {_CS_EXP_EXTEND,      "charsets/charset/collation/rules/x/extend" },
  {_CS_EXP_DIFF1,       "charsets/charset/collation/rules/x/p" },
  {_CS_EXP_DIFF2,       "charsets/charset/collation/rules/x/s" },
  {_CS_EXP_DIFF3,       "charsets/charset/collation/rules/x/t" },
  {_CS_EXP_DIFF4,       "charsets/charset/collation/rules/x/q" },
  {_CS_EXP_IDENTICAL,   "charsets/charset/collation/rules/x/i" },
  
  /* Rules: previous context */
  {_CS_CONTEXT,         "charsets/charset/collation/rules/x/context" },

  /* Rules: Abbreviating Ordering Specifications */
  {_CS_A_DIFF1,         "charsets/charset/collation/rules/pc" },
  {_CS_A_DIFF2,         "charsets/charset/collation/rules/sc" },
  {_CS_A_DIFF3,         "charsets/charset/collation/rules/tc" },
  {_CS_A_DIFF4,         "charsets/charset/collation/rules/qc" },
  {_CS_A_IDENTICAL,     "charsets/charset/collation/rules/ic" },

  /* Rules: Placing Characters Before Others*/
  {_CS_RESET_BEFORE,    "charsets/charset/collation/rules/reset/before" },

  /* Rules: Logical Reset Positions */
  {_CS_RESET_FIRST_NON_IGNORABLE,       "charsets/charset/collation/rules/reset/first_non_ignorable" },
  {_CS_RESET_LAST_NON_IGNORABLE,        "charsets/charset/collation/rules/reset/last_non_ignorable" },
  {_CS_RESET_FIRST_PRIMARY_IGNORABLE,   "charsets/charset/collation/rules/reset/first_primary_ignorable" },
  {_CS_RESET_LAST_PRIMARY_IGNORABLE,    "charsets/charset/collation/rules/reset/last_primary_ignorable" },
  {_CS_RESET_FIRST_SECONDARY_IGNORABLE, "charsets/charset/collation/rules/reset/first_secondary_ignorable" },
  {_CS_RESET_LAST_SECONDARY_IGNORABLE,  "charsets/charset/collation/rules/reset/last_secondary_ignorable" },
  {_CS_RESET_FIRST_TERTIARY_IGNORABLE,  "charsets/charset/collation/rules/reset/first_tertiary_ignorable" },
  {_CS_RESET_LAST_TERTIARY_IGNORABLE,   "charsets/charset/collation/rules/reset/last_tertiary_ignorable" },
  {_CS_RESET_FIRST_TRAILING,            "charsets/charset/collation/rules/reset/first_trailing" },
  {_CS_RESET_LAST_TRAILING,             "charsets/charset/collation/rules/reset/last_trailing" },
  {_CS_RESET_FIRST_VARIABLE,            "charsets/charset/collation/rules/reset/first_variable" },
  {_CS_RESET_LAST_VARIABLE,             "charsets/charset/collation/rules/reset/last_variable" },

  /* End-of-table marker */
  {0,	NULL}
};
| 257 |  | 
| 258 | static const struct my_cs_file_section_st | 
| 259 | *cs_file_sec(const char *attr, size_t len) | 
| 260 | { | 
| 261 |   const struct my_cs_file_section_st *s; | 
| 262 |   for (s=sec; s->str; s++) | 
| 263 |   { | 
| 264 |     if (!strncmp(attr, s->str, len) && s->str[len] == 0) | 
| 265 |       return s; | 
| 266 |   } | 
| 267 |   return NULL; | 
| 268 | } | 
| 269 |  | 
| 270 | #define MY_CS_CSDESCR_SIZE	64 | 
| 271 | #define MY_CS_TAILORING_SIZE	(32*1024) | 
| 272 | #define MY_CS_UCA_VERSION_SIZE  64 | 
| 273 | #define MY_CS_CONTEXT_SIZE      64 | 
| 274 |  | 
| 275 | typedef struct my_cs_file_info | 
| 276 | { | 
| 277 |   char   csname[MY_CS_NAME_SIZE]; | 
| 278 |   char   name[MY_CS_NAME_SIZE]; | 
| 279 |   uchar  ctype[MY_CS_CTYPE_TABLE_SIZE]; | 
| 280 |   uchar  to_lower[MY_CS_TO_LOWER_TABLE_SIZE]; | 
| 281 |   uchar  to_upper[MY_CS_TO_UPPER_TABLE_SIZE]; | 
| 282 |   uchar  sort_order[MY_CS_SORT_ORDER_TABLE_SIZE]; | 
| 283 |   uint16 tab_to_uni[MY_CS_TO_UNI_TABLE_SIZE]; | 
| 284 |   char   [MY_CS_CSDESCR_SIZE]; | 
| 285 |   char  *tailoring; | 
| 286 |   size_t tailoring_length; | 
| 287 |   size_t tailoring_alloced_length; | 
| 288 |   char   context[MY_CS_CONTEXT_SIZE]; | 
| 289 |   struct charset_info_st cs; | 
| 290 |   MY_CHARSET_LOADER *loader; | 
| 291 | } MY_CHARSET_FILE; | 
| 292 |  | 
| 293 |  | 
| 294 | static void | 
| 295 | my_charset_file_reset_charset(MY_CHARSET_FILE *i) | 
| 296 | { | 
| 297 |   memset(&i->cs, 0, sizeof(i->cs)); | 
| 298 | } | 
| 299 |  | 
| 300 |  | 
| 301 | static void | 
| 302 | my_charset_file_reset_collation(MY_CHARSET_FILE *i) | 
| 303 | { | 
| 304 |   i->tailoring_length= 0; | 
| 305 |   i->context[0]= '\0'; | 
| 306 | } | 
| 307 |  | 
| 308 |  | 
| 309 | static void | 
| 310 | my_charset_file_init(MY_CHARSET_FILE *i) | 
| 311 | { | 
| 312 |   my_charset_file_reset_charset(i); | 
| 313 |   my_charset_file_reset_collation(i); | 
| 314 |   i->tailoring= NULL; | 
| 315 |   i->tailoring_alloced_length= 0; | 
| 316 | } | 
| 317 |  | 
| 318 |  | 
| 319 | static void | 
| 320 | my_charset_file_free(MY_CHARSET_FILE *i) | 
| 321 | { | 
| 322 |   i->loader->free(i->tailoring); | 
| 323 | } | 
| 324 |  | 
| 325 |  | 
| 326 | static int | 
| 327 | my_charset_file_tailoring_realloc(MY_CHARSET_FILE *i, size_t newlen) | 
| 328 | { | 
| 329 |   if (i->tailoring_alloced_length > newlen || | 
| 330 |      (i->tailoring= i->loader->realloc(i->tailoring, | 
| 331 |                                        (i->tailoring_alloced_length= | 
| 332 |                                         (newlen + 32*1024))))) | 
| 333 |   { | 
| 334 |     return MY_XML_OK; | 
| 335 |   } | 
| 336 |   return MY_XML_ERROR; | 
| 337 | } | 
| 338 |  | 
| 339 |  | 
| 340 | static int fill_uchar(uchar *a,uint size,const char *str, size_t len) | 
| 341 | { | 
| 342 |   uint i= 0; | 
| 343 |   const char *s, *b, *e=str+len; | 
| 344 |    | 
| 345 |   for (s=str ; s < e ; i++) | 
| 346 |   {  | 
| 347 |     for ( ; (s < e) && strchr(" \t\r\n" ,s[0]); s++) ; | 
| 348 |     b=s; | 
| 349 |     for ( ; (s < e) && !strchr(" \t\r\n" ,s[0]); s++) ; | 
| 350 |     if (s == b || i > size) | 
| 351 |       break; | 
| 352 |     a[i]= (uchar) strtoul(b,NULL,16); | 
| 353 |   } | 
| 354 |   return 0; | 
| 355 | } | 
| 356 |  | 
| 357 | static int fill_uint16(uint16 *a,uint size,const char *str, size_t len) | 
| 358 | { | 
| 359 |   uint i= 0; | 
| 360 |    | 
| 361 |   const char *s, *b, *e=str+len; | 
| 362 |   for (s=str ; s < e ; i++) | 
| 363 |   {  | 
| 364 |     for ( ; (s < e) && strchr(" \t\r\n" ,s[0]); s++) ; | 
| 365 |     b=s; | 
| 366 |     for ( ; (s < e) && !strchr(" \t\r\n" ,s[0]); s++) ; | 
| 367 |     if (s == b || i > size) | 
| 368 |       break; | 
| 369 |     a[i]= (uint16) strtol(b,NULL,16); | 
| 370 |   } | 
| 371 |   return 0; | 
| 372 | } | 
| 373 |  | 
| 374 |  | 
| 375 |  | 
| 376 |  | 
| 377 | static int | 
| 378 | tailoring_append(MY_XML_PARSER *st, | 
| 379 |                  const char *fmt, size_t len, const char *attr) | 
| 380 | { | 
| 381 |   struct my_cs_file_info *i= (struct my_cs_file_info *) st->user_data; | 
| 382 |   size_t newlen= i->tailoring_length + len + 64; /* 64 for format */  | 
| 383 |   if (MY_XML_OK == my_charset_file_tailoring_realloc(i, newlen)) | 
| 384 |   { | 
| 385 |     char *dst= i->tailoring + i->tailoring_length; | 
| 386 |     sprintf(dst, fmt, (int) len, attr); | 
| 387 |     i->tailoring_length+= strlen(dst); | 
| 388 |     return MY_XML_OK; | 
| 389 |   } | 
| 390 |   return MY_XML_ERROR; | 
| 391 | } | 
| 392 |  | 
| 393 |  | 
| 394 | static int | 
| 395 | tailoring_append2(MY_XML_PARSER *st, | 
| 396 |                   const char *fmt, | 
| 397 |                   size_t len1, const char *attr1, | 
| 398 |                   size_t len2, const char *attr2) | 
| 399 | { | 
| 400 |   struct my_cs_file_info *i= (struct my_cs_file_info *) st->user_data; | 
| 401 |   size_t newlen= i->tailoring_length + len1 + len2 + 64; /* 64 for format */ | 
| 402 |   if (MY_XML_OK == my_charset_file_tailoring_realloc(i, newlen)) | 
| 403 |   { | 
| 404 |     char *dst= i->tailoring + i->tailoring_length; | 
| 405 |     sprintf(dst, fmt, (int) len1, attr1, (int) len2, attr2); | 
| 406 |     i->tailoring_length+= strlen(dst); | 
| 407 |     return MY_XML_OK; | 
| 408 |   } | 
| 409 |   return MY_XML_ERROR; | 
| 410 | } | 
| 411 |  | 
| 412 |  | 
| 413 | static size_t | 
| 414 | scan_one_character(const char *s, const char *e, my_wc_t *wc) | 
| 415 | { | 
| 416 |   CHARSET_INFO *cs= &my_charset_utf8_general_ci; | 
| 417 |   if (s >= e) | 
| 418 |     return 0; | 
| 419 |  | 
| 420 |   /* Escape sequence: \uXXXX */ | 
| 421 |   if (s[0] == '\\' && s + 2 < e && s[1] == 'u' && my_isxdigit(cs, s[2])) | 
| 422 |   { | 
| 423 |     size_t len= 3; /* We have at least one digit */ | 
| 424 |     for (s+= 3; s < e && my_isxdigit(cs, s[0]); s++, len++) | 
| 425 |     { | 
| 426 |     } | 
| 427 |     wc[0]= 0; | 
| 428 |     return len; | 
| 429 |   } | 
| 430 |   else if ((int8) s[0] > 0) /* 7-bit character */ | 
| 431 |   { | 
| 432 |     wc[0]= 0; | 
| 433 |     return 1; | 
| 434 |   } | 
| 435 |   else /* Non-escaped character */ | 
| 436 |   { | 
| 437 |     int rc= cs->cset->mb_wc(cs, wc, (uchar *) s, (uchar *) e); | 
| 438 |     if (rc > 0) | 
| 439 |       return (size_t) rc; | 
| 440 |   } | 
| 441 |   return 0; | 
| 442 | } | 
| 443 |  | 
| 444 |  | 
| 445 | static int | 
| 446 | tailoring_append_abbreviation(MY_XML_PARSER *st, | 
| 447 |                               const char *fmt, size_t len, const char *attr) | 
| 448 | { | 
| 449 |   size_t clen; | 
| 450 |   const char *attrend= attr + len; | 
| 451 |   my_wc_t wc; | 
| 452 |  | 
| 453 |   for ( ; (clen= scan_one_character(attr, attrend, &wc)) > 0; attr+= clen) | 
| 454 |   { | 
| 455 |     DBUG_ASSERT(attr < attrend); | 
| 456 |     if (tailoring_append(st, fmt, clen, attr) != MY_XML_OK) | 
| 457 |       return MY_XML_ERROR; | 
| 458 |   } | 
| 459 |   return MY_XML_OK; | 
| 460 | } | 
| 461 |  | 
| 462 |  | 
| 463 | static int cs_enter(MY_XML_PARSER *st,const char *attr, size_t len) | 
| 464 | { | 
| 465 |   struct my_cs_file_info *i= (struct my_cs_file_info *)st->user_data; | 
| 466 |   const struct my_cs_file_section_st *s= cs_file_sec(attr,len); | 
| 467 |   int state= s ? s->state : 0; | 
| 468 |    | 
| 469 |   switch (state) { | 
| 470 |   case 0: | 
| 471 |     i->loader->reporter(WARNING_LEVEL, "Unknown LDML tag: '%.*s'" , len, attr); | 
| 472 |     break; | 
| 473 |  | 
| 474 |   case _CS_CHARSET: | 
| 475 |     my_charset_file_reset_charset(i); | 
| 476 |     break; | 
| 477 |  | 
| 478 |   case _CS_COLLATION: | 
| 479 |     my_charset_file_reset_collation(i); | 
| 480 |     break; | 
| 481 |  | 
| 482 |   case _CS_RESET: | 
| 483 |     return tailoring_append(st, " &" , 0, NULL); | 
| 484 |  | 
| 485 |   default: | 
| 486 |     break; | 
| 487 |   } | 
| 488 |   return MY_XML_OK; | 
| 489 | } | 
| 490 |  | 
| 491 |  | 
| 492 | static int cs_leave(MY_XML_PARSER *st,const char *attr, size_t len) | 
| 493 | { | 
| 494 |   struct my_cs_file_info *i= (struct my_cs_file_info *)st->user_data; | 
| 495 |   const struct my_cs_file_section_st *s= cs_file_sec(attr,len); | 
| 496 |   int    state= s ? s->state : 0; | 
| 497 |   int    rc; | 
| 498 |    | 
| 499 |   switch(state){ | 
| 500 |   case _CS_COLLATION: | 
| 501 |     if (i->tailoring_length) | 
| 502 |       i->cs.tailoring= i->tailoring; | 
| 503 |     rc= i->loader->add_collation ? i->loader->add_collation(&i->cs) : MY_XML_OK; | 
| 504 |     break; | 
| 505 |  | 
| 506 |   /* Rules: Logical Reset Positions */ | 
| 507 |   case _CS_RESET_FIRST_NON_IGNORABLE: | 
| 508 |     rc= tailoring_append(st, "[first non-ignorable]" , 0, NULL); | 
| 509 |     break; | 
| 510 |  | 
| 511 |   case _CS_RESET_LAST_NON_IGNORABLE: | 
| 512 |     rc= tailoring_append(st, "[last non-ignorable]" , 0, NULL); | 
| 513 |     break; | 
| 514 |  | 
| 515 |   case _CS_RESET_FIRST_PRIMARY_IGNORABLE: | 
| 516 |     rc= tailoring_append(st, "[first primary ignorable]" , 0, NULL); | 
| 517 |     break; | 
| 518 |  | 
| 519 |   case _CS_RESET_LAST_PRIMARY_IGNORABLE: | 
| 520 |     rc= tailoring_append(st, "[last primary ignorable]" , 0, NULL); | 
| 521 |     break; | 
| 522 |  | 
| 523 |   case _CS_RESET_FIRST_SECONDARY_IGNORABLE: | 
| 524 |     rc= tailoring_append(st, "[first secondary ignorable]" , 0, NULL); | 
| 525 |     break; | 
| 526 |  | 
| 527 |   case _CS_RESET_LAST_SECONDARY_IGNORABLE: | 
| 528 |     rc= tailoring_append(st, "[last secondary ignorable]" , 0, NULL); | 
| 529 |     break; | 
| 530 |  | 
| 531 |   case _CS_RESET_FIRST_TERTIARY_IGNORABLE: | 
| 532 |     rc= tailoring_append(st, "[first tertiary ignorable]" , 0, NULL); | 
| 533 |     break; | 
| 534 |  | 
| 535 |   case _CS_RESET_LAST_TERTIARY_IGNORABLE: | 
| 536 |     rc= tailoring_append(st, "[last tertiary ignorable]" , 0, NULL); | 
| 537 |     break; | 
| 538 |  | 
| 539 |   case _CS_RESET_FIRST_TRAILING: | 
| 540 |     rc= tailoring_append(st, "[first trailing]" , 0, NULL); | 
| 541 |     break; | 
| 542 |  | 
| 543 |   case _CS_RESET_LAST_TRAILING: | 
| 544 |     rc= tailoring_append(st, "[last trailing]" , 0, NULL); | 
| 545 |     break; | 
| 546 |  | 
| 547 |   case _CS_RESET_FIRST_VARIABLE: | 
| 548 |     rc= tailoring_append(st, "[first variable]" , 0, NULL); | 
| 549 |     break; | 
| 550 |  | 
| 551 |   case _CS_RESET_LAST_VARIABLE: | 
| 552 |     rc= tailoring_append(st, "[last variable]" , 0, NULL); | 
| 553 |     break; | 
| 554 |  | 
| 555 |   default: | 
| 556 |     rc=MY_XML_OK; | 
| 557 |   } | 
| 558 |   return rc; | 
| 559 | } | 
| 560 |  | 
| 561 |  | 
/*
  Formats for a rule without context, indexed by difference level.
  Each consumes one (int length, const char *text) pair.
*/
static const char *diff_fmt[5]=
{
  "<%.*s",      /* primary difference    */
  "<<%.*s",     /* secondary difference  */
  "<<<%.*s",    /* tertiary difference   */
  "<<<<%.*s",   /* quaternary difference */
  "=%.*s"       /* identical             */
};


/*
  Formats for a rule with a previous context, same index as above.
  Each consumes two (int length, const char *text) pairs:
  the context first, then the character.
*/
static const char *context_diff_fmt[5]=
{
  "<%.*s|%.*s",
  "<<%.*s|%.*s",
  "<<<%.*s|%.*s",
  "<<<<%.*s|%.*s",
  "=%.*s|%.*s"
};
| 580 |  | 
| 581 |  | 
/*
  XML "value" handler, called with the text of a tag or attribute.

  The section is looked up from the parser's current path (st->attr),
  not from the arguments: attr/len describe the value text itself.

  Character set and collation properties are stored into i->cs, with
  the backing storage living in *i.  Collation settings and rules are
  translated into text and appended to the tailoring string.

  @param st    XML parser; its user_data is the my_cs_file_info
  @param attr  value text (need not be NUL-terminated)
  @param len   length of attr in bytes
  @return      MY_XML_OK, or MY_XML_ERROR if the tailoring could not grow
*/
static int cs_value(MY_XML_PARSER *st,const char *attr, size_t len)
{
  struct my_cs_file_info *i= (struct my_cs_file_info *)st->user_data;
  const struct my_cs_file_section_st *s;
  int    state= (int)((s= cs_file_sec(st->attr.start,
                                      st->attr.end - st->attr.start)) ?
                      s->state : 0);
  int rc= MY_XML_OK;

  switch (state) {
  case _CS_MISC:
  case _CS_FAMILY:
  case _CS_ORDER:
    break;
  /*
    NOTE(review): the strtol() calls below do not use len; they rely on
    the value being followed by a non-digit character - confirm.
  */
  case _CS_ID:
    i->cs.number= strtol(attr,(char**)NULL,10);
    break;
  case _CS_BINARY_ID:
    i->cs.binary_number= strtol(attr,(char**)NULL,10);
    break;
  case _CS_PRIMARY_ID:
    i->cs.primary_number= strtol(attr,(char**)NULL,10);
    break;
  /* Names and description are copied (truncated if too long) into *i */
  case _CS_COLNAME:
    i->cs.name=mstr(i->name,attr,len,MY_CS_NAME_SIZE-1);
    break;
  case _CS_CSNAME:
    i->cs.csname=mstr(i->csname,attr,len,MY_CS_NAME_SIZE-1);
    break;
  case _CS_CSDESCRIPT:
    i->cs.comment=mstr(i->comment,attr,len,MY_CS_CSDESCR_SIZE-1);
    break;
  case _CS_FLAG:
    /*
      Only the first len bytes are compared, so a value that is a
      prefix of a keyword matches that keyword too.
    */
    if (!strncmp("primary" ,attr,len))
      i->cs.state|= MY_CS_PRIMARY;
    else if (!strncmp("binary" ,attr,len))
      i->cs.state|= MY_CS_BINSORT;
    else if (!strncmp("compiled" ,attr,len))
      i->cs.state|= MY_CS_COMPILED;
    else if (!strncmp("nopad" ,attr,len))
      i->cs.state|= MY_CS_NOPAD;
    break;
  /* Mapping tables: parse the hex numbers and point i->cs at the result */
  case _CS_UPPERMAP:
    fill_uchar(i->to_upper,MY_CS_TO_UPPER_TABLE_SIZE,attr,len);
    i->cs.to_upper=i->to_upper;
    break;
  case _CS_LOWERMAP:
    fill_uchar(i->to_lower,MY_CS_TO_LOWER_TABLE_SIZE,attr,len);
    i->cs.to_lower=i->to_lower;
    break;
  case _CS_UNIMAP:
    fill_uint16(i->tab_to_uni,MY_CS_TO_UNI_TABLE_SIZE,attr,len);
    i->cs.tab_to_uni=i->tab_to_uni;
    break;
  case _CS_COLLMAP:
    fill_uchar(i->sort_order,MY_CS_SORT_ORDER_TABLE_SIZE,attr,len);
    i->cs.sort_order=i->sort_order;
    break;
  case _CS_CTYPEMAP:
    fill_uchar(i->ctype,MY_CS_CTYPE_TABLE_SIZE,attr,len);
    i->cs.ctype=i->ctype;
    break;

  /* Special purpose commands */
  case _CS_UCA_VERSION:
    rc= tailoring_append(st, "[version %.*s]" , len, attr);
    break;

  case _CS_CL_RULES_IMPORT_SOURCE:
    rc= tailoring_append(st, "[import %.*s]" , len, attr);
    break;

  case _CS_CL_SUPPRESS_CONTRACTIONS:
    rc= tailoring_append(st, "[suppress contractions %.*s]" , len, attr);
    break;

  case _CS_CL_OPTIMIZE:
    rc= tailoring_append(st, "[optimize %.*s]" , len, attr);
    break;

  case _CS_CL_SHIFT_AFTER_METHOD:
    rc= tailoring_append(st, "[shift-after-method %.*s]" , len, attr);
    break;

  /* Collation Settings */
  case _CS_ST_STRENGTH:
    /* 1, 2, 3, 4, 5, or primary, secondary, tertiary, quaternary, identical */
    rc= tailoring_append(st, "[strength %.*s]" , len, attr);
    /* A numeric strength is additionally stored as the number of levels */
    if (len && attr[0] >= '1' && attr[0] <= '9')
      i->cs.levels_for_order= attr[0] - '0';
    break;

  case _CS_ST_ALTERNATE:
    /* non-ignorable, shifted */
    rc= tailoring_append(st, "[alternate %.*s]" , len, attr);
    break;

  case _CS_ST_BACKWARDS:
    /* on, off, 2 */
    rc= tailoring_append(st, "[backwards %.*s]" , len, attr);
    break;

  case _CS_ST_NORMALIZATION:
    /*
      TODO for WL#896: check collations for normalization: vi.xml
      We want precomposed characters work well at this point.
    */
    /* on, off */
    rc= tailoring_append(st, "[normalization %.*s]" , len, attr);
    break;

  case _CS_ST_CASE_LEVEL:
    /* on, off */
    rc= tailoring_append(st, "[caseLevel %.*s]" , len, attr);
    break;

  case _CS_ST_CASE_FIRST:
    /* upper, lower, off */
    rc= tailoring_append(st, "[caseFirst %.*s]" , len, attr);
    break;

  case _CS_ST_HIRAGANA_QUATERNARY:
    /* on, off */
    rc= tailoring_append(st, "[hiraganaQ %.*s]" , len, attr);
    break;

  case _CS_ST_NUMERIC:
    /* on, off */
    rc= tailoring_append(st, "[numeric %.*s]" , len, attr);
    break;

  case _CS_ST_VARIABLE_TOP:
    /* TODO for WL#896: check value format */
    rc= tailoring_append(st, "[variableTop %.*s]" , len, attr);
    break;

  case _CS_ST_MATCH_BOUNDARIES:
    /* none, whole-character, whole-word */
    rc= tailoring_append(st, "[match-boundaries %.*s]" , len, attr);
    break;

  case _CS_ST_MATCH_STYLE:
    /* minimal, medial, maximal */
    rc= tailoring_append(st, "[match-style %.*s]" , len, attr);
    break;


  /* Rules */
  case _CS_RESET:
    rc= tailoring_append(st, "%.*s" , len, attr);
    break;

  case _CS_DIFF1:
  case _CS_DIFF2:
  case _CS_DIFF3:
  case _CS_DIFF4:
  case _CS_IDENTICAL:
    /* The state values are consecutive, giving the difference level 0..4 */
    rc= tailoring_append(st, diff_fmt[state - _CS_DIFF1], len, attr);
    break;


  /* Rules: Expansion */
  case _CS_EXP_EXTEND:
    rc= tailoring_append(st, " / %.*s" , len, attr);
    break;

  case _CS_EXP_DIFF1:
  case _CS_EXP_DIFF2:
  case _CS_EXP_DIFF3:
  case _CS_EXP_DIFF4:
  case _CS_EXP_IDENTICAL:
    if (i->context[0])
    {
      /* A context was remembered: emit "context|char" and consume it */
      rc= tailoring_append2(st, context_diff_fmt[state - _CS_EXP_DIFF1],
                            strlen(i->context), i->context, len, attr);
      i->context[0]= 0;
    }
    else
      rc= tailoring_append(st, diff_fmt[state  - _CS_EXP_DIFF1], len, attr);
    break;

  /* Rules: Context */
  case _CS_CONTEXT:
    /*
      Remember the context for the next expansion rule.
      A context that does not fit into the buffer is silently ignored.
    */
    if (len < sizeof(i->context))
    {
      memcpy(i->context, attr, len);
      i->context[len]= '\0';
    }
    break;

  /* Rules: Abbreviating Ordering Specifications */
  case _CS_A_DIFF1:
  case _CS_A_DIFF2:
  case _CS_A_DIFF3:
  case _CS_A_DIFF4:
  case _CS_A_IDENTICAL:
    /* One rule is generated per character of the value */
    rc= tailoring_append_abbreviation(st, diff_fmt[state - _CS_A_DIFF1], len, attr);
    break;

  /* Rules: Placing Characters Before Others */
  case _CS_RESET_BEFORE:
    /*
      TODO for WL#896: Add this check into text customization parser:
      It is an error if the strength of the before relation is not identical
      to the relation after the reset. We'll need this for WL#896.
    */
    rc= tailoring_append(st, "[before %.*s]" , len, attr);
    break;


  default:
    break;
  }

  return rc;
}
| 798 |  | 
| 799 |  | 
| 800 | my_bool | 
| 801 | my_parse_charset_xml(MY_CHARSET_LOADER *loader, const char *buf, size_t len) | 
| 802 | { | 
| 803 |   MY_XML_PARSER p; | 
| 804 |   struct my_cs_file_info info; | 
| 805 |   my_bool rc; | 
| 806 |    | 
| 807 |   my_charset_file_init(&info); | 
| 808 |   my_xml_parser_create(&p); | 
| 809 |   my_xml_set_enter_handler(&p,cs_enter); | 
| 810 |   my_xml_set_value_handler(&p,cs_value); | 
| 811 |   my_xml_set_leave_handler(&p,cs_leave); | 
| 812 |   info.loader= loader; | 
| 813 |   my_xml_set_user_data(&p, (void *) &info); | 
| 814 |   rc= (my_xml_parse(&p,buf,len) == MY_XML_OK) ? FALSE : TRUE; | 
| 815 |   my_xml_parser_free(&p); | 
| 816 |   my_charset_file_free(&info); | 
| 817 |   if (rc != MY_XML_OK) | 
| 818 |   { | 
| 819 |     const char *errstr= my_xml_error_string(&p); | 
| 820 |     if (sizeof(loader->error) > 32 + strlen(errstr)) | 
| 821 |     { | 
| 822 |       /* We cannot use my_snprintf() here. See previous comment. */ | 
| 823 |       sprintf(loader->error, "at line %d pos %d: %s" , | 
| 824 |                 my_xml_error_lineno(&p)+1, | 
| 825 |                 (int) my_xml_error_pos(&p), | 
| 826 |                 my_xml_error_string(&p)); | 
| 827 |     } | 
| 828 |   } | 
| 829 |   return rc; | 
| 830 | } | 
| 831 |  | 
| 832 |  | 
| 833 | uint | 
| 834 | my_string_repertoire_8bit(CHARSET_INFO *cs, const char *str, size_t length) | 
| 835 | { | 
| 836 |   const char *strend; | 
| 837 |   if ((cs->state & MY_CS_NONASCII) && length > 0) | 
| 838 |     return MY_REPERTOIRE_UNICODE30; | 
| 839 |   for (strend= str + length; str < strend; str++) | 
| 840 |   { | 
| 841 |     if (((uchar) *str) > 0x7F) | 
| 842 |       return MY_REPERTOIRE_UNICODE30; | 
| 843 |   } | 
| 844 |   return MY_REPERTOIRE_ASCII; | 
| 845 | } | 
| 846 |  | 
| 847 |  | 
| 848 | static void | 
| 849 | my_string_metadata_init(MY_STRING_METADATA *metadata) | 
| 850 | { | 
| 851 |   metadata->repertoire= MY_REPERTOIRE_ASCII; | 
| 852 |   metadata->char_length= 0; | 
| 853 | } | 
| 854 |  | 
| 855 |  | 
| 856 | /** | 
| 857 |   This should probably eventually go as a virtual function into | 
| 858 |   MY_CHARSET_HANDLER or MY_COLLATION_HANDLER. | 
| 859 | */ | 
| 860 | static void | 
| 861 | my_string_metadata_get_mb(MY_STRING_METADATA *metadata, | 
| 862 |                           CHARSET_INFO *cs, const char *str, ulong length) | 
| 863 | { | 
| 864 |   const char *strend= str + length; | 
| 865 |   for (my_string_metadata_init(metadata) ; | 
| 866 |        str < strend; | 
| 867 |        metadata->char_length++) | 
| 868 |   { | 
| 869 |     my_wc_t wc; | 
| 870 |     int mblen= cs->cset->mb_wc(cs, &wc, (const uchar *) str, | 
| 871 |                                         (const uchar *) strend); | 
| 872 |     if (mblen > 0) /* Assigned character */ | 
| 873 |     { | 
| 874 |       if (wc > 0x7F) | 
| 875 |         metadata->repertoire|= MY_REPERTOIRE_EXTENDED; | 
| 876 |       str+= mblen; | 
| 877 |     } | 
| 878 |     else if (mblen == MY_CS_ILSEQ) /* Bad byte sequence */ | 
| 879 |     { | 
| 880 |       metadata->repertoire|= MY_REPERTOIRE_EXTENDED; | 
| 881 |       str++; | 
| 882 |     } | 
| 883 |     else if (mblen > MY_CS_TOOSMALL) /* Unassigned character */  | 
| 884 |     { | 
| 885 |       metadata->repertoire|= MY_REPERTOIRE_EXTENDED; | 
| 886 |       str+= (-mblen); | 
| 887 |     } | 
| 888 |     else /* Incomplete character, premature end-of-line */ | 
| 889 |     { | 
| 890 |       metadata->repertoire|= MY_REPERTOIRE_EXTENDED; /* Just in case */ | 
| 891 |       break; | 
| 892 |     } | 
| 893 |   } | 
| 894 | } | 
| 895 |  | 
| 896 |  | 
| 897 | /** | 
| 898 |   Collect string metadata: length in characters and repertoire. | 
| 899 | */ | 
| 900 | void | 
| 901 | my_string_metadata_get(MY_STRING_METADATA *metadata, | 
| 902 |                        CHARSET_INFO *cs, const char *str, size_t length) | 
| 903 | { | 
| 904 |   if (cs->mbmaxlen == 1 && !(cs->state & MY_CS_NONASCII)) | 
| 905 |   { | 
| 906 |     metadata->char_length= length; | 
| 907 |     metadata->repertoire= my_string_repertoire_8bit(cs, str, (ulong)length); | 
| 908 |   } | 
| 909 |   else | 
| 910 |   { | 
| 911 |     my_string_metadata_get_mb(metadata, cs, str, (ulong)length); | 
| 912 |   } | 
| 913 | } | 
| 914 |  | 
| 915 |  | 
| 916 | /* | 
| 917 |   Check repertoire: detect pure ascii strings | 
| 918 | */ | 
| 919 | uint | 
| 920 | my_string_repertoire(CHARSET_INFO *cs, const char *str, size_t length) | 
| 921 | { | 
| 922 |   if (cs->mbminlen == 1 && !(cs->state & MY_CS_NONASCII)) | 
| 923 |   { | 
| 924 |     return my_string_repertoire_8bit(cs, str, length); | 
| 925 |   } | 
| 926 |   else | 
| 927 |   { | 
| 928 |     const char *strend= str + length; | 
| 929 |     my_wc_t wc; | 
| 930 |     int chlen; | 
| 931 |     for (; | 
| 932 |          (chlen= cs->cset->mb_wc(cs, &wc, (uchar*) str, (uchar*) strend)) > 0; | 
| 933 |          str+= chlen) | 
| 934 |     { | 
| 935 |       if (wc > 0x7F) | 
| 936 |         return MY_REPERTOIRE_UNICODE30; | 
| 937 |     } | 
| 938 |   } | 
| 939 |   return MY_REPERTOIRE_ASCII; | 
| 940 | } | 
| 941 |  | 
| 942 |  | 
| 943 | /* | 
| 944 |   Returns repertoire for charset | 
| 945 | */ | 
| 946 | uint my_charset_repertoire(CHARSET_INFO *cs) | 
| 947 | { | 
| 948 |   return cs->state & MY_CS_PUREASCII ? | 
| 949 |     MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30; | 
| 950 | } | 
| 951 |  | 
| 952 |  | 
| 953 | /* | 
| 954 |   Detect whether a character set is ASCII compatible. | 
| 955 |  | 
| 956 |   Returns TRUE for: | 
| 957 |    | 
| 958 |   - all 8bit character sets whose Unicode mapping of 0x7B is '{' | 
| 959 |     (ignores swe7 which maps 0x7B to "LATIN LETTER A WITH DIAERESIS") | 
| 960 |    | 
| 961 |   - all multi-byte character sets having mbminlen == 1 | 
| 962 |     (ignores ucs2 whose mbminlen is 2) | 
| 963 |    | 
| 964 |   TODO: | 
| 965 |    | 
| 966 |   When merging to 5.2, this function should be changed | 
| 967 |   to check a new flag MY_CS_NONASCII,  | 
| 968 |    | 
| 969 |      return (cs->flag & MY_CS_NONASCII) ? 0 : 1; | 
| 970 |    | 
| 971 |   This flag was previously added into 5.2 under terms | 
| 972 |   of WL#3759 "Optimize identifier conversion in client-server protocol" | 
| 973 |   especially to mark character sets not compatible with ASCII. | 
| 974 |    | 
| 975 |   We won't backport this flag to 5.0 or 5.1. | 
| 976 |   This function is Ok for 5.0 and 5.1, because we're not going | 
| 977 |   to introduce new tricky character sets between 5.0 and 5.2. | 
| 978 | */ | 
| 979 | my_bool | 
| 980 | my_charset_is_ascii_based(CHARSET_INFO *cs) | 
| 981 | { | 
| 982 |   return  | 
| 983 |     (cs->mbmaxlen == 1 && cs->tab_to_uni && cs->tab_to_uni['{'] == '{') || | 
| 984 |     (cs->mbminlen == 1 && cs->mbmaxlen > 1); | 
| 985 | } | 
| 986 |  | 
| 987 |  | 
| 988 | /* | 
| 989 |   Convert a string between two character sets. | 
| 990 |   'to' must be large enough to store (form_length * to_cs->mbmaxlen) bytes. | 
| 991 |  | 
| 992 |   @param  to[OUT]       Store result here | 
| 993 |   @param  to_length     Size of "to" buffer | 
| 994 |   @param  to_cs         Character set of result string | 
| 995 |   @param  from          Copy from here | 
| 996 |   @param  from_length   Length of the "from" string | 
| 997 |   @param  from_cs       Character set of the "from" string | 
| 998 |   @param  errors[OUT]   Number of conversion errors | 
| 999 |  | 
| 1000 |   @return Number of bytes copied to 'to' string | 
| 1001 | */ | 
| 1002 |  | 
| 1003 | uint32 | 
| 1004 | my_convert_using_func(char *to, size_t to_length, | 
| 1005 |                       CHARSET_INFO *to_cs, my_charset_conv_wc_mb wc_mb, | 
| 1006 |                       const char *from, size_t from_length, | 
| 1007 |                       CHARSET_INFO *from_cs, my_charset_conv_mb_wc mb_wc, | 
| 1008 |                       uint *errors) | 
| 1009 | { | 
| 1010 |   int         cnvres; | 
| 1011 |   my_wc_t     wc; | 
| 1012 |   const uchar *from_end= (const uchar*) from + from_length; | 
| 1013 |   char *to_start= to; | 
| 1014 |   uchar *to_end= (uchar*) to + to_length; | 
| 1015 |   uint error_count= 0; | 
| 1016 |  | 
| 1017 |   while (1) | 
| 1018 |   { | 
| 1019 |     if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0) | 
| 1020 |       from+= cnvres; | 
| 1021 |     else if (cnvres == MY_CS_ILSEQ) | 
| 1022 |     { | 
| 1023 |       error_count++; | 
| 1024 |       from++; | 
| 1025 |       wc= '?'; | 
| 1026 |     } | 
| 1027 |     else if (cnvres > MY_CS_TOOSMALL) | 
| 1028 |     { | 
| 1029 |       /* | 
| 1030 |         A correct multibyte sequence detected | 
| 1031 |         But it doesn't have Unicode mapping. | 
| 1032 |       */ | 
| 1033 |       error_count++; | 
| 1034 |       from+= (-cnvres); | 
| 1035 |       wc= '?'; | 
| 1036 |     } | 
| 1037 |     else | 
| 1038 |     { | 
| 1039 |       if ((uchar *) from >= from_end) | 
| 1040 |         break;  /* End of line */ | 
| 1041 |       /* Incomplete byte sequence */ | 
| 1042 |       error_count++; | 
| 1043 |       from++; | 
| 1044 |       wc= '?'; | 
| 1045 |     } | 
| 1046 |  | 
| 1047 | outp: | 
| 1048 |     if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0) | 
| 1049 |       to+= cnvres; | 
| 1050 |     else if (cnvres == MY_CS_ILUNI && wc != '?') | 
| 1051 |     { | 
| 1052 |       error_count++; | 
| 1053 |       wc= '?'; | 
| 1054 |       goto outp; | 
| 1055 |     } | 
| 1056 |     else | 
| 1057 |       break; | 
| 1058 |   } | 
| 1059 |   *errors= error_count; | 
| 1060 |   return (uint32) (to - to_start); | 
| 1061 | } | 
| 1062 |  | 
| 1063 |  | 
| 1064 | /* | 
| 1065 |   Convert a string between two character sets. | 
| 1066 |    Optimized for quick copying of ASCII characters in the range 0x00..0x7F. | 
| 1067 |   'to' must be large enough to store (form_length * to_cs->mbmaxlen) bytes. | 
| 1068 |  | 
| 1069 |   @param  to[OUT]       Store result here | 
| 1070 |   @param  to_length     Size of "to" buffer | 
| 1071 |   @param  to_cs         Character set of result string | 
| 1072 |   @param  from          Copy from here | 
| 1073 |   @param  from_length   Length of the "from" string | 
| 1074 |   @param  from_cs       Character set of the "from" string | 
| 1075 |   @param  errors[OUT]   Number of conversion errors | 
| 1076 |  | 
| 1077 |   @return Number of bytes copied to 'to' string | 
| 1078 | */ | 
| 1079 |  | 
| 1080 | uint32 | 
| 1081 | my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs, | 
| 1082 |            const char *from, uint32 from_length, | 
| 1083 |            CHARSET_INFO *from_cs, uint *errors) | 
| 1084 | { | 
| 1085 |   uint32 length, length2; | 
| 1086 |   /* | 
| 1087 |     If any of the character sets is not ASCII compatible, | 
| 1088 |     immediately switch to slow mb_wc->wc_mb method. | 
| 1089 |   */ | 
| 1090 |   if ((to_cs->state | from_cs->state) & MY_CS_NONASCII) | 
| 1091 |     return my_convert_using_func(to, to_length, | 
| 1092 |                                  to_cs, to_cs->cset->wc_mb, | 
| 1093 |                                  from, from_length, | 
| 1094 |                                  from_cs, from_cs->cset->mb_wc, | 
| 1095 |                                  errors); | 
| 1096 |  | 
| 1097 |   length= length2= MY_MIN(to_length, from_length); | 
| 1098 |  | 
| 1099 | #if defined(__i386__) || defined(__x86_64__) | 
| 1100 |   /* | 
| 1101 |     Special loop for i386, it allows to refer to a | 
| 1102 |     non-aligned memory block as UINT32, which makes | 
| 1103 |     it possible to copy four bytes at once. This | 
| 1104 |     gives about 10% performance improvement comparing | 
| 1105 |     to byte-by-byte loop. | 
| 1106 |   */ | 
| 1107 |   for ( ; length >= 4; length-= 4, from+= 4, to+= 4) | 
| 1108 |   { | 
| 1109 |     if ((*(uint32*)from) & 0x80808080) | 
| 1110 |       break; | 
| 1111 |     *((uint32*) to)= *((const uint32*) from); | 
| 1112 |   } | 
| 1113 | #endif /* __i386__ */ | 
| 1114 |  | 
| 1115 |   for (; ; *to++= *from++, length--) | 
| 1116 |   { | 
| 1117 |     if (!length) | 
| 1118 |     { | 
| 1119 |       *errors= 0; | 
| 1120 |       return length2; | 
| 1121 |     } | 
| 1122 |     if (*((unsigned char*) from) > 0x7F) /* A non-ASCII character */ | 
| 1123 |     { | 
| 1124 |       uint32 copied_length= length2 - length; | 
| 1125 |       to_length-= copied_length; | 
| 1126 |       from_length-= copied_length; | 
| 1127 |       return copied_length + my_convert_using_func(to, to_length, to_cs, | 
| 1128 |                                                    to_cs->cset->wc_mb, | 
| 1129 |                                                    from, from_length, from_cs, | 
| 1130 |                                                    from_cs->cset->mb_wc, | 
| 1131 |                                                    errors); | 
| 1132 |     } | 
| 1133 |   } | 
| 1134 |  | 
| 1135 |   DBUG_ASSERT(FALSE); // Should never get to here | 
| 1136 |   return 0;           // Make compiler happy | 
| 1137 | } | 
| 1138 |  | 
| 1139 |  | 
| 1140 | size_t | 
| 1141 | my_convert_fix(CHARSET_INFO *to_cs, char *to, size_t to_length, | 
| 1142 |                CHARSET_INFO *from_cs, const char *from, size_t from_length, | 
| 1143 |                size_t nchars, | 
| 1144 |                MY_STRCOPY_STATUS *copy_status, | 
| 1145 |                MY_STRCONV_STATUS *conv_status) | 
| 1146 | { | 
| 1147 |   int cnvres; | 
| 1148 |   my_wc_t wc; | 
| 1149 |   my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc; | 
| 1150 |   my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb; | 
| 1151 |   const uchar *from_end= (const uchar*) from + from_length; | 
| 1152 |   uchar *to_end= (uchar*) to + to_length; | 
| 1153 |   char *to_start= to; | 
| 1154 |  | 
| 1155 |   DBUG_ASSERT(to_cs != &my_charset_bin); | 
| 1156 |   DBUG_ASSERT(from_cs != &my_charset_bin); | 
| 1157 |  | 
| 1158 |   copy_status->m_well_formed_error_pos= NULL; | 
| 1159 |   conv_status->m_cannot_convert_error_pos= NULL; | 
| 1160 |  | 
| 1161 |   for ( ; nchars; nchars--) | 
| 1162 |   { | 
| 1163 |     const char *from_prev= from; | 
| 1164 |     if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0) | 
| 1165 |       from+= cnvres; | 
| 1166 |     else if (cnvres == MY_CS_ILSEQ) | 
| 1167 |     { | 
| 1168 |       if (!copy_status->m_well_formed_error_pos) | 
| 1169 |         copy_status->m_well_formed_error_pos= from; | 
| 1170 |       from++; | 
| 1171 |       wc= '?'; | 
| 1172 |     } | 
| 1173 |     else if (cnvres > MY_CS_TOOSMALL) | 
| 1174 |     { | 
| 1175 |       /* | 
| 1176 |         A correct multibyte sequence detected | 
| 1177 |         But it doesn't have Unicode mapping. | 
| 1178 |       */ | 
| 1179 |       if (!conv_status->m_cannot_convert_error_pos) | 
| 1180 |         conv_status->m_cannot_convert_error_pos= from; | 
| 1181 |       from+= (-cnvres); | 
| 1182 |       wc= '?'; | 
| 1183 |     } | 
| 1184 |     else | 
| 1185 |     { | 
| 1186 |       if ((uchar *) from >= from_end) | 
| 1187 |         break; // End of line | 
| 1188 |       // Incomplete byte sequence | 
| 1189 |       if (!copy_status->m_well_formed_error_pos) | 
| 1190 |         copy_status->m_well_formed_error_pos= from; | 
| 1191 |       from++; | 
| 1192 |       wc= '?'; | 
| 1193 |     } | 
| 1194 | outp: | 
| 1195 |     if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0) | 
| 1196 |       to+= cnvres; | 
| 1197 |     else if (cnvres == MY_CS_ILUNI && wc != '?') | 
| 1198 |     { | 
| 1199 |       if (!conv_status->m_cannot_convert_error_pos) | 
| 1200 |         conv_status->m_cannot_convert_error_pos= from_prev; | 
| 1201 |       wc= '?'; | 
| 1202 |       goto outp; | 
| 1203 |     } | 
| 1204 |     else | 
| 1205 |     { | 
| 1206 |       from= from_prev; | 
| 1207 |       break; | 
| 1208 |     } | 
| 1209 |   } | 
| 1210 |   copy_status->m_source_end_pos= from; | 
| 1211 |   return to - to_start; | 
| 1212 | } | 
| 1213 |  |