| 1 | /* Copyright (c) 2000, 2013, Oracle and/or its affiliates. |
| 2 | Copyright (c) 2009, 2014, SkySQL Ab. |
| 3 | |
| 4 | This program is free software; you can redistribute it and/or modify |
| 5 | it under the terms of the GNU General Public License as published by |
| 6 | the Free Software Foundation; version 2 of the License. |
| 7 | |
| 8 | This program is distributed in the hope that it will be useful, |
| 9 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 11 | GNU General Public License for more details. |
| 12 | |
| 13 | You should have received a copy of the GNU General Public License |
| 14 | along with this program; if not, write to the Free Software |
| 15 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ |
| 16 | |
| 17 | #include "strings_def.h" |
| 18 | #include <m_ctype.h> |
| 19 | #include <my_xml.h> |
| 20 | |
/*

  This file implements routines which parse XML-based
  character set and collation description files.

  Unicode collations are encoded according to

    Unicode Technical Standard #35
    Locale Data Markup Language (LDML)
    http://www.unicode.org/reports/tr35/

  and converted into an ICU string according to

    Collation Customization
    http://oss.software.ibm.com/icu/userguide/Collate_Customization.html

*/
| 38 | |
| 39 | |
/*
  Forbid my_snprintf() in this file.

  ctype.o is also used to build conf_to_src, which must keep its
  dependencies to a minimum, so my_snprintf() cannot be used here.
  Any macro definition of my_snprintf is dropped and the name is
  redefined as a string literal, so that an accidental call does not
  compile.
*/

#undef my_snprintf
#define my_snprintf "We cannot use my_snprintf in this file"


/*
  Function pointer, initialized to NULL. Nothing in the visible part of
  this file assigns or calls it.
  NOTE(review): presumably a stack overrun check hook installed by the
  server - confirm against callers.
*/
int (*my_string_stack_guard)(int)= NULL;
| 52 | |
/*
  Copy min(l1, l2) bytes from src into str and NUL-terminate the result.

  @param str  destination buffer; needs room for min(l1, l2) + 1 bytes
  @param src  source bytes (need not be NUL-terminated)
  @param l1   number of bytes available in src
  @param l2   upper bound on the number of bytes to copy
  @return     str
*/
static char *mstr(char *str,const char *src,size_t l1,size_t l2)
{
  size_t ncopy= l2;

  if (l1 < l2)
    ncopy= l1;
  memcpy(str, src, ncopy);
  str[ncopy]= '\0';
  return str;
}
| 60 | |
/*
  One entry of the table that maps an XML element path to the
  numeric state code the handlers switch on.
*/
struct my_cs_file_section_st
{
  int state;        /* one of the _CS_* codes */
  const char *str;  /* element path, e.g. "charsets/charset/name" */
};
| 66 | |
/*
  State codes, one per recognized XML element path (see sec[]).
  The handlers use 0 for a path that is not found in the table.
*/
#define _CS_MISC 1
#define _CS_ID 2
#define _CS_CSNAME 3
#define _CS_FAMILY 4
#define _CS_ORDER 5
#define _CS_COLNAME 6
#define _CS_FLAG 7
#define _CS_CHARSET 8
#define _CS_COLLATION 9
#define _CS_UPPERMAP 10
#define _CS_LOWERMAP 11
#define _CS_UNIMAP 12
#define _CS_COLLMAP 13
#define _CS_CTYPEMAP 14
#define _CS_PRIMARY_ID 15
#define _CS_BINARY_ID 16
#define _CS_CSDESCRIPT 17


/* Special purpose commands */
#define _CS_UCA_VERSION 100
#define _CS_CL_SUPPRESS_CONTRACTIONS 101
#define _CS_CL_OPTIMIZE 102
#define _CS_CL_SHIFT_AFTER_METHOD 103
#define _CS_CL_RULES_IMPORT 104
#define _CS_CL_RULES_IMPORT_SOURCE 105


/* Collation Settings */
#define _CS_ST_SETTINGS 200
#define _CS_ST_STRENGTH 201
#define _CS_ST_ALTERNATE 202
#define _CS_ST_BACKWARDS 203
#define _CS_ST_NORMALIZATION 204
#define _CS_ST_CASE_LEVEL 205
#define _CS_ST_CASE_FIRST 206
#define _CS_ST_HIRAGANA_QUATERNARY 207
#define _CS_ST_NUMERIC 208
#define _CS_ST_VARIABLE_TOP 209
#define _CS_ST_MATCH_BOUNDARIES 210
#define _CS_ST_MATCH_STYLE 211


/*
  Rules.
  The five codes _CS_DIFF1 .. _CS_IDENTICAL are consecutive because
  cs_value() uses (state - _CS_DIFF1) as an index into diff_fmt[].
*/
#define _CS_RULES 300
#define _CS_RESET 301
#define _CS_DIFF1 302
#define _CS_DIFF2 303
#define _CS_DIFF3 304
#define _CS_DIFF4 305
#define _CS_IDENTICAL 306

/*
  Rules: Expansions.
  _CS_EXP_DIFF1 .. _CS_EXP_IDENTICAL are consecutive for the same
  indexing reason.
*/
#define _CS_EXP_X 320
#define _CS_EXP_EXTEND 321
#define _CS_EXP_DIFF1 322
#define _CS_EXP_DIFF2 323
#define _CS_EXP_DIFF3 324
#define _CS_EXP_DIFF4 325
#define _CS_EXP_IDENTICAL 326

/*
  Rules: Abbreviating Ordering Specifications.
  _CS_A_DIFF1 .. _CS_A_IDENTICAL are consecutive for the same
  indexing reason.
*/
#define _CS_A_DIFF1 351
#define _CS_A_DIFF2 352
#define _CS_A_DIFF3 353
#define _CS_A_DIFF4 354
#define _CS_A_IDENTICAL 355

/* Rules: previous context */
#define _CS_CONTEXT 370

/* Rules: Placing Characters Before Others*/
#define _CS_RESET_BEFORE 380

/* Rules: Logical Reset Positions */
#define _CS_RESET_FIRST_PRIMARY_IGNORABLE 401
#define _CS_RESET_LAST_PRIMARY_IGNORABLE 402
#define _CS_RESET_FIRST_SECONDARY_IGNORABLE 403
#define _CS_RESET_LAST_SECONDARY_IGNORABLE 404
#define _CS_RESET_FIRST_TERTIARY_IGNORABLE 405
#define _CS_RESET_LAST_TERTIARY_IGNORABLE 406
#define _CS_RESET_FIRST_TRAILING 407
#define _CS_RESET_LAST_TRAILING 408
#define _CS_RESET_FIRST_VARIABLE 409
#define _CS_RESET_LAST_VARIABLE 410
#define _CS_RESET_FIRST_NON_IGNORABLE 411
#define _CS_RESET_LAST_NON_IGNORABLE 412
| 154 | |
| 155 | |
| 156 | |
/*
  Table of the XML element paths this parser recognizes, each paired
  with its state code. cs_file_sec() scans it linearly; the entry whose
  str is NULL terminates the list.
*/
static const struct my_cs_file_section_st sec[] =
{
  {_CS_MISC, "xml" },
  {_CS_MISC, "xml/version" },
  {_CS_MISC, "xml/encoding" },
  {_CS_MISC, "charsets" },
  {_CS_MISC, "charsets/max-id" },
  {_CS_MISC, "charsets/copyright" },
  {_CS_MISC, "charsets/description" },
  {_CS_CHARSET, "charsets/charset" },
  {_CS_PRIMARY_ID, "charsets/charset/primary-id" },
  {_CS_BINARY_ID, "charsets/charset/binary-id" },
  {_CS_CSNAME, "charsets/charset/name" },
  {_CS_FAMILY, "charsets/charset/family" },
  {_CS_CSDESCRIPT, "charsets/charset/description" },
  {_CS_MISC, "charsets/charset/alias" },
  {_CS_MISC, "charsets/charset/ctype" },
  {_CS_CTYPEMAP, "charsets/charset/ctype/map" },
  {_CS_MISC, "charsets/charset/upper" },
  {_CS_UPPERMAP, "charsets/charset/upper/map" },
  {_CS_MISC, "charsets/charset/lower" },
  {_CS_LOWERMAP, "charsets/charset/lower/map" },
  {_CS_MISC, "charsets/charset/unicode" },
  {_CS_UNIMAP, "charsets/charset/unicode/map" },
  {_CS_COLLATION, "charsets/charset/collation" },
  {_CS_COLNAME, "charsets/charset/collation/name" },
  {_CS_ID, "charsets/charset/collation/id" },
  {_CS_ORDER, "charsets/charset/collation/order" },
  {_CS_FLAG, "charsets/charset/collation/flag" },
  {_CS_COLLMAP, "charsets/charset/collation/map" },

  /* Special purpose commands */
  {_CS_UCA_VERSION, "charsets/charset/collation/version" },
  {_CS_CL_SUPPRESS_CONTRACTIONS, "charsets/charset/collation/suppress_contractions" },
  {_CS_CL_OPTIMIZE, "charsets/charset/collation/optimize" },
  {_CS_CL_SHIFT_AFTER_METHOD, "charsets/charset/collation/shift-after-method" },
  {_CS_CL_RULES_IMPORT, "charsets/charset/collation/rules/import" },
  {_CS_CL_RULES_IMPORT_SOURCE, "charsets/charset/collation/rules/import/source" },

  /* Collation Settings */
  {_CS_ST_SETTINGS, "charsets/charset/collation/settings" },
  {_CS_ST_STRENGTH, "charsets/charset/collation/settings/strength" },
  {_CS_ST_ALTERNATE, "charsets/charset/collation/settings/alternate" },
  {_CS_ST_BACKWARDS, "charsets/charset/collation/settings/backwards" },
  {_CS_ST_NORMALIZATION, "charsets/charset/collation/settings/normalization" },
  {_CS_ST_CASE_LEVEL, "charsets/charset/collation/settings/caseLevel" },
  {_CS_ST_CASE_FIRST, "charsets/charset/collation/settings/caseFirst" },
  {_CS_ST_HIRAGANA_QUATERNARY, "charsets/charset/collation/settings/hiraganaQuaternary" },
  {_CS_ST_NUMERIC, "charsets/charset/collation/settings/numeric" },
  {_CS_ST_VARIABLE_TOP, "charsets/charset/collation/settings/variableTop" },
  {_CS_ST_MATCH_BOUNDARIES, "charsets/charset/collation/settings/match-boundaries" },
  {_CS_ST_MATCH_STYLE, "charsets/charset/collation/settings/match-style" },

  /* Rules */
  {_CS_RULES, "charsets/charset/collation/rules" },
  {_CS_RESET, "charsets/charset/collation/rules/reset" },
  {_CS_DIFF1, "charsets/charset/collation/rules/p" },
  {_CS_DIFF2, "charsets/charset/collation/rules/s" },
  {_CS_DIFF3, "charsets/charset/collation/rules/t" },
  {_CS_DIFF4, "charsets/charset/collation/rules/q" },
  {_CS_IDENTICAL, "charsets/charset/collation/rules/i" },

  /* Rules: expansions */
  {_CS_EXP_X, "charsets/charset/collation/rules/x" },
  {_CS_EXP_EXTEND, "charsets/charset/collation/rules/x/extend" },
  {_CS_EXP_DIFF1, "charsets/charset/collation/rules/x/p" },
  {_CS_EXP_DIFF2, "charsets/charset/collation/rules/x/s" },
  {_CS_EXP_DIFF3, "charsets/charset/collation/rules/x/t" },
  {_CS_EXP_DIFF4, "charsets/charset/collation/rules/x/q" },
  {_CS_EXP_IDENTICAL, "charsets/charset/collation/rules/x/i" },

  /* Rules: previous context */
  {_CS_CONTEXT, "charsets/charset/collation/rules/x/context" },

  /* Rules: Abbreviating Ordering Specifications */
  {_CS_A_DIFF1, "charsets/charset/collation/rules/pc" },
  {_CS_A_DIFF2, "charsets/charset/collation/rules/sc" },
  {_CS_A_DIFF3, "charsets/charset/collation/rules/tc" },
  {_CS_A_DIFF4, "charsets/charset/collation/rules/qc" },
  {_CS_A_IDENTICAL, "charsets/charset/collation/rules/ic" },

  /* Rules: Placing Characters Before Others*/
  {_CS_RESET_BEFORE, "charsets/charset/collation/rules/reset/before" },

  /* Rules: Logical Reset Positions */
  {_CS_RESET_FIRST_NON_IGNORABLE, "charsets/charset/collation/rules/reset/first_non_ignorable" },
  {_CS_RESET_LAST_NON_IGNORABLE, "charsets/charset/collation/rules/reset/last_non_ignorable" },
  {_CS_RESET_FIRST_PRIMARY_IGNORABLE, "charsets/charset/collation/rules/reset/first_primary_ignorable" },
  {_CS_RESET_LAST_PRIMARY_IGNORABLE, "charsets/charset/collation/rules/reset/last_primary_ignorable" },
  {_CS_RESET_FIRST_SECONDARY_IGNORABLE, "charsets/charset/collation/rules/reset/first_secondary_ignorable" },
  {_CS_RESET_LAST_SECONDARY_IGNORABLE, "charsets/charset/collation/rules/reset/last_secondary_ignorable" },
  {_CS_RESET_FIRST_TERTIARY_IGNORABLE, "charsets/charset/collation/rules/reset/first_tertiary_ignorable" },
  {_CS_RESET_LAST_TERTIARY_IGNORABLE, "charsets/charset/collation/rules/reset/last_tertiary_ignorable" },
  {_CS_RESET_FIRST_TRAILING, "charsets/charset/collation/rules/reset/first_trailing" },
  {_CS_RESET_LAST_TRAILING, "charsets/charset/collation/rules/reset/last_trailing" },
  {_CS_RESET_FIRST_VARIABLE, "charsets/charset/collation/rules/reset/first_variable" },
  {_CS_RESET_LAST_VARIABLE, "charsets/charset/collation/rules/reset/last_variable" },

  {0, NULL}
};
| 257 | |
| 258 | static const struct my_cs_file_section_st |
| 259 | *cs_file_sec(const char *attr, size_t len) |
| 260 | { |
| 261 | const struct my_cs_file_section_st *s; |
| 262 | for (s=sec; s->str; s++) |
| 263 | { |
| 264 | if (!strncmp(attr, s->str, len) && s->str[len] == 0) |
| 265 | return s; |
| 266 | } |
| 267 | return NULL; |
| 268 | } |
| 269 | |
| 270 | #define MY_CS_CSDESCR_SIZE 64 |
| 271 | #define MY_CS_TAILORING_SIZE (32*1024) |
| 272 | #define MY_CS_UCA_VERSION_SIZE 64 |
| 273 | #define MY_CS_CONTEXT_SIZE 64 |
| 274 | |
| 275 | typedef struct my_cs_file_info |
| 276 | { |
| 277 | char csname[MY_CS_NAME_SIZE]; |
| 278 | char name[MY_CS_NAME_SIZE]; |
| 279 | uchar ctype[MY_CS_CTYPE_TABLE_SIZE]; |
| 280 | uchar to_lower[MY_CS_TO_LOWER_TABLE_SIZE]; |
| 281 | uchar to_upper[MY_CS_TO_UPPER_TABLE_SIZE]; |
| 282 | uchar sort_order[MY_CS_SORT_ORDER_TABLE_SIZE]; |
| 283 | uint16 tab_to_uni[MY_CS_TO_UNI_TABLE_SIZE]; |
| 284 | char [MY_CS_CSDESCR_SIZE]; |
| 285 | char *tailoring; |
| 286 | size_t tailoring_length; |
| 287 | size_t tailoring_alloced_length; |
| 288 | char context[MY_CS_CONTEXT_SIZE]; |
| 289 | struct charset_info_st cs; |
| 290 | MY_CHARSET_LOADER *loader; |
| 291 | } MY_CHARSET_FILE; |
| 292 | |
| 293 | |
| 294 | static void |
| 295 | my_charset_file_reset_charset(MY_CHARSET_FILE *i) |
| 296 | { |
| 297 | memset(&i->cs, 0, sizeof(i->cs)); |
| 298 | } |
| 299 | |
| 300 | |
| 301 | static void |
| 302 | my_charset_file_reset_collation(MY_CHARSET_FILE *i) |
| 303 | { |
| 304 | i->tailoring_length= 0; |
| 305 | i->context[0]= '\0'; |
| 306 | } |
| 307 | |
| 308 | |
| 309 | static void |
| 310 | my_charset_file_init(MY_CHARSET_FILE *i) |
| 311 | { |
| 312 | my_charset_file_reset_charset(i); |
| 313 | my_charset_file_reset_collation(i); |
| 314 | i->tailoring= NULL; |
| 315 | i->tailoring_alloced_length= 0; |
| 316 | } |
| 317 | |
| 318 | |
| 319 | static void |
| 320 | my_charset_file_free(MY_CHARSET_FILE *i) |
| 321 | { |
| 322 | i->loader->free(i->tailoring); |
| 323 | } |
| 324 | |
| 325 | |
| 326 | static int |
| 327 | my_charset_file_tailoring_realloc(MY_CHARSET_FILE *i, size_t newlen) |
| 328 | { |
| 329 | if (i->tailoring_alloced_length > newlen || |
| 330 | (i->tailoring= i->loader->realloc(i->tailoring, |
| 331 | (i->tailoring_alloced_length= |
| 332 | (newlen + 32*1024))))) |
| 333 | { |
| 334 | return MY_XML_OK; |
| 335 | } |
| 336 | return MY_XML_ERROR; |
| 337 | } |
| 338 | |
| 339 | |
| 340 | static int fill_uchar(uchar *a,uint size,const char *str, size_t len) |
| 341 | { |
| 342 | uint i= 0; |
| 343 | const char *s, *b, *e=str+len; |
| 344 | |
| 345 | for (s=str ; s < e ; i++) |
| 346 | { |
| 347 | for ( ; (s < e) && strchr(" \t\r\n" ,s[0]); s++) ; |
| 348 | b=s; |
| 349 | for ( ; (s < e) && !strchr(" \t\r\n" ,s[0]); s++) ; |
| 350 | if (s == b || i > size) |
| 351 | break; |
| 352 | a[i]= (uchar) strtoul(b,NULL,16); |
| 353 | } |
| 354 | return 0; |
| 355 | } |
| 356 | |
| 357 | static int fill_uint16(uint16 *a,uint size,const char *str, size_t len) |
| 358 | { |
| 359 | uint i= 0; |
| 360 | |
| 361 | const char *s, *b, *e=str+len; |
| 362 | for (s=str ; s < e ; i++) |
| 363 | { |
| 364 | for ( ; (s < e) && strchr(" \t\r\n" ,s[0]); s++) ; |
| 365 | b=s; |
| 366 | for ( ; (s < e) && !strchr(" \t\r\n" ,s[0]); s++) ; |
| 367 | if (s == b || i > size) |
| 368 | break; |
| 369 | a[i]= (uint16) strtol(b,NULL,16); |
| 370 | } |
| 371 | return 0; |
| 372 | } |
| 373 | |
| 374 | |
| 375 | |
| 376 | |
| 377 | static int |
| 378 | tailoring_append(MY_XML_PARSER *st, |
| 379 | const char *fmt, size_t len, const char *attr) |
| 380 | { |
| 381 | struct my_cs_file_info *i= (struct my_cs_file_info *) st->user_data; |
| 382 | size_t newlen= i->tailoring_length + len + 64; /* 64 for format */ |
| 383 | if (MY_XML_OK == my_charset_file_tailoring_realloc(i, newlen)) |
| 384 | { |
| 385 | char *dst= i->tailoring + i->tailoring_length; |
| 386 | sprintf(dst, fmt, (int) len, attr); |
| 387 | i->tailoring_length+= strlen(dst); |
| 388 | return MY_XML_OK; |
| 389 | } |
| 390 | return MY_XML_ERROR; |
| 391 | } |
| 392 | |
| 393 | |
| 394 | static int |
| 395 | tailoring_append2(MY_XML_PARSER *st, |
| 396 | const char *fmt, |
| 397 | size_t len1, const char *attr1, |
| 398 | size_t len2, const char *attr2) |
| 399 | { |
| 400 | struct my_cs_file_info *i= (struct my_cs_file_info *) st->user_data; |
| 401 | size_t newlen= i->tailoring_length + len1 + len2 + 64; /* 64 for format */ |
| 402 | if (MY_XML_OK == my_charset_file_tailoring_realloc(i, newlen)) |
| 403 | { |
| 404 | char *dst= i->tailoring + i->tailoring_length; |
| 405 | sprintf(dst, fmt, (int) len1, attr1, (int) len2, attr2); |
| 406 | i->tailoring_length+= strlen(dst); |
| 407 | return MY_XML_OK; |
| 408 | } |
| 409 | return MY_XML_ERROR; |
| 410 | } |
| 411 | |
| 412 | |
| 413 | static size_t |
| 414 | scan_one_character(const char *s, const char *e, my_wc_t *wc) |
| 415 | { |
| 416 | CHARSET_INFO *cs= &my_charset_utf8_general_ci; |
| 417 | if (s >= e) |
| 418 | return 0; |
| 419 | |
| 420 | /* Escape sequence: \uXXXX */ |
| 421 | if (s[0] == '\\' && s + 2 < e && s[1] == 'u' && my_isxdigit(cs, s[2])) |
| 422 | { |
| 423 | size_t len= 3; /* We have at least one digit */ |
| 424 | for (s+= 3; s < e && my_isxdigit(cs, s[0]); s++, len++) |
| 425 | { |
| 426 | } |
| 427 | wc[0]= 0; |
| 428 | return len; |
| 429 | } |
| 430 | else if ((int8) s[0] > 0) /* 7-bit character */ |
| 431 | { |
| 432 | wc[0]= 0; |
| 433 | return 1; |
| 434 | } |
| 435 | else /* Non-escaped character */ |
| 436 | { |
| 437 | int rc= cs->cset->mb_wc(cs, wc, (uchar *) s, (uchar *) e); |
| 438 | if (rc > 0) |
| 439 | return (size_t) rc; |
| 440 | } |
| 441 | return 0; |
| 442 | } |
| 443 | |
| 444 | |
| 445 | static int |
| 446 | tailoring_append_abbreviation(MY_XML_PARSER *st, |
| 447 | const char *fmt, size_t len, const char *attr) |
| 448 | { |
| 449 | size_t clen; |
| 450 | const char *attrend= attr + len; |
| 451 | my_wc_t wc; |
| 452 | |
| 453 | for ( ; (clen= scan_one_character(attr, attrend, &wc)) > 0; attr+= clen) |
| 454 | { |
| 455 | DBUG_ASSERT(attr < attrend); |
| 456 | if (tailoring_append(st, fmt, clen, attr) != MY_XML_OK) |
| 457 | return MY_XML_ERROR; |
| 458 | } |
| 459 | return MY_XML_OK; |
| 460 | } |
| 461 | |
| 462 | |
| 463 | static int cs_enter(MY_XML_PARSER *st,const char *attr, size_t len) |
| 464 | { |
| 465 | struct my_cs_file_info *i= (struct my_cs_file_info *)st->user_data; |
| 466 | const struct my_cs_file_section_st *s= cs_file_sec(attr,len); |
| 467 | int state= s ? s->state : 0; |
| 468 | |
| 469 | switch (state) { |
| 470 | case 0: |
| 471 | i->loader->reporter(WARNING_LEVEL, "Unknown LDML tag: '%.*s'" , len, attr); |
| 472 | break; |
| 473 | |
| 474 | case _CS_CHARSET: |
| 475 | my_charset_file_reset_charset(i); |
| 476 | break; |
| 477 | |
| 478 | case _CS_COLLATION: |
| 479 | my_charset_file_reset_collation(i); |
| 480 | break; |
| 481 | |
| 482 | case _CS_RESET: |
| 483 | return tailoring_append(st, " &" , 0, NULL); |
| 484 | |
| 485 | default: |
| 486 | break; |
| 487 | } |
| 488 | return MY_XML_OK; |
| 489 | } |
| 490 | |
| 491 | |
| 492 | static int cs_leave(MY_XML_PARSER *st,const char *attr, size_t len) |
| 493 | { |
| 494 | struct my_cs_file_info *i= (struct my_cs_file_info *)st->user_data; |
| 495 | const struct my_cs_file_section_st *s= cs_file_sec(attr,len); |
| 496 | int state= s ? s->state : 0; |
| 497 | int rc; |
| 498 | |
| 499 | switch(state){ |
| 500 | case _CS_COLLATION: |
| 501 | if (i->tailoring_length) |
| 502 | i->cs.tailoring= i->tailoring; |
| 503 | rc= i->loader->add_collation ? i->loader->add_collation(&i->cs) : MY_XML_OK; |
| 504 | break; |
| 505 | |
| 506 | /* Rules: Logical Reset Positions */ |
| 507 | case _CS_RESET_FIRST_NON_IGNORABLE: |
| 508 | rc= tailoring_append(st, "[first non-ignorable]" , 0, NULL); |
| 509 | break; |
| 510 | |
| 511 | case _CS_RESET_LAST_NON_IGNORABLE: |
| 512 | rc= tailoring_append(st, "[last non-ignorable]" , 0, NULL); |
| 513 | break; |
| 514 | |
| 515 | case _CS_RESET_FIRST_PRIMARY_IGNORABLE: |
| 516 | rc= tailoring_append(st, "[first primary ignorable]" , 0, NULL); |
| 517 | break; |
| 518 | |
| 519 | case _CS_RESET_LAST_PRIMARY_IGNORABLE: |
| 520 | rc= tailoring_append(st, "[last primary ignorable]" , 0, NULL); |
| 521 | break; |
| 522 | |
| 523 | case _CS_RESET_FIRST_SECONDARY_IGNORABLE: |
| 524 | rc= tailoring_append(st, "[first secondary ignorable]" , 0, NULL); |
| 525 | break; |
| 526 | |
| 527 | case _CS_RESET_LAST_SECONDARY_IGNORABLE: |
| 528 | rc= tailoring_append(st, "[last secondary ignorable]" , 0, NULL); |
| 529 | break; |
| 530 | |
| 531 | case _CS_RESET_FIRST_TERTIARY_IGNORABLE: |
| 532 | rc= tailoring_append(st, "[first tertiary ignorable]" , 0, NULL); |
| 533 | break; |
| 534 | |
| 535 | case _CS_RESET_LAST_TERTIARY_IGNORABLE: |
| 536 | rc= tailoring_append(st, "[last tertiary ignorable]" , 0, NULL); |
| 537 | break; |
| 538 | |
| 539 | case _CS_RESET_FIRST_TRAILING: |
| 540 | rc= tailoring_append(st, "[first trailing]" , 0, NULL); |
| 541 | break; |
| 542 | |
| 543 | case _CS_RESET_LAST_TRAILING: |
| 544 | rc= tailoring_append(st, "[last trailing]" , 0, NULL); |
| 545 | break; |
| 546 | |
| 547 | case _CS_RESET_FIRST_VARIABLE: |
| 548 | rc= tailoring_append(st, "[first variable]" , 0, NULL); |
| 549 | break; |
| 550 | |
| 551 | case _CS_RESET_LAST_VARIABLE: |
| 552 | rc= tailoring_append(st, "[last variable]" , 0, NULL); |
| 553 | break; |
| 554 | |
| 555 | default: |
| 556 | rc=MY_XML_OK; |
| 557 | } |
| 558 | return rc; |
| 559 | } |
| 560 | |
| 561 | |
/*
  Formats for a difference rule, indexed by strength:
  primary, secondary, tertiary, quaternary, identical.
  cs_value() indexes this with (state - first state of the group).
  Each format consumes one "%.*s" pair.
*/
static const char *diff_fmt[5]=
{
  "<%.*s" ,
  "<<%.*s" ,
  "<<<%.*s" ,
  "<<<<%.*s" ,
  "=%.*s"
};
| 570 | |
| 571 | |
/*
  Same as diff_fmt[], for a rule that has a pending context: the first
  "%.*s" pair receives the context, the second the rule value, with
  '|' between them.
*/
static const char *context_diff_fmt[5]=
{
  "<%.*s|%.*s" ,
  "<<%.*s|%.*s" ,
  "<<<%.*s|%.*s" ,
  "<<<<%.*s|%.*s" ,
  "=%.*s|%.*s"
};
| 580 | |
| 581 | |
| 582 | static int cs_value(MY_XML_PARSER *st,const char *attr, size_t len) |
| 583 | { |
| 584 | struct my_cs_file_info *i= (struct my_cs_file_info *)st->user_data; |
| 585 | const struct my_cs_file_section_st *s; |
| 586 | int state= (int)((s= cs_file_sec(st->attr.start, |
| 587 | st->attr.end - st->attr.start)) ? |
| 588 | s->state : 0); |
| 589 | int rc= MY_XML_OK; |
| 590 | |
| 591 | switch (state) { |
| 592 | case _CS_MISC: |
| 593 | case _CS_FAMILY: |
| 594 | case _CS_ORDER: |
| 595 | break; |
| 596 | case _CS_ID: |
| 597 | i->cs.number= strtol(attr,(char**)NULL,10); |
| 598 | break; |
| 599 | case _CS_BINARY_ID: |
| 600 | i->cs.binary_number= strtol(attr,(char**)NULL,10); |
| 601 | break; |
| 602 | case _CS_PRIMARY_ID: |
| 603 | i->cs.primary_number= strtol(attr,(char**)NULL,10); |
| 604 | break; |
| 605 | case _CS_COLNAME: |
| 606 | i->cs.name=mstr(i->name,attr,len,MY_CS_NAME_SIZE-1); |
| 607 | break; |
| 608 | case _CS_CSNAME: |
| 609 | i->cs.csname=mstr(i->csname,attr,len,MY_CS_NAME_SIZE-1); |
| 610 | break; |
| 611 | case _CS_CSDESCRIPT: |
| 612 | i->cs.comment=mstr(i->comment,attr,len,MY_CS_CSDESCR_SIZE-1); |
| 613 | break; |
| 614 | case _CS_FLAG: |
| 615 | if (!strncmp("primary" ,attr,len)) |
| 616 | i->cs.state|= MY_CS_PRIMARY; |
| 617 | else if (!strncmp("binary" ,attr,len)) |
| 618 | i->cs.state|= MY_CS_BINSORT; |
| 619 | else if (!strncmp("compiled" ,attr,len)) |
| 620 | i->cs.state|= MY_CS_COMPILED; |
| 621 | else if (!strncmp("nopad" ,attr,len)) |
| 622 | i->cs.state|= MY_CS_NOPAD; |
| 623 | break; |
| 624 | case _CS_UPPERMAP: |
| 625 | fill_uchar(i->to_upper,MY_CS_TO_UPPER_TABLE_SIZE,attr,len); |
| 626 | i->cs.to_upper=i->to_upper; |
| 627 | break; |
| 628 | case _CS_LOWERMAP: |
| 629 | fill_uchar(i->to_lower,MY_CS_TO_LOWER_TABLE_SIZE,attr,len); |
| 630 | i->cs.to_lower=i->to_lower; |
| 631 | break; |
| 632 | case _CS_UNIMAP: |
| 633 | fill_uint16(i->tab_to_uni,MY_CS_TO_UNI_TABLE_SIZE,attr,len); |
| 634 | i->cs.tab_to_uni=i->tab_to_uni; |
| 635 | break; |
| 636 | case _CS_COLLMAP: |
| 637 | fill_uchar(i->sort_order,MY_CS_SORT_ORDER_TABLE_SIZE,attr,len); |
| 638 | i->cs.sort_order=i->sort_order; |
| 639 | break; |
| 640 | case _CS_CTYPEMAP: |
| 641 | fill_uchar(i->ctype,MY_CS_CTYPE_TABLE_SIZE,attr,len); |
| 642 | i->cs.ctype=i->ctype; |
| 643 | break; |
| 644 | |
| 645 | /* Special purpose commands */ |
| 646 | case _CS_UCA_VERSION: |
| 647 | rc= tailoring_append(st, "[version %.*s]" , len, attr); |
| 648 | break; |
| 649 | |
| 650 | case _CS_CL_RULES_IMPORT_SOURCE: |
| 651 | rc= tailoring_append(st, "[import %.*s]" , len, attr); |
| 652 | break; |
| 653 | |
| 654 | case _CS_CL_SUPPRESS_CONTRACTIONS: |
| 655 | rc= tailoring_append(st, "[suppress contractions %.*s]" , len, attr); |
| 656 | break; |
| 657 | |
| 658 | case _CS_CL_OPTIMIZE: |
| 659 | rc= tailoring_append(st, "[optimize %.*s]" , len, attr); |
| 660 | break; |
| 661 | |
| 662 | case _CS_CL_SHIFT_AFTER_METHOD: |
| 663 | rc= tailoring_append(st, "[shift-after-method %.*s]" , len, attr); |
| 664 | break; |
| 665 | |
| 666 | /* Collation Settings */ |
| 667 | case _CS_ST_STRENGTH: |
| 668 | /* 1, 2, 3, 4, 5, or primary, secondary, tertiary, quaternary, identical */ |
| 669 | rc= tailoring_append(st, "[strength %.*s]" , len, attr); |
| 670 | if (len && attr[0] >= '1' && attr[0] <= '9') |
| 671 | i->cs.levels_for_order= attr[0] - '0'; |
| 672 | break; |
| 673 | |
| 674 | case _CS_ST_ALTERNATE: |
| 675 | /* non-ignorable, shifted */ |
| 676 | rc= tailoring_append(st, "[alternate %.*s]" , len, attr); |
| 677 | break; |
| 678 | |
| 679 | case _CS_ST_BACKWARDS: |
| 680 | /* on, off, 2 */ |
| 681 | rc= tailoring_append(st, "[backwards %.*s]" , len, attr); |
| 682 | break; |
| 683 | |
| 684 | case _CS_ST_NORMALIZATION: |
| 685 | /* |
| 686 | TODO for WL#896: check collations for normalization: vi.xml |
| 687 | We want precomposed characters work well at this point. |
| 688 | */ |
| 689 | /* on, off */ |
| 690 | rc= tailoring_append(st, "[normalization %.*s]" , len, attr); |
| 691 | break; |
| 692 | |
| 693 | case _CS_ST_CASE_LEVEL: |
| 694 | /* on, off */ |
| 695 | rc= tailoring_append(st, "[caseLevel %.*s]" , len, attr); |
| 696 | break; |
| 697 | |
| 698 | case _CS_ST_CASE_FIRST: |
| 699 | /* upper, lower, off */ |
| 700 | rc= tailoring_append(st, "[caseFirst %.*s]" , len, attr); |
| 701 | break; |
| 702 | |
| 703 | case _CS_ST_HIRAGANA_QUATERNARY: |
| 704 | /* on, off */ |
| 705 | rc= tailoring_append(st, "[hiraganaQ %.*s]" , len, attr); |
| 706 | break; |
| 707 | |
| 708 | case _CS_ST_NUMERIC: |
| 709 | /* on, off */ |
| 710 | rc= tailoring_append(st, "[numeric %.*s]" , len, attr); |
| 711 | break; |
| 712 | |
| 713 | case _CS_ST_VARIABLE_TOP: |
| 714 | /* TODO for WL#896: check value format */ |
| 715 | rc= tailoring_append(st, "[variableTop %.*s]" , len, attr); |
| 716 | break; |
| 717 | |
| 718 | case _CS_ST_MATCH_BOUNDARIES: |
| 719 | /* none, whole-character, whole-word */ |
| 720 | rc= tailoring_append(st, "[match-boundaries %.*s]" , len, attr); |
| 721 | break; |
| 722 | |
| 723 | case _CS_ST_MATCH_STYLE: |
| 724 | /* minimal, medial, maximal */ |
| 725 | rc= tailoring_append(st, "[match-style %.*s]" , len, attr); |
| 726 | break; |
| 727 | |
| 728 | |
| 729 | /* Rules */ |
| 730 | case _CS_RESET: |
| 731 | rc= tailoring_append(st, "%.*s" , len, attr); |
| 732 | break; |
| 733 | |
| 734 | case _CS_DIFF1: |
| 735 | case _CS_DIFF2: |
| 736 | case _CS_DIFF3: |
| 737 | case _CS_DIFF4: |
| 738 | case _CS_IDENTICAL: |
| 739 | rc= tailoring_append(st, diff_fmt[state - _CS_DIFF1], len, attr); |
| 740 | break; |
| 741 | |
| 742 | |
| 743 | /* Rules: Expansion */ |
| 744 | case _CS_EXP_EXTEND: |
| 745 | rc= tailoring_append(st, " / %.*s" , len, attr); |
| 746 | break; |
| 747 | |
| 748 | case _CS_EXP_DIFF1: |
| 749 | case _CS_EXP_DIFF2: |
| 750 | case _CS_EXP_DIFF3: |
| 751 | case _CS_EXP_DIFF4: |
| 752 | case _CS_EXP_IDENTICAL: |
| 753 | if (i->context[0]) |
| 754 | { |
| 755 | rc= tailoring_append2(st, context_diff_fmt[state - _CS_EXP_DIFF1], |
| 756 | strlen(i->context), i->context, len, attr); |
| 757 | i->context[0]= 0; |
| 758 | } |
| 759 | else |
| 760 | rc= tailoring_append(st, diff_fmt[state - _CS_EXP_DIFF1], len, attr); |
| 761 | break; |
| 762 | |
| 763 | /* Rules: Context */ |
| 764 | case _CS_CONTEXT: |
| 765 | if (len < sizeof(i->context)) |
| 766 | { |
| 767 | memcpy(i->context, attr, len); |
| 768 | i->context[len]= '\0'; |
| 769 | } |
| 770 | break; |
| 771 | |
| 772 | /* Rules: Abbreviating Ordering Specifications */ |
| 773 | case _CS_A_DIFF1: |
| 774 | case _CS_A_DIFF2: |
| 775 | case _CS_A_DIFF3: |
| 776 | case _CS_A_DIFF4: |
| 777 | case _CS_A_IDENTICAL: |
| 778 | rc= tailoring_append_abbreviation(st, diff_fmt[state - _CS_A_DIFF1], len, attr); |
| 779 | break; |
| 780 | |
| 781 | /* Rules: Placing Characters Before Others */ |
| 782 | case _CS_RESET_BEFORE: |
| 783 | /* |
| 784 | TODO for WL#896: Add this check into text customization parser: |
| 785 | It is an error if the strength of the before relation is not identical |
| 786 | to the relation after the reset. We'll need this for WL#896. |
| 787 | */ |
| 788 | rc= tailoring_append(st, "[before %.*s]" , len, attr); |
| 789 | break; |
| 790 | |
| 791 | |
| 792 | default: |
| 793 | break; |
| 794 | } |
| 795 | |
| 796 | return rc; |
| 797 | } |
| 798 | |
| 799 | |
| 800 | my_bool |
| 801 | my_parse_charset_xml(MY_CHARSET_LOADER *loader, const char *buf, size_t len) |
| 802 | { |
| 803 | MY_XML_PARSER p; |
| 804 | struct my_cs_file_info info; |
| 805 | my_bool rc; |
| 806 | |
| 807 | my_charset_file_init(&info); |
| 808 | my_xml_parser_create(&p); |
| 809 | my_xml_set_enter_handler(&p,cs_enter); |
| 810 | my_xml_set_value_handler(&p,cs_value); |
| 811 | my_xml_set_leave_handler(&p,cs_leave); |
| 812 | info.loader= loader; |
| 813 | my_xml_set_user_data(&p, (void *) &info); |
| 814 | rc= (my_xml_parse(&p,buf,len) == MY_XML_OK) ? FALSE : TRUE; |
| 815 | my_xml_parser_free(&p); |
| 816 | my_charset_file_free(&info); |
| 817 | if (rc != MY_XML_OK) |
| 818 | { |
| 819 | const char *errstr= my_xml_error_string(&p); |
| 820 | if (sizeof(loader->error) > 32 + strlen(errstr)) |
| 821 | { |
| 822 | /* We cannot use my_snprintf() here. See previous comment. */ |
| 823 | sprintf(loader->error, "at line %d pos %d: %s" , |
| 824 | my_xml_error_lineno(&p)+1, |
| 825 | (int) my_xml_error_pos(&p), |
| 826 | my_xml_error_string(&p)); |
| 827 | } |
| 828 | } |
| 829 | return rc; |
| 830 | } |
| 831 | |
| 832 | |
| 833 | uint |
| 834 | my_string_repertoire_8bit(CHARSET_INFO *cs, const char *str, size_t length) |
| 835 | { |
| 836 | const char *strend; |
| 837 | if ((cs->state & MY_CS_NONASCII) && length > 0) |
| 838 | return MY_REPERTOIRE_UNICODE30; |
| 839 | for (strend= str + length; str < strend; str++) |
| 840 | { |
| 841 | if (((uchar) *str) > 0x7F) |
| 842 | return MY_REPERTOIRE_UNICODE30; |
| 843 | } |
| 844 | return MY_REPERTOIRE_ASCII; |
| 845 | } |
| 846 | |
| 847 | |
| 848 | static void |
| 849 | my_string_metadata_init(MY_STRING_METADATA *metadata) |
| 850 | { |
| 851 | metadata->repertoire= MY_REPERTOIRE_ASCII; |
| 852 | metadata->char_length= 0; |
| 853 | } |
| 854 | |
| 855 | |
| 856 | /** |
| 857 | This should probably eventually go as a virtual function into |
| 858 | MY_CHARSET_HANDLER or MY_COLLATION_HANDLER. |
| 859 | */ |
| 860 | static void |
| 861 | my_string_metadata_get_mb(MY_STRING_METADATA *metadata, |
| 862 | CHARSET_INFO *cs, const char *str, ulong length) |
| 863 | { |
| 864 | const char *strend= str + length; |
| 865 | for (my_string_metadata_init(metadata) ; |
| 866 | str < strend; |
| 867 | metadata->char_length++) |
| 868 | { |
| 869 | my_wc_t wc; |
| 870 | int mblen= cs->cset->mb_wc(cs, &wc, (const uchar *) str, |
| 871 | (const uchar *) strend); |
| 872 | if (mblen > 0) /* Assigned character */ |
| 873 | { |
| 874 | if (wc > 0x7F) |
| 875 | metadata->repertoire|= MY_REPERTOIRE_EXTENDED; |
| 876 | str+= mblen; |
| 877 | } |
| 878 | else if (mblen == MY_CS_ILSEQ) /* Bad byte sequence */ |
| 879 | { |
| 880 | metadata->repertoire|= MY_REPERTOIRE_EXTENDED; |
| 881 | str++; |
| 882 | } |
| 883 | else if (mblen > MY_CS_TOOSMALL) /* Unassigned character */ |
| 884 | { |
| 885 | metadata->repertoire|= MY_REPERTOIRE_EXTENDED; |
| 886 | str+= (-mblen); |
| 887 | } |
| 888 | else /* Incomplete character, premature end-of-line */ |
| 889 | { |
| 890 | metadata->repertoire|= MY_REPERTOIRE_EXTENDED; /* Just in case */ |
| 891 | break; |
| 892 | } |
| 893 | } |
| 894 | } |
| 895 | |
| 896 | |
| 897 | /** |
| 898 | Collect string metadata: length in characters and repertoire. |
| 899 | */ |
| 900 | void |
| 901 | my_string_metadata_get(MY_STRING_METADATA *metadata, |
| 902 | CHARSET_INFO *cs, const char *str, size_t length) |
| 903 | { |
| 904 | if (cs->mbmaxlen == 1 && !(cs->state & MY_CS_NONASCII)) |
| 905 | { |
| 906 | metadata->char_length= length; |
| 907 | metadata->repertoire= my_string_repertoire_8bit(cs, str, (ulong)length); |
| 908 | } |
| 909 | else |
| 910 | { |
| 911 | my_string_metadata_get_mb(metadata, cs, str, (ulong)length); |
| 912 | } |
| 913 | } |
| 914 | |
| 915 | |
| 916 | /* |
| 917 | Check repertoire: detect pure ascii strings |
| 918 | */ |
| 919 | uint |
| 920 | my_string_repertoire(CHARSET_INFO *cs, const char *str, size_t length) |
| 921 | { |
| 922 | if (cs->mbminlen == 1 && !(cs->state & MY_CS_NONASCII)) |
| 923 | { |
| 924 | return my_string_repertoire_8bit(cs, str, length); |
| 925 | } |
| 926 | else |
| 927 | { |
| 928 | const char *strend= str + length; |
| 929 | my_wc_t wc; |
| 930 | int chlen; |
| 931 | for (; |
| 932 | (chlen= cs->cset->mb_wc(cs, &wc, (uchar*) str, (uchar*) strend)) > 0; |
| 933 | str+= chlen) |
| 934 | { |
| 935 | if (wc > 0x7F) |
| 936 | return MY_REPERTOIRE_UNICODE30; |
| 937 | } |
| 938 | } |
| 939 | return MY_REPERTOIRE_ASCII; |
| 940 | } |
| 941 | |
| 942 | |
| 943 | /* |
| 944 | Returns repertoire for charset |
| 945 | */ |
| 946 | uint my_charset_repertoire(CHARSET_INFO *cs) |
| 947 | { |
| 948 | return cs->state & MY_CS_PUREASCII ? |
| 949 | MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30; |
| 950 | } |
| 951 | |
| 952 | |
| 953 | /* |
| 954 | Detect whether a character set is ASCII compatible. |
| 955 | |
| 956 | Returns TRUE for: |
| 957 | |
| 958 | - all 8bit character sets whose Unicode mapping of 0x7B is '{' |
| 959 | (ignores swe7 which maps 0x7B to "LATIN LETTER A WITH DIAERESIS") |
| 960 | |
| 961 | - all multi-byte character sets having mbminlen == 1 |
| 962 | (ignores ucs2 whose mbminlen is 2) |
| 963 | |
| 964 | TODO: |
| 965 | |
| 966 | When merging to 5.2, this function should be changed |
| 967 | to check a new flag MY_CS_NONASCII, |
| 968 | |
| 969 | return (cs->flag & MY_CS_NONASCII) ? 0 : 1; |
| 970 | |
| 971 | This flag was previously added into 5.2 under terms |
| 972 | of WL#3759 "Optimize identifier conversion in client-server protocol" |
| 973 | especially to mark character sets not compatible with ASCII. |
| 974 | |
| 975 | We won't backport this flag to 5.0 or 5.1. |
| 976 | This function is Ok for 5.0 and 5.1, because we're not going |
| 977 | to introduce new tricky character sets between 5.0 and 5.2. |
| 978 | */ |
| 979 | my_bool |
| 980 | my_charset_is_ascii_based(CHARSET_INFO *cs) |
| 981 | { |
| 982 | return |
| 983 | (cs->mbmaxlen == 1 && cs->tab_to_uni && cs->tab_to_uni['{'] == '{') || |
| 984 | (cs->mbminlen == 1 && cs->mbmaxlen > 1); |
| 985 | } |
| 986 | |
| 987 | |
| 988 | /* |
| 989 | Convert a string between two character sets. |
| 990 | 'to' must be large enough to store (form_length * to_cs->mbmaxlen) bytes. |
| 991 | |
| 992 | @param to[OUT] Store result here |
| 993 | @param to_length Size of "to" buffer |
| 994 | @param to_cs Character set of result string |
| 995 | @param from Copy from here |
| 996 | @param from_length Length of the "from" string |
| 997 | @param from_cs Character set of the "from" string |
| 998 | @param errors[OUT] Number of conversion errors |
| 999 | |
| 1000 | @return Number of bytes copied to 'to' string |
| 1001 | */ |
| 1002 | |
| 1003 | uint32 |
| 1004 | my_convert_using_func(char *to, size_t to_length, |
| 1005 | CHARSET_INFO *to_cs, my_charset_conv_wc_mb wc_mb, |
| 1006 | const char *from, size_t from_length, |
| 1007 | CHARSET_INFO *from_cs, my_charset_conv_mb_wc mb_wc, |
| 1008 | uint *errors) |
| 1009 | { |
| 1010 | int cnvres; |
| 1011 | my_wc_t wc; |
| 1012 | const uchar *from_end= (const uchar*) from + from_length; |
| 1013 | char *to_start= to; |
| 1014 | uchar *to_end= (uchar*) to + to_length; |
| 1015 | uint error_count= 0; |
| 1016 | |
| 1017 | while (1) |
| 1018 | { |
| 1019 | if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0) |
| 1020 | from+= cnvres; |
| 1021 | else if (cnvres == MY_CS_ILSEQ) |
| 1022 | { |
| 1023 | error_count++; |
| 1024 | from++; |
| 1025 | wc= '?'; |
| 1026 | } |
| 1027 | else if (cnvres > MY_CS_TOOSMALL) |
| 1028 | { |
| 1029 | /* |
| 1030 | A correct multibyte sequence detected |
| 1031 | But it doesn't have Unicode mapping. |
| 1032 | */ |
| 1033 | error_count++; |
| 1034 | from+= (-cnvres); |
| 1035 | wc= '?'; |
| 1036 | } |
| 1037 | else |
| 1038 | { |
| 1039 | if ((uchar *) from >= from_end) |
| 1040 | break; /* End of line */ |
| 1041 | /* Incomplete byte sequence */ |
| 1042 | error_count++; |
| 1043 | from++; |
| 1044 | wc= '?'; |
| 1045 | } |
| 1046 | |
| 1047 | outp: |
| 1048 | if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0) |
| 1049 | to+= cnvres; |
| 1050 | else if (cnvres == MY_CS_ILUNI && wc != '?') |
| 1051 | { |
| 1052 | error_count++; |
| 1053 | wc= '?'; |
| 1054 | goto outp; |
| 1055 | } |
| 1056 | else |
| 1057 | break; |
| 1058 | } |
| 1059 | *errors= error_count; |
| 1060 | return (uint32) (to - to_start); |
| 1061 | } |
| 1062 | |
| 1063 | |
| 1064 | /* |
| 1065 | Convert a string between two character sets. |
| 1066 | Optimized for quick copying of ASCII characters in the range 0x00..0x7F. |
| 1067 | 'to' must be large enough to store (form_length * to_cs->mbmaxlen) bytes. |
| 1068 | |
| 1069 | @param to[OUT] Store result here |
| 1070 | @param to_length Size of "to" buffer |
| 1071 | @param to_cs Character set of result string |
| 1072 | @param from Copy from here |
| 1073 | @param from_length Length of the "from" string |
| 1074 | @param from_cs Character set of the "from" string |
| 1075 | @param errors[OUT] Number of conversion errors |
| 1076 | |
| 1077 | @return Number of bytes copied to 'to' string |
| 1078 | */ |
| 1079 | |
| 1080 | uint32 |
| 1081 | my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs, |
| 1082 | const char *from, uint32 from_length, |
| 1083 | CHARSET_INFO *from_cs, uint *errors) |
| 1084 | { |
| 1085 | uint32 length, length2; |
| 1086 | /* |
| 1087 | If any of the character sets is not ASCII compatible, |
| 1088 | immediately switch to slow mb_wc->wc_mb method. |
| 1089 | */ |
| 1090 | if ((to_cs->state | from_cs->state) & MY_CS_NONASCII) |
| 1091 | return my_convert_using_func(to, to_length, |
| 1092 | to_cs, to_cs->cset->wc_mb, |
| 1093 | from, from_length, |
| 1094 | from_cs, from_cs->cset->mb_wc, |
| 1095 | errors); |
| 1096 | |
| 1097 | length= length2= MY_MIN(to_length, from_length); |
| 1098 | |
| 1099 | #if defined(__i386__) || defined(__x86_64__) |
| 1100 | /* |
| 1101 | Special loop for i386, it allows to refer to a |
| 1102 | non-aligned memory block as UINT32, which makes |
| 1103 | it possible to copy four bytes at once. This |
| 1104 | gives about 10% performance improvement comparing |
| 1105 | to byte-by-byte loop. |
| 1106 | */ |
| 1107 | for ( ; length >= 4; length-= 4, from+= 4, to+= 4) |
| 1108 | { |
| 1109 | if ((*(uint32*)from) & 0x80808080) |
| 1110 | break; |
| 1111 | *((uint32*) to)= *((const uint32*) from); |
| 1112 | } |
| 1113 | #endif /* __i386__ */ |
| 1114 | |
| 1115 | for (; ; *to++= *from++, length--) |
| 1116 | { |
| 1117 | if (!length) |
| 1118 | { |
| 1119 | *errors= 0; |
| 1120 | return length2; |
| 1121 | } |
| 1122 | if (*((unsigned char*) from) > 0x7F) /* A non-ASCII character */ |
| 1123 | { |
| 1124 | uint32 copied_length= length2 - length; |
| 1125 | to_length-= copied_length; |
| 1126 | from_length-= copied_length; |
| 1127 | return copied_length + my_convert_using_func(to, to_length, to_cs, |
| 1128 | to_cs->cset->wc_mb, |
| 1129 | from, from_length, from_cs, |
| 1130 | from_cs->cset->mb_wc, |
| 1131 | errors); |
| 1132 | } |
| 1133 | } |
| 1134 | |
| 1135 | DBUG_ASSERT(FALSE); // Should never get to here |
| 1136 | return 0; // Make compiler happy |
| 1137 | } |
| 1138 | |
| 1139 | |
| 1140 | size_t |
| 1141 | my_convert_fix(CHARSET_INFO *to_cs, char *to, size_t to_length, |
| 1142 | CHARSET_INFO *from_cs, const char *from, size_t from_length, |
| 1143 | size_t nchars, |
| 1144 | MY_STRCOPY_STATUS *copy_status, |
| 1145 | MY_STRCONV_STATUS *conv_status) |
| 1146 | { |
| 1147 | int cnvres; |
| 1148 | my_wc_t wc; |
| 1149 | my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc; |
| 1150 | my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb; |
| 1151 | const uchar *from_end= (const uchar*) from + from_length; |
| 1152 | uchar *to_end= (uchar*) to + to_length; |
| 1153 | char *to_start= to; |
| 1154 | |
| 1155 | DBUG_ASSERT(to_cs != &my_charset_bin); |
| 1156 | DBUG_ASSERT(from_cs != &my_charset_bin); |
| 1157 | |
| 1158 | copy_status->m_well_formed_error_pos= NULL; |
| 1159 | conv_status->m_cannot_convert_error_pos= NULL; |
| 1160 | |
| 1161 | for ( ; nchars; nchars--) |
| 1162 | { |
| 1163 | const char *from_prev= from; |
| 1164 | if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0) |
| 1165 | from+= cnvres; |
| 1166 | else if (cnvres == MY_CS_ILSEQ) |
| 1167 | { |
| 1168 | if (!copy_status->m_well_formed_error_pos) |
| 1169 | copy_status->m_well_formed_error_pos= from; |
| 1170 | from++; |
| 1171 | wc= '?'; |
| 1172 | } |
| 1173 | else if (cnvres > MY_CS_TOOSMALL) |
| 1174 | { |
| 1175 | /* |
| 1176 | A correct multibyte sequence detected |
| 1177 | But it doesn't have Unicode mapping. |
| 1178 | */ |
| 1179 | if (!conv_status->m_cannot_convert_error_pos) |
| 1180 | conv_status->m_cannot_convert_error_pos= from; |
| 1181 | from+= (-cnvres); |
| 1182 | wc= '?'; |
| 1183 | } |
| 1184 | else |
| 1185 | { |
| 1186 | if ((uchar *) from >= from_end) |
| 1187 | break; // End of line |
| 1188 | // Incomplete byte sequence |
| 1189 | if (!copy_status->m_well_formed_error_pos) |
| 1190 | copy_status->m_well_formed_error_pos= from; |
| 1191 | from++; |
| 1192 | wc= '?'; |
| 1193 | } |
| 1194 | outp: |
| 1195 | if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0) |
| 1196 | to+= cnvres; |
| 1197 | else if (cnvres == MY_CS_ILUNI && wc != '?') |
| 1198 | { |
| 1199 | if (!conv_status->m_cannot_convert_error_pos) |
| 1200 | conv_status->m_cannot_convert_error_pos= from_prev; |
| 1201 | wc= '?'; |
| 1202 | goto outp; |
| 1203 | } |
| 1204 | else |
| 1205 | { |
| 1206 | from= from_prev; |
| 1207 | break; |
| 1208 | } |
| 1209 | } |
| 1210 | copy_status->m_source_end_pos= from; |
| 1211 | return to - to_start; |
| 1212 | } |
| 1213 | |