1/*
2 Copyright (c) 2000, 2011, Oracle and/or its affiliates
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; version 2 of the License.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
16
17#include "mysys_priv.h"
18#include "mysys_err.h"
19#include <m_ctype.h>
20#include <m_string.h>
21#include <my_dir.h>
22#include <my_xml.h>
23
24
25/*
26 The code below implements this functionality:
27
28 - Initializing charset related structures
29 - Loading dynamic charsets
30 - Searching for a proper CHARSET_INFO
31 using charset name, collation name or collation ID
32 - Setting server default character set
33*/
34
35my_bool my_charset_same(CHARSET_INFO *cs1, CHARSET_INFO *cs2)
36{
37 return ((cs1 == cs2) || !strcmp(cs1->csname,cs2->csname));
38}
39
40
41static uint
42get_collation_number_internal(const char *name)
43{
44 CHARSET_INFO **cs;
45 for (cs= all_charsets;
46 cs < all_charsets + array_elements(all_charsets);
47 cs++)
48 {
49 if ( cs[0] && cs[0]->name &&
50 !my_strcasecmp(&my_charset_latin1, cs[0]->name, name))
51 return cs[0]->number;
52 }
53 return 0;
54}
55
56
57static my_bool is_multi_byte_ident(CHARSET_INFO *cs, uchar ch)
58{
59 int chlen= my_charlen(cs, (const char *) &ch, (const char *) &ch + 1);
60 return MY_CS_IS_TOOSMALL(chlen) ? TRUE : FALSE;
61}
62
63static my_bool init_state_maps(struct charset_info_st *cs)
64{
65 uint i;
66 uchar *state_map;
67 uchar *ident_map;
68
69 if (!(cs->state_map= state_map= (uchar*) my_once_alloc(256, MYF(MY_WME))))
70 return 1;
71
72 if (!(cs->ident_map= ident_map= (uchar*) my_once_alloc(256, MYF(MY_WME))))
73 return 1;
74
75 /* Fill state_map with states to get a faster parser */
76 for (i=0; i < 256 ; i++)
77 {
78 if (my_isalpha(cs,i))
79 state_map[i]=(uchar) MY_LEX_IDENT;
80 else if (my_isdigit(cs,i))
81 state_map[i]=(uchar) MY_LEX_NUMBER_IDENT;
82 else if (is_multi_byte_ident(cs, i))
83 state_map[i]=(uchar) MY_LEX_IDENT;
84 else if (my_isspace(cs,i))
85 state_map[i]=(uchar) MY_LEX_SKIP;
86 else
87 state_map[i]=(uchar) MY_LEX_CHAR;
88 }
89 state_map[(uchar)'_']=state_map[(uchar)'$']=(uchar) MY_LEX_IDENT;
90 state_map[(uchar)'\'']=(uchar) MY_LEX_STRING;
91 state_map[(uchar)'.']=(uchar) MY_LEX_REAL_OR_POINT;
92 state_map[(uchar)'>']=state_map[(uchar)'=']=state_map[(uchar)'!']= (uchar) MY_LEX_CMP_OP;
93 state_map[(uchar)'<']= (uchar) MY_LEX_LONG_CMP_OP;
94 state_map[(uchar)'&']=state_map[(uchar)'|']=(uchar) MY_LEX_BOOL;
95 state_map[(uchar)'#']=(uchar) MY_LEX_COMMENT;
96 state_map[(uchar)';']=(uchar) MY_LEX_SEMICOLON;
97 state_map[(uchar)':']=(uchar) MY_LEX_SET_VAR;
98 state_map[0]=(uchar) MY_LEX_EOL;
99 state_map[(uchar)'\\']= (uchar) MY_LEX_ESCAPE;
100 state_map[(uchar)'/']= (uchar) MY_LEX_LONG_COMMENT;
101 state_map[(uchar)'*']= (uchar) MY_LEX_END_LONG_COMMENT;
102 state_map[(uchar)'@']= (uchar) MY_LEX_USER_END;
103 state_map[(uchar) '`']= (uchar) MY_LEX_USER_VARIABLE_DELIMITER;
104 state_map[(uchar)'"']= (uchar) MY_LEX_STRING_OR_DELIMITER;
105 state_map[(uchar)'-']= (uchar) MY_LEX_MINUS_OR_COMMENT;
106 state_map[(uchar)',']= (uchar) MY_LEX_COMMA;
107 state_map[(uchar)'?']= (uchar) MY_LEX_PLACEHOLDER;
108
109 /*
110 Create a second map to make it faster to find identifiers
111 */
112 for (i=0; i < 256 ; i++)
113 {
114 ident_map[i]= (uchar) (state_map[i] == MY_LEX_IDENT ||
115 state_map[i] == MY_LEX_NUMBER_IDENT);
116 }
117
118 /* Special handling of hex and binary strings */
119 state_map[(uchar)'x']= state_map[(uchar)'X']= (uchar) MY_LEX_IDENT_OR_HEX;
120 state_map[(uchar)'b']= state_map[(uchar)'B']= (uchar) MY_LEX_IDENT_OR_BIN;
121 state_map[(uchar)'n']= state_map[(uchar)'N']= (uchar) MY_LEX_IDENT_OR_NCHAR;
122 return 0;
123}
124
125
126static MY_COLLATION_HANDLER *get_simple_collation_handler_by_flags(uint flags)
127{
128 return flags & MY_CS_BINSORT ?
129 (flags & MY_CS_NOPAD ?
130 &my_collation_8bit_nopad_bin_handler :
131 &my_collation_8bit_bin_handler) :
132 (flags & MY_CS_NOPAD ?
133 &my_collation_8bit_simple_nopad_ci_handler :
134 &my_collation_8bit_simple_ci_handler);
135}
136
137
138static void simple_cs_init_functions(struct charset_info_st *cs)
139{
140 cs->coll= get_simple_collation_handler_by_flags(cs->state);
141 cs->cset= &my_charset_8bit_handler;
142}
143
144
145
146static int cs_copy_data(struct charset_info_st *to, CHARSET_INFO *from)
147{
148 to->number= from->number ? from->number : to->number;
149
150 if (from->csname)
151 if (!(to->csname= my_once_strdup(from->csname,MYF(MY_WME))))
152 goto err;
153
154 if (from->name)
155 if (!(to->name= my_once_strdup(from->name,MYF(MY_WME))))
156 goto err;
157
158 if (from->comment)
159 if (!(to->comment= my_once_strdup(from->comment,MYF(MY_WME))))
160 goto err;
161
162 if (from->ctype)
163 {
164 if (!(to->ctype= (uchar*) my_once_memdup((char*) from->ctype,
165 MY_CS_CTYPE_TABLE_SIZE,
166 MYF(MY_WME))))
167 goto err;
168 if (init_state_maps(to))
169 goto err;
170 }
171 if (from->to_lower)
172 if (!(to->to_lower= (uchar*) my_once_memdup((char*) from->to_lower,
173 MY_CS_TO_LOWER_TABLE_SIZE,
174 MYF(MY_WME))))
175 goto err;
176
177 if (from->to_upper)
178 if (!(to->to_upper= (uchar*) my_once_memdup((char*) from->to_upper,
179 MY_CS_TO_UPPER_TABLE_SIZE,
180 MYF(MY_WME))))
181 goto err;
182 if (from->sort_order)
183 {
184 if (!(to->sort_order= (uchar*) my_once_memdup((char*) from->sort_order,
185 MY_CS_SORT_ORDER_TABLE_SIZE,
186 MYF(MY_WME))))
187 goto err;
188
189 }
190 if (from->tab_to_uni)
191 {
192 uint sz= MY_CS_TO_UNI_TABLE_SIZE*sizeof(uint16);
193 if (!(to->tab_to_uni= (uint16*) my_once_memdup((char*)from->tab_to_uni,
194 sz, MYF(MY_WME))))
195 goto err;
196 }
197 if (from->tailoring)
198 if (!(to->tailoring= my_once_strdup(from->tailoring,MYF(MY_WME))))
199 goto err;
200
201 return 0;
202
203err:
204 return 1;
205}
206
207
208static my_bool simple_8bit_charset_data_is_full(CHARSET_INFO *cs)
209{
210 return cs->ctype && cs->to_upper && cs->to_lower && cs->tab_to_uni;
211}
212
213
214/**
215 Inherit missing 8bit charset data from another collation.
216 Arrays pointed by refcs must be in the permanent memory already,
217 e.g. static memory, or allocated by my_once_xxx().
218*/
219static void
220inherit_charset_data(struct charset_info_st *cs, CHARSET_INFO *refcs)
221{
222 if (!cs->to_upper)
223 cs->to_upper= refcs->to_upper;
224 if (!cs->to_lower)
225 cs->to_lower= refcs->to_lower;
226 if (!cs->ctype)
227 cs->ctype= refcs->ctype;
228 if (!cs->tab_to_uni)
229 cs->tab_to_uni= refcs->tab_to_uni;
230}
231
232
233static my_bool simple_8bit_collation_data_is_full(CHARSET_INFO *cs)
234{
235 return cs->sort_order || (cs->state & MY_CS_BINSORT);
236}
237
238
239/**
240 Inherit 8bit simple collation data from another collation.
241 refcs->sort_order must be in the permanent memory already,
242 e.g. static memory, or allocated by my_once_xxx().
243*/
244static void
245inherit_collation_data(struct charset_info_st *cs, CHARSET_INFO *refcs)
246{
247 if (!simple_8bit_collation_data_is_full(cs))
248 cs->sort_order= refcs->sort_order;
249}
250
251
252static my_bool simple_cs_is_full(CHARSET_INFO *cs)
253{
254 return cs->number && cs->csname && cs->name &&
255 simple_8bit_charset_data_is_full(cs) &&
256 (simple_8bit_collation_data_is_full(cs) || cs->tailoring);
257}
258
259
/*
  copy_uca_collation() is called from add_collation() for ucs2, utf8,
  utf8mb4, utf16 and utf32, so it must be compiled whenever any of
  these character sets is built together with UCA collations.
*/
#if defined(HAVE_UCA_COLLATIONS) && \
    (defined(HAVE_CHARSET_ucs2) || defined(HAVE_CHARSET_utf8) || \
     defined(HAVE_CHARSET_utf8mb4) || defined(HAVE_CHARSET_utf16) || \
     defined(HAVE_CHARSET_utf32))
/**
  Initialize a loaded collation.
  @param [OUT] to     - The new charset_info_st structure to initialize.
  @param [IN]  from   - A template collation, to fill the missing data from.
  @param [IN]  loaded - The collation data loaded from the LDML file.
                        some data may be missing in "loaded".
*/
static void
copy_uca_collation(struct charset_info_st *to, CHARSET_INFO *from,
                   CHARSET_INFO *loaded)
{
  to->cset= from->cset;
  to->coll= from->coll;
  /*
    Single-level UCA collations have strxfrm_multiply=8.
    In case of a multi-level UCA collation we use strxfrm_multiply=4.
    That means MY_COLLATION_HANDLER::strnxfrmlen() will request the caller
    to allocate a buffer of smaller size for each level, for performance
    purposes, and to fit longer VARCHARs to @@max_sort_length.
    This makes filesort produce non-precise order for some rare Unicode
    characters that produce more than 4 weights (long expansions).
    UCA requires 2 bytes per weight multiplied by the number of levels.
    In case of a 2-level collation, each character requires 4*2=8 bytes.
    Therefore, the longest VARCHAR that fits into the default @@max_sort_length
    is 1024/8=VARCHAR(128). With strxfrm_multiply==8, only VARCHAR(64)
    would fit.
    Note, the built-in collation utf8_thai_520_w2 also uses strxfrm_multiply=4,
    for the same purpose.
    TODO: we could add a new LDML syntax to choose strxfrm_multiply value.
  */
  to->strxfrm_multiply= loaded->levels_for_order > 1 ?
                        4 : from->strxfrm_multiply;
  to->min_sort_char= from->min_sort_char;
  to->max_sort_char= from->max_sort_char;
  to->mbminlen= from->mbminlen;
  to->mbmaxlen= from->mbmaxlen;
  to->caseup_multiply= from->caseup_multiply;
  to->casedn_multiply= from->casedn_multiply;
  to->state|= MY_CS_AVAILABLE | MY_CS_LOADED |
              MY_CS_STRNXFRM  | MY_CS_UNICODE;
}
#endif
303
304
305static int add_collation(struct charset_info_st *cs)
306{
307 if (cs->name && (cs->number ||
308 (cs->number=get_collation_number_internal(cs->name))) &&
309 cs->number < array_elements(all_charsets))
310 {
311 struct charset_info_st *newcs;
312 if (!(newcs= (struct charset_info_st*) all_charsets[cs->number]))
313 {
314 if (!(all_charsets[cs->number]= newcs=
315 (struct charset_info_st*) my_once_alloc(sizeof(CHARSET_INFO),MYF(0))))
316 return MY_XML_ERROR;
317 bzero(newcs,sizeof(CHARSET_INFO));
318 }
319
320 if (cs->primary_number == cs->number)
321 cs->state |= MY_CS_PRIMARY;
322
323 if (cs->binary_number == cs->number)
324 cs->state |= MY_CS_BINSORT;
325
326 newcs->state|= cs->state;
327
328 if (!(newcs->state & MY_CS_COMPILED))
329 {
330 if (cs_copy_data(newcs,cs))
331 return MY_XML_ERROR;
332
333 newcs->caseup_multiply= newcs->casedn_multiply= 1;
334 newcs->levels_for_order= 1;
335
336 if (!strcmp(cs->csname,"ucs2") )
337 {
338#if defined(HAVE_CHARSET_ucs2) && defined(HAVE_UCA_COLLATIONS)
339 copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
340 &my_charset_ucs2_unicode_nopad_ci :
341 &my_charset_ucs2_unicode_ci,
342 cs);
343 newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
344#endif
345 }
346 else if (!strcmp(cs->csname, "utf8") || !strcmp(cs->csname, "utf8mb3"))
347 {
348#if defined (HAVE_CHARSET_utf8) && defined(HAVE_UCA_COLLATIONS)
349 copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
350 &my_charset_utf8_unicode_nopad_ci :
351 &my_charset_utf8_unicode_ci,
352 cs);
353 newcs->ctype= my_charset_utf8_unicode_ci.ctype;
354 if (init_state_maps(newcs))
355 return MY_XML_ERROR;
356#endif
357 }
358 else if (!strcmp(cs->csname, "utf8mb4"))
359 {
360#if defined (HAVE_CHARSET_utf8mb4) && defined(HAVE_UCA_COLLATIONS)
361 copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
362 &my_charset_utf8mb4_unicode_nopad_ci :
363 &my_charset_utf8mb4_unicode_ci,
364 cs);
365 newcs->ctype= my_charset_utf8mb4_unicode_ci.ctype;
366 newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED;
367#endif
368 }
369 else if (!strcmp(cs->csname, "utf16"))
370 {
371#if defined (HAVE_CHARSET_utf16) && defined(HAVE_UCA_COLLATIONS)
372 copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
373 &my_charset_utf16_unicode_nopad_ci :
374 &my_charset_utf16_unicode_ci,
375 cs);
376 newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
377#endif
378 }
379 else if (!strcmp(cs->csname, "utf32"))
380 {
381#if defined (HAVE_CHARSET_utf32) && defined(HAVE_UCA_COLLATIONS)
382 copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
383 &my_charset_utf32_unicode_nopad_ci :
384 &my_charset_utf32_unicode_ci,
385 cs);
386 newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
387#endif
388 }
389 else
390 {
391 simple_cs_init_functions(newcs);
392 newcs->mbminlen= 1;
393 newcs->mbmaxlen= 1;
394 newcs->strxfrm_multiply= 1;
395 if (simple_cs_is_full(newcs))
396 {
397 newcs->state |= MY_CS_LOADED;
398 }
399 newcs->state|= MY_CS_AVAILABLE;
400 }
401 }
402 else
403 {
404 /*
405 We need the below to make get_charset_name()
406 and get_charset_number() working even if a
407 character set has not been really incompiled.
408 The above functions are used for example
409 in error message compiler extra/comp_err.c.
410 If a character set was compiled, this information
411 will get lost and overwritten in add_compiled_collation().
412 */
413 newcs->number= cs->number;
414 if (cs->comment)
415 if (!(newcs->comment= my_once_strdup(cs->comment,MYF(MY_WME))))
416 return MY_XML_ERROR;
417 if (cs->csname)
418 if (!(newcs->csname= my_once_strdup(cs->csname,MYF(MY_WME))))
419 return MY_XML_ERROR;
420 if (cs->name)
421 if (!(newcs->name= my_once_strdup(cs->name,MYF(MY_WME))))
422 return MY_XML_ERROR;
423 }
424 cs->number= 0;
425 cs->primary_number= 0;
426 cs->binary_number= 0;
427 cs->name= NULL;
428 cs->state= 0;
429 cs->sort_order= NULL;
430 cs->tailoring= NULL;
431 }
432 return MY_XML_OK;
433}
434
435
436/**
437 Report character set initialization errors and warnings.
438 Be silent by default: no warnings on the client side.
439*/
440static void
441default_reporter(enum loglevel level __attribute__ ((unused)),
442 const char *format __attribute__ ((unused)),
443 ...)
444{
445}
446my_error_reporter my_charset_error_reporter= default_reporter;
447
448
/**
  Wrappers for memory functions my_malloc (and friends)
  with C-compatible API without extra "myf" argument.
*/
453static void *
454my_once_alloc_c(size_t size)
455{ return my_once_alloc(size, MYF(MY_WME)); }
456
457
458static void *
459my_malloc_c(size_t size)
460{ return my_malloc(size, MYF(MY_WME)); }
461
462
463static void *
464my_realloc_c(void *old, size_t size)
465{ return my_realloc(old, size, MYF(MY_WME|MY_ALLOW_ZERO_PTR)); }
466
467
468/**
469 Initialize character set loader to use mysys memory management functions.
470 @param loader Loader to initialize
471*/
472void
473my_charset_loader_init_mysys(MY_CHARSET_LOADER *loader)
474{
475 loader->error[0]= '\0';
476 loader->once_alloc= my_once_alloc_c;
477 loader->malloc= my_malloc_c;
478 loader->realloc= my_realloc_c;
479 loader->free= my_free;
480 loader->reporter= my_charset_error_reporter;
481 loader->add_collation= add_collation;
482}
483
484
/*
  Largest charset definition file, in bytes, that is accepted.
  Parenthesized so that the macro expands safely inside any expression.
*/
#define MY_MAX_ALLOWED_BUF (1024*1024)
/* Name of the index file looked up in the charsets directory */
#define MY_CHARSET_INDEX "Index.xml"

/* Charsets directory; when NULL, get_charsets_dir() builds a default path */
const char *charsets_dir= NULL;
489
490
491static my_bool
492my_read_charset_file(MY_CHARSET_LOADER *loader,
493 const char *filename,
494 myf myflags)
495{
496 uchar *buf;
497 int fd;
498 size_t len, tmp_len;
499 MY_STAT stat_info;
500
501 if (!my_stat(filename, &stat_info, MYF(myflags)) ||
502 ((len= (uint)stat_info.st_size) > MY_MAX_ALLOWED_BUF) ||
503 !(buf= (uchar*) my_malloc(len,myflags)))
504 return TRUE;
505
506 if ((fd= mysql_file_open(key_file_charset, filename, O_RDONLY, myflags)) < 0)
507 goto error;
508 tmp_len= mysql_file_read(fd, buf, len, myflags);
509 mysql_file_close(fd, myflags);
510 if (tmp_len != len)
511 goto error;
512
513 if (my_parse_charset_xml(loader, (char *) buf, len))
514 {
515 my_printf_error(EE_UNKNOWN_CHARSET, "Error while parsing '%s': %s\n",
516 MYF(0), filename, loader->error);
517 goto error;
518 }
519
520 my_free(buf);
521 return FALSE;
522
523error:
524 my_free(buf);
525 return TRUE;
526}
527
528
529char *get_charsets_dir(char *buf)
530{
531 const char *sharedir= SHAREDIR;
532 char *res;
533 DBUG_ENTER("get_charsets_dir");
534
535 if (charsets_dir != NULL)
536 strmake(buf, charsets_dir, FN_REFLEN-1);
537 else
538 {
539 if (test_if_hard_path(sharedir) ||
540 is_prefix(sharedir, DEFAULT_CHARSET_HOME))
541 strxmov(buf, sharedir, "/", CHARSET_DIR, NullS);
542 else
543 strxmov(buf, DEFAULT_CHARSET_HOME, "/", sharedir, "/", CHARSET_DIR,
544 NullS);
545 }
546 res= convert_dirname(buf,buf,NullS);
547 DBUG_PRINT("info",("charsets dir: '%s'", buf));
548 DBUG_RETURN(res);
549}
550
551CHARSET_INFO *all_charsets[MY_ALL_CHARSETS_SIZE]={NULL};
552CHARSET_INFO *default_charset_info = &my_charset_latin1;
553
554void add_compiled_collation(struct charset_info_st *cs)
555{
556 DBUG_ASSERT(cs->number < array_elements(all_charsets));
557 all_charsets[cs->number]= cs;
558 cs->state|= MY_CS_AVAILABLE;
559}
560
561
562static my_pthread_once_t charsets_initialized= MY_PTHREAD_ONCE_INIT;
563static my_pthread_once_t charsets_template= MY_PTHREAD_ONCE_INIT;
564
565typedef struct
566{
567 ulonglong use_count;
568} MY_COLLATION_STATISTICS;
569
570
571static MY_COLLATION_STATISTICS my_collation_statistics[MY_ALL_CHARSETS_SIZE];
572
573
574my_bool my_collation_is_known_id(uint id)
575{
576 return id > 0 && id < array_elements(all_charsets) && all_charsets[id] ?
577 TRUE : FALSE;
578}
579
580
581/*
582 Collation use statistics functions do not lock
583 counters to avoid mutex contention. This can lose
584 some counter increments with high thread concurrency.
585 But this should be Ok, as we don't need exact numbers.
586*/
587static inline void my_collation_statistics_inc_use_count(uint id)
588{
589 DBUG_ASSERT(my_collation_is_known_id(id));
590 my_collation_statistics[id].use_count++;
591}
592
593
594ulonglong my_collation_statistics_get_use_count(uint id)
595{
596 DBUG_ASSERT(my_collation_is_known_id(id));
597 return my_collation_statistics[id].use_count;
598}
599
600
601const char *my_collation_get_tailoring(uint id)
602{
603 /* all_charsets[id]->tailoring is never changed after server startup. */
604 DBUG_ASSERT(my_collation_is_known_id(id));
605 return all_charsets[id]->tailoring;
606}
607
608
609static void init_available_charsets(void)
610{
611 char fname[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
612 struct charset_info_st **cs;
613 MY_CHARSET_LOADER loader;
614
615 bzero((char*) &all_charsets,sizeof(all_charsets));
616 bzero((char*) &my_collation_statistics, sizeof(my_collation_statistics));
617 init_compiled_charsets(MYF(0));
618
619 /* Copy compiled charsets */
620 for (cs= (struct charset_info_st**) all_charsets;
621 cs < (struct charset_info_st**) all_charsets +
622 array_elements(all_charsets)-1 ;
623 cs++)
624 {
625 if (*cs)
626 {
627 DBUG_ASSERT(cs[0]->mbmaxlen <= MY_CS_MBMAXLEN);
628 if (cs[0]->ctype)
629 if (init_state_maps(*cs))
630 *cs= NULL;
631 }
632 }
633
634 my_charset_loader_init_mysys(&loader);
635 strmov(get_charsets_dir(fname), MY_CHARSET_INDEX);
636 my_read_charset_file(&loader, fname, MYF(0));
637}
638
639
640void free_charsets(void)
641{
642 charsets_initialized= charsets_template;
643}
644
645
/**
  Translate an alias collation name to the name used in all_charsets.

  Only names starting (case insensitively) with "utf8mb3_" are aliases;
  they map to the same name with the prefix "utf8_".

  @param name     Collation name.
  @param buf      Buffer for the translated name.
  @param bufsize  Size of buf.

  @return buf holding the translated name, or NULL if name is not an alias.
*/
static const char*
get_collation_name_alias(const char *name, char *buf, size_t bufsize)
{
  if (strncasecmp(name, "utf8mb3_", 8) != 0)
    return NULL;
  my_snprintf(buf, bufsize, "utf8_%s", name + 8);
  return buf;
}
656
657
658uint get_collation_number(const char *name)
659{
660 uint id;
661 char alias[64];
662 my_pthread_once(&charsets_initialized, init_available_charsets);
663 if ((id= get_collation_number_internal(name)))
664 return id;
665 if ((name= get_collation_name_alias(name, alias, sizeof(alias))))
666 return get_collation_number_internal(name);
667 return 0;
668}
669
670
671static uint
672get_charset_number_internal(const char *charset_name, uint cs_flags)
673{
674 CHARSET_INFO **cs;
675
676 for (cs= all_charsets;
677 cs < all_charsets + array_elements(all_charsets);
678 cs++)
679 {
680 if ( cs[0] && cs[0]->csname && (cs[0]->state & cs_flags) &&
681 !my_strcasecmp(&my_charset_latin1, cs[0]->csname, charset_name))
682 return cs[0]->number;
683 }
684 return 0;
685}
686
687
688static const char*
689get_charset_name_alias(const char *name)
690{
691 if (!my_strcasecmp(&my_charset_latin1, name, "utf8mb3"))
692 return "utf8";
693 return NULL;
694}
695
696
697uint get_charset_number(const char *charset_name, uint cs_flags)
698{
699 uint id;
700 my_pthread_once(&charsets_initialized, init_available_charsets);
701 if ((id= get_charset_number_internal(charset_name, cs_flags)))
702 return id;
703 if ((charset_name= get_charset_name_alias(charset_name)))
704 return get_charset_number_internal(charset_name, cs_flags);
705 return 0;
706}
707
708
709const char *get_charset_name(uint charset_number)
710{
711 my_pthread_once(&charsets_initialized, init_available_charsets);
712
713 if (charset_number < array_elements(all_charsets))
714 {
715 CHARSET_INFO *cs= all_charsets[charset_number];
716
717 if (cs && (cs->number == charset_number) && cs->name)
718 return (char*) cs->name;
719 }
720
721 return "?"; /* this mimics find_type() */
722}
723
724
725static CHARSET_INFO *inheritance_source_by_id(CHARSET_INFO *cs, uint refid)
726{
727 CHARSET_INFO *refcs;
728 return refid && refid != cs->number &&
729 (refcs= all_charsets[refid]) &&
730 (refcs->state & MY_CS_AVAILABLE) ? refcs : NULL;
731}
732
733
734static CHARSET_INFO *find_collation_data_inheritance_source(CHARSET_INFO *cs)
735{
736 const char *beg, *end;
737 if (cs->tailoring &&
738 !strncmp(cs->tailoring, "[import ", 8) &&
739 (end= strchr(cs->tailoring + 8, ']')) &&
740 (beg= cs->tailoring + 8) + MY_CS_NAME_SIZE > end)
741 {
742 char name[MY_CS_NAME_SIZE + 1];
743 memcpy(name, beg, end - beg);
744 name[end - beg]= '\0';
745 return inheritance_source_by_id(cs, get_collation_number(name));
746 }
747 return NULL;
748}
749
750
751static CHARSET_INFO *find_charset_data_inheritance_source(CHARSET_INFO *cs)
752{
753 uint refid= get_charset_number_internal(cs->csname, MY_CS_PRIMARY);
754 return inheritance_source_by_id(cs, refid);
755}
756
757
758static CHARSET_INFO *
759get_internal_charset(MY_CHARSET_LOADER *loader, uint cs_number, myf flags)
760{
761 char buf[FN_REFLEN];
762 struct charset_info_st *cs;
763
764 DBUG_ASSERT(cs_number < array_elements(all_charsets));
765
766 if ((cs= (struct charset_info_st*) all_charsets[cs_number]))
767 {
768 if (cs->state & MY_CS_READY) /* if CS is already initialized */
769 {
770 my_collation_statistics_inc_use_count(cs_number);
771 return cs;
772 }
773
774 /*
775 To make things thread safe we are not allowing other threads to interfere
776 while we may changing the cs_info_table
777 */
778 mysql_mutex_lock(&THR_LOCK_charset);
779
780 if (!(cs->state & (MY_CS_COMPILED|MY_CS_LOADED))) /* if CS is not in memory */
781 {
782 MY_CHARSET_LOADER loader;
783 strxmov(get_charsets_dir(buf), cs->csname, ".xml", NullS);
784 my_charset_loader_init_mysys(&loader);
785 my_read_charset_file(&loader, buf, flags);
786 }
787
788 if (cs->state & MY_CS_AVAILABLE)
789 {
790 if (!(cs->state & MY_CS_READY))
791 {
792 if (!simple_8bit_charset_data_is_full(cs))
793 {
794 CHARSET_INFO *refcs= find_charset_data_inheritance_source(cs);
795 if (refcs)
796 inherit_charset_data(cs, refcs);
797 }
798 if (!simple_8bit_collation_data_is_full(cs))
799 {
800 CHARSET_INFO *refcl= find_collation_data_inheritance_source(cs);
801 if (refcl)
802 inherit_collation_data(cs, refcl);
803 }
804
805 if ((cs->cset->init && cs->cset->init(cs, loader)) ||
806 (cs->coll->init && cs->coll->init(cs, loader)))
807 {
808 cs= NULL;
809 }
810 else
811 cs->state|= MY_CS_READY;
812 }
813 my_collation_statistics_inc_use_count(cs_number);
814 }
815 else
816 cs= NULL;
817
818 mysql_mutex_unlock(&THR_LOCK_charset);
819 }
820 return cs;
821}
822
823
824CHARSET_INFO *get_charset(uint cs_number, myf flags)
825{
826 CHARSET_INFO *cs= NULL;
827
828 if (cs_number == default_charset_info->number)
829 return default_charset_info;
830
831 my_pthread_once(&charsets_initialized, init_available_charsets);
832
833 if (cs_number < array_elements(all_charsets))
834 {
835 MY_CHARSET_LOADER loader;
836 my_charset_loader_init_mysys(&loader);
837 cs= get_internal_charset(&loader, cs_number, flags);
838 }
839
840 if (!cs && (flags & MY_WME))
841 {
842 char index_file[FN_REFLEN + sizeof(MY_CHARSET_INDEX)], cs_string[23];
843 strmov(get_charsets_dir(index_file),MY_CHARSET_INDEX);
844 cs_string[0]='#';
845 int10_to_str(cs_number, cs_string+1, 10);
846 my_error(EE_UNKNOWN_CHARSET, MYF(ME_BELL), cs_string, index_file);
847 }
848 return cs;
849}
850
851
852/**
853 Find collation by name: extended version of get_charset_by_name()
854 to return error messages to the caller.
855 @param loader Character set loader
856 @param name Collation name
857 @param flags Flags
858 @return NULL on error, pointer to collation on success
859*/
860
861CHARSET_INFO *
862my_collation_get_by_name(MY_CHARSET_LOADER *loader,
863 const char *name, myf flags)
864{
865 uint cs_number;
866 CHARSET_INFO *cs;
867 my_pthread_once(&charsets_initialized, init_available_charsets);
868
869 cs_number= get_collation_number(name);
870 my_charset_loader_init_mysys(loader);
871 cs= cs_number ? get_internal_charset(loader, cs_number, flags) : NULL;
872
873 if (!cs && (flags & MY_WME))
874 {
875 char index_file[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
876 strmov(get_charsets_dir(index_file),MY_CHARSET_INDEX);
877 my_error(EE_UNKNOWN_COLLATION, MYF(ME_BELL), name, index_file);
878 }
879 return cs;
880}
881
882
883CHARSET_INFO *get_charset_by_name(const char *cs_name, myf flags)
884{
885 MY_CHARSET_LOADER loader;
886 my_charset_loader_init_mysys(&loader);
887 return my_collation_get_by_name(&loader, cs_name, flags);
888}
889
890
891/**
892 Find character set by name: extended version of get_charset_by_csname()
893 to return error messages to the caller.
894 @param loader Character set loader
895 @param name Collation name
896 @param cs_flags Character set flags (e.g. default or binary collation)
897 @param flags Flags
898 @return NULL on error, pointer to collation on success
899*/
900CHARSET_INFO *
901my_charset_get_by_name(MY_CHARSET_LOADER *loader,
902 const char *cs_name, uint cs_flags, myf flags)
903{
904 uint cs_number;
905 CHARSET_INFO *cs;
906 DBUG_ENTER("get_charset_by_csname");
907 DBUG_PRINT("enter",("name: '%s'", cs_name));
908
909 my_pthread_once(&charsets_initialized, init_available_charsets);
910
911 cs_number= get_charset_number(cs_name, cs_flags);
912 cs= cs_number ? get_internal_charset(loader, cs_number, flags) : NULL;
913
914 if (!cs && (flags & MY_WME))
915 {
916 char index_file[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
917 strmov(get_charsets_dir(index_file),MY_CHARSET_INDEX);
918 my_error(EE_UNKNOWN_CHARSET, MYF(ME_BELL), cs_name, index_file);
919 }
920
921 DBUG_RETURN(cs);
922}
923
924
925CHARSET_INFO *
926get_charset_by_csname(const char *cs_name, uint cs_flags, myf flags)
927{
928 MY_CHARSET_LOADER loader;
929 my_charset_loader_init_mysys(&loader);
930 return my_charset_get_by_name(&loader, cs_name, cs_flags, flags);
931}
932
933
934/**
935 Resolve character set by the character set name (utf8, latin1, ...).
936
937 The function tries to resolve character set by the specified name. If
938 there is character set with the given name, it is assigned to the "cs"
939 parameter and FALSE is returned. If there is no such character set,
940 "default_cs" is assigned to the "cs" and TRUE is returned.
941
942 @param[in] cs_name Character set name.
943 @param[in] default_cs Default character set.
944 @param[out] cs Variable to store character set.
945
946 @return FALSE if character set was resolved successfully; TRUE if there
947 is no character set with given name.
948*/
949
950my_bool resolve_charset(const char *cs_name,
951 CHARSET_INFO *default_cs,
952 CHARSET_INFO **cs)
953{
954 *cs= get_charset_by_csname(cs_name, MY_CS_PRIMARY, MYF(0));
955
956 if (*cs == NULL)
957 {
958 *cs= default_cs;
959 return TRUE;
960 }
961
962 return FALSE;
963}
964
965
966/**
967 Resolve collation by the collation name (utf8_general_ci, ...).
968
969 The function tries to resolve collation by the specified name. If there
970 is collation with the given name, it is assigned to the "cl" parameter
971 and FALSE is returned. If there is no such collation, "default_cl" is
972 assigned to the "cl" and TRUE is returned.
973
974 @param[out] cl Variable to store collation.
975 @param[in] cl_name Collation name.
976 @param[in] default_cl Default collation.
977
978 @return FALSE if collation was resolved successfully; TRUE if there is no
979 collation with given name.
980*/
981
982my_bool resolve_collation(const char *cl_name,
983 CHARSET_INFO *default_cl,
984 CHARSET_INFO **cl)
985{
986 *cl= get_charset_by_name(cl_name, MYF(0));
987
988 if (*cl == NULL)
989 {
990 *cl= default_cl;
991 return TRUE;
992 }
993
994 return FALSE;
995}
996
997
998/*
999 Escape string with backslashes (\)
1000
1001 SYNOPSIS
1002 escape_string_for_mysql()
1003 charset_info Charset of the strings
1004 to Buffer for escaped string
1005 to_length Length of destination buffer, or 0
1006 from The string to escape
1007 length The length of the string to escape
1008
1009 DESCRIPTION
1010 This escapes the contents of a string by adding backslashes before special
1011 characters, and turning others into specific escape sequences, such as
1012 turning newlines into \n and null bytes into \0.
1013
1014 NOTE
1015 To maintain compatibility with the old C API, to_length may be 0 to mean
1016 "big enough"
1017
1018 RETURN VALUES
1019 (size_t) -1 The escaped string did not fit in the to buffer
1020 # The length of the escaped string
1021*/
1022
1023size_t escape_string_for_mysql(CHARSET_INFO *charset_info,
1024 char *to, size_t to_length,
1025 const char *from, size_t length)
1026{
1027 const char *to_start= to;
1028 const char *end, *to_end=to_start + (to_length ? to_length-1 : 2*length);
1029 my_bool overflow= FALSE;
1030 for (end= from + length; from < end; from++)
1031 {
1032 char escape= 0;
1033#ifdef USE_MB
1034 int tmp_length= use_mb(charset_info) ? my_charlen(charset_info, from, end) :
1035 1;
1036 if (tmp_length > 1)
1037 {
1038 if (to + tmp_length > to_end)
1039 {
1040 overflow= TRUE;
1041 break;
1042 }
1043 while (tmp_length--)
1044 *to++= *from++;
1045 from--;
1046 continue;
1047 }
1048 /*
1049 If the next character appears to begin a multi-byte character, we
1050 escape that first byte of that apparent multi-byte character. (The
1051 character just looks like a multi-byte character -- if it were actually
1052 a multi-byte character, it would have been passed through in the test
1053 above.)
1054
1055 Without this check, we can create a problem by converting an invalid
1056 multi-byte character into a valid one. For example, 0xbf27 is not
1057 a valid GBK character, but 0xbf5c is. (0x27 = ', 0x5c = \)
1058 */
1059 if (tmp_length < 1) /* Bad byte sequence */
1060 escape= *from;
1061 else
1062#endif
1063 switch (*from) {
1064 case 0: /* Must be escaped for 'mysql' */
1065 escape= '0';
1066 break;
1067 case '\n': /* Must be escaped for logs */
1068 escape= 'n';
1069 break;
1070 case '\r':
1071 escape= 'r';
1072 break;
1073 case '\\':
1074 escape= '\\';
1075 break;
1076 case '\'':
1077 escape= '\'';
1078 break;
1079 case '"': /* Better safe than sorry */
1080 escape= '"';
1081 break;
1082 case '\032': /* This gives problems on Win32 */
1083 escape= 'Z';
1084 break;
1085 }
1086 if (escape)
1087 {
1088 if (to + 2 > to_end)
1089 {
1090 overflow= TRUE;
1091 break;
1092 }
1093 *to++= '\\';
1094 *to++= escape;
1095 }
1096 else
1097 {
1098 if (to + 1 > to_end)
1099 {
1100 overflow= TRUE;
1101 break;
1102 }
1103 *to++= *from;
1104 }
1105 }
1106 *to= 0;
1107 return overflow ? (size_t) -1 : (size_t) (to - to_start);
1108}
1109
1110
#ifdef BACKSLASH_MBTAIL
/* Result of fs_character_set(), computed on the first call */
static CHARSET_INFO *fs_cset_cache= NULL;

/**
  Return the character set chosen from the system default ANSI code
  page, as reported by GetLocaleInfo().

  @return my_charset_cp932_japanese_ci when the code page is 932 and
          cp932 is compiled in, my_charset_bin otherwise.
*/
CHARSET_INFO *fs_character_set()
{
  if (!fs_cset_cache)
  {
    CHARSET_INFO *detected= &my_charset_bin;
    char buf[10]= "cp";

    /* Append the code page number to "cp", keeping a trailing NUL */
    GetLocaleInfo(LOCALE_SYSTEM_DEFAULT, LOCALE_IDEFAULTANSICODEPAGE,
                  buf+2, sizeof(buf)-3);
    /*
      We cannot call get_charset_by_name here
      because fs_character_set() is executed before
      the initialization of the charset mutex
      (THR_LOCK_charset), which is used inside
      get_charset_by_name.
      As we're now interested in cp932 only,
      let's just detect it using strcmp().
    */
#ifdef HAVE_CHARSET_cp932
    if (!strcmp(buf, "cp932"))
      detected= &my_charset_cp932_japanese_ci;
#endif
    fs_cset_cache= detected;
  }
  return fs_cset_cache;
}
#endif
1138
1139/*
1140 Escape apostrophes by doubling them up
1141
1142 SYNOPSIS
1143 escape_quotes_for_mysql()
1144 charset_info Charset of the strings
1145 to Buffer for escaped string
1146 to_length Length of destination buffer, or 0
1147 from The string to escape
1148 length The length of the string to escape
1149
1150 DESCRIPTION
1151 This escapes the contents of a string by doubling up any apostrophes that
1152 it contains. This is used when the NO_BACKSLASH_ESCAPES SQL_MODE is in
1153 effect on the server.
1154
1155 NOTE
1156 To be consistent with escape_string_for_mysql(), to_length may be 0 to
1157 mean "big enough"
1158
1159 RETURN VALUES
1160 ~0 The escaped string did not fit in the to buffer
1161 >=0 The length of the escaped string
1162*/
1163
1164size_t escape_quotes_for_mysql(CHARSET_INFO *charset_info,
1165 char *to, size_t to_length,
1166 const char *from, size_t length)
1167{
1168 const char *to_start= to;
1169 const char *end, *to_end=to_start + (to_length ? to_length-1 : 2*length);
1170 my_bool overflow= FALSE;
1171#ifdef USE_MB
1172 my_bool use_mb_flag= use_mb(charset_info);
1173#endif
1174 for (end= from + length; from < end; from++)
1175 {
1176#ifdef USE_MB
1177 int tmp_length;
1178 if (use_mb_flag && (tmp_length= my_ismbchar(charset_info, from, end)))
1179 {
1180 if (to + tmp_length > to_end)
1181 {
1182 overflow= TRUE;
1183 break;
1184 }
1185 while (tmp_length--)
1186 *to++= *from++;
1187 from--;
1188 continue;
1189 }
1190 /*
1191 We don't have the same issue here with a non-multi-byte character being
1192 turned into a multi-byte character by the addition of an escaping
1193 character, because we are only escaping the ' character with itself.
1194 */
1195#endif
1196 if (*from == '\'')
1197 {
1198 if (to + 2 > to_end)
1199 {
1200 overflow= TRUE;
1201 break;
1202 }
1203 *to++= '\'';
1204 *to++= '\'';
1205 }
1206 else
1207 {
1208 if (to + 1 > to_end)
1209 {
1210 overflow= TRUE;
1211 break;
1212 }
1213 *to++= *from;
1214 }
1215 }
1216 *to= 0;
1217 return overflow ? (ulong)~0 : (ulong) (to - to_start);
1218}
1219