charset.c source code [MariaDB/mysys/charset.c]

1	/*
2	Copyright (c) 2000, 2011, Oracle and/or its affiliates
3
4	This program is free software; you can redistribute it and/or modify
5	it under the terms of the GNU General Public License as published by
6	the Free Software Foundation; version 2 of the License.
7
8	This program is distributed in the hope that it will be useful,
9	but WITHOUT ANY WARRANTY; without even the implied warranty of
10	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11	GNU General Public License for more details.
12
13	You should have received a copy of the GNU General Public License
14	along with this program; if not, write to the Free Software
15	Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
16
17	#include "mysys_priv.h"
18	#include "mysys_err.h"
19	#include <m_ctype.h>
20	#include <m_string.h>
21	#include <my_dir.h>
22	#include <my_xml.h>
23
24
25	/*
26	The code below implements this functionality:
27
28	- Initializing charset related structures
29	- Loading dynamic charsets
30	- Searching for a proper CHARSET_INFO
31	using charset name, collation name or collation ID
32	- Setting server default character set
33	*/
34
35	my_bool my_charset_same(CHARSET_INFO cs1, CHARSET_INFO cs2)
36	{
37	return ((cs1 == cs2) \|\| !strcmp(cs1->csname,cs2->csname));
38	}
39
40
41	static uint
42	get_collation_number_internal(const char *name)
43	{
44	CHARSET_INFO **cs;
45	for (cs= all_charsets;
46	cs < all_charsets + array_elements(all_charsets);
47	cs++)
48	{
49	if ( cs[`0`] && cs[`0`]->name &&
50	!my_strcasecmp(&my_charset_latin1, cs[`0`]->name, name))
51	return cs[`0`]->number;
52	}
53	return `0`;
54	}
55
56
57	static my_bool is_multi_byte_ident(CHARSET_INFO *cs, uchar ch)
58	{
59	int chlen= my_charlen(cs, (const char ) &ch, (const* char *) &ch + `1`);
60	return MY_CS_IS_TOOSMALL(chlen) ? TRUE : FALSE;
61	}
62
63	static my_bool init_state_maps(struct charset_info_st *cs)
64	{
65	uint i;
66	uchar *state_map;
67	uchar *ident_map;
68
69	if (!(cs->state_map= state_map= (uchar*) my_once_alloc(`256`, MYF(MY_WME))))
70	return `1`;
71
72	if (!(cs->ident_map= ident_map= (uchar*) my_once_alloc(`256`, MYF(MY_WME))))
73	return `1`;
74
75	/ Fill state_map with states to get a faster parser /
76	for (i=`0`; i < `256` ; i++)
77	{
78	if (my_isalpha(cs,i))
79	state_map[i]=(uchar) MY_LEX_IDENT;
80	else if (my_isdigit(cs,i))
81	state_map[i]=(uchar) MY_LEX_NUMBER_IDENT;
82	else if (is_multi_byte_ident(cs, i))
83	state_map[i]=(uchar) MY_LEX_IDENT;
84	else if (my_isspace(cs,i))
85	state_map[i]=(uchar) MY_LEX_SKIP;
86	else
87	state_map[i]=(uchar) MY_LEX_CHAR;
88	}
89	state_map[(uchar)`'_'`]=state_map[(uchar)`'$'`]=(uchar) MY_LEX_IDENT;
90	state_map[(uchar)`'\''`]=(uchar) MY_LEX_STRING;
91	state_map[(uchar)`'.'`]=(uchar) MY_LEX_REAL_OR_POINT;
92	state_map[(uchar)`'>'`]=state_map[(uchar)`'='`]=state_map[(uchar)`'!'`]= (uchar) MY_LEX_CMP_OP;
93	state_map[(uchar)`'<'`]= (uchar) MY_LEX_LONG_CMP_OP;
94	state_map[(uchar)`'&'`]=state_map[(uchar)`'\|'`]=(uchar) MY_LEX_BOOL;
95	state_map[(uchar)`'#'`]=(uchar) MY_LEX_COMMENT;
96	state_map[(uchar)`';'`]=(uchar) MY_LEX_SEMICOLON;
97	state_map[(uchar)`':'`]=(uchar) MY_LEX_SET_VAR;
98	state_map[`0`]=(uchar) MY_LEX_EOL;
99	state_map[(uchar)`'\\'`]= (uchar) MY_LEX_ESCAPE;
100	state_map[(uchar)`'/'`]= (uchar) MY_LEX_LONG_COMMENT;
101	state_map[(uchar)`'*'`]= (uchar) MY_LEX_END_LONG_COMMENT;
102	state_map[(uchar)`'@'`]= (uchar) MY_LEX_USER_END;
103	state_map[(uchar) '`']= (uchar) MY_LEX_USER_VARIABLE_DELIMITER;
104	state_map[(uchar)`'"'`]= (uchar) MY_LEX_STRING_OR_DELIMITER;
105	state_map[(uchar)`'-'`]= (uchar) MY_LEX_MINUS_OR_COMMENT;
106	state_map[(uchar)`','`]= (uchar) MY_LEX_COMMA;
107	state_map[(uchar)`'?'`]= (uchar) MY_LEX_PLACEHOLDER;
108
109	/*
110	Create a second map to make it faster to find identifiers
111	*/
112	for (i=`0`; i < `256` ; i++)
113	{
114	ident_map[i]= (uchar) (state_map[i] == MY_LEX_IDENT \|\|
115	state_map[i] == MY_LEX_NUMBER_IDENT);
116	}
117
118	/ Special handling of hex and binary strings /
119	state_map[(uchar)`'x'`]= state_map[(uchar)`'X'`]= (uchar) MY_LEX_IDENT_OR_HEX;
120	state_map[(uchar)`'b'`]= state_map[(uchar)`'B'`]= (uchar) MY_LEX_IDENT_OR_BIN;
121	state_map[(uchar)`'n'`]= state_map[(uchar)`'N'`]= (uchar) MY_LEX_IDENT_OR_NCHAR;
122	return `0`;
123	}
124
125
126	static MY_COLLATION_HANDLER *get_simple_collation_handler_by_flags(uint flags)
127	{
128	return flags & MY_CS_BINSORT ?
129	(flags & MY_CS_NOPAD ?
130	&my_collation_8bit_nopad_bin_handler :
131	&my_collation_8bit_bin_handler) :
132	(flags & MY_CS_NOPAD ?
133	&my_collation_8bit_simple_nopad_ci_handler :
134	&my_collation_8bit_simple_ci_handler);
135	}
136
137
138	static void simple_cs_init_functions(struct charset_info_st *cs)
139	{
140	cs->coll= get_simple_collation_handler_by_flags(cs->state);
141	cs->cset= &my_charset_8bit_handler;
142	}
143
144
145
146	static int cs_copy_data(struct charset_info_st to, CHARSET_INFO from)
147	{
148	to->number= from->number ? from->number : to->number;
149
150	if (from->csname)
151	if (!(to->csname= my_once_strdup(from->csname,MYF(MY_WME))))
152	goto err;
153
154	if (from->name)
155	if (!(to->name= my_once_strdup(from->name,MYF(MY_WME))))
156	goto err;
157
158	if (from->comment)
159	if (!(to->comment= my_once_strdup(from->comment,MYF(MY_WME))))
160	goto err;
161
162	if (from->ctype)
163	{
164	if (!(to->ctype= (uchar) my_once_memdup((char**) from->ctype,
165	MY_CS_CTYPE_TABLE_SIZE,
166	MYF(MY_WME))))
167	goto err;
168	if (init_state_maps(to))
169	goto err;
170	}
171	if (from->to_lower)
172	if (!(to->to_lower= (uchar) my_once_memdup((char**) from->to_lower,
173	MY_CS_TO_LOWER_TABLE_SIZE,
174	MYF(MY_WME))))
175	goto err;
176
177	if (from->to_upper)
178	if (!(to->to_upper= (uchar) my_once_memdup((char**) from->to_upper,
179	MY_CS_TO_UPPER_TABLE_SIZE,
180	MYF(MY_WME))))
181	goto err;
182	if (from->sort_order)
183	{
184	if (!(to->sort_order= (uchar) my_once_memdup((char**) from->sort_order,
185	MY_CS_SORT_ORDER_TABLE_SIZE,
186	MYF(MY_WME))))
187	goto err;
188
189	}
190	if (from->tab_to_uni)
191	{
192	uint sz= MY_CS_TO_UNI_TABLE_SIZE*sizeof(uint16);
193	if (!(to->tab_to_uni= (uint16) my_once_memdup((char**)from->tab_to_uni,
194	sz, MYF(MY_WME))))
195	goto err;
196	}
197	if (from->tailoring)
198	if (!(to->tailoring= my_once_strdup(from->tailoring,MYF(MY_WME))))
199	goto err;
200
201	return `0`;
202
203	err:
204	return `1`;
205	}
206
207
208	static my_bool simple_8bit_charset_data_is_full(CHARSET_INFO *cs)
209	{
210	return cs->ctype && cs->to_upper && cs->to_lower && cs->tab_to_uni;
211	}
212
213
214	/**
215	Inherit missing 8bit charset data from another collation.
216	Arrays pointed by refcs must be in the permanent memory already,
217	e.g. static memory, or allocated by my_once_xxx().
218	*/
219	static void
220	inherit_charset_data(struct charset_info_st cs, CHARSET_INFO refcs)
221	{
222	if (!cs->to_upper)
223	cs->to_upper= refcs->to_upper;
224	if (!cs->to_lower)
225	cs->to_lower= refcs->to_lower;
226	if (!cs->ctype)
227	cs->ctype= refcs->ctype;
228	if (!cs->tab_to_uni)
229	cs->tab_to_uni= refcs->tab_to_uni;
230	}
231
232
233	static my_bool simple_8bit_collation_data_is_full(CHARSET_INFO *cs)
234	{
235	return cs->sort_order \|\| (cs->state & MY_CS_BINSORT);
236	}
237
238
239	/**
240	Inherit 8bit simple collation data from another collation.
241	refcs->sort_order must be in the permanent memory already,
242	e.g. static memory, or allocated by my_once_xxx().
243	*/
244	static void
245	inherit_collation_data(struct charset_info_st cs, CHARSET_INFO refcs)
246	{
247	if (!simple_8bit_collation_data_is_full(cs))
248	cs->sort_order= refcs->sort_order;
249	}
250
251
252	static my_bool simple_cs_is_full(CHARSET_INFO *cs)
253	{
254	return cs->number && cs->csname && cs->name &&
255	simple_8bit_charset_data_is_full(cs) &&
256	(simple_8bit_collation_data_is_full(cs) \|\| cs->tailoring);
257	}
258
259
/*
  The guard lists every character set for which add_collation() calls
  copy_uca_collation(), so the helper is defined whenever it is used.
*/
#if defined(HAVE_UCA_COLLATIONS) && \
    (defined(HAVE_CHARSET_ucs2) || defined(HAVE_CHARSET_utf8) || \
     defined(HAVE_CHARSET_utf8mb4) || defined(HAVE_CHARSET_utf16) || \
     defined(HAVE_CHARSET_utf32))
/**
  Initialize a loaded collation.

  @param [OUT] to     - The new charset_info_st structure to initialize.
  @param [IN]  from   - A template collation, to fill the missing data from.
  @param [IN]  loaded - The collation data loaded from the LDML file.
                        Some data may be missing in "loaded".
*/
static void
copy_uca_collation(struct charset_info_st *to, CHARSET_INFO *from,
                   CHARSET_INFO *loaded)
{
  to->cset= from->cset;
  to->coll= from->coll;
  /*
    Single-level UCA collations have strxfrm_multiply=8.
    In case of a multi-level UCA collation we use strxfrm_multiply=4.
    That means MY_COLLATION_HANDLER::strnxfrmlen() will request the caller
    to allocate a smaller buffer for each level, for performance purposes,
    and to fit longer VARCHARs into @@max_sort_length.
    This makes filesort produce a non-precise order for some rare Unicode
    characters that produce more than 4 weights (long expansions).
    UCA requires 2 bytes per weight multiplied by the number of levels.
    In case of a 2-level collation, each character requires 4*2=8 bytes.
    Therefore, the longest VARCHAR that fits into the default @@max_sort_length
    is 1024/8=VARCHAR(128). With strxfrm_multiply==8, only VARCHAR(64)
    would fit.
    Note, the built-in collation utf8_thai_520_w2 also uses strxfrm_multiply=4,
    for the same purpose.
    TODO: we could add a new LDML syntax to choose the strxfrm_multiply value.
  */
  to->strxfrm_multiply= loaded->levels_for_order > 1 ?
                        4 : from->strxfrm_multiply;
  to->min_sort_char= from->min_sort_char;
  to->max_sort_char= from->max_sort_char;
  to->mbminlen= from->mbminlen;
  to->mbmaxlen= from->mbmaxlen;
  to->caseup_multiply= from->caseup_multiply;
  to->casedn_multiply= from->casedn_multiply;
  to->state|= MY_CS_AVAILABLE | MY_CS_LOADED |
              MY_CS_STRNXFRM  | MY_CS_UNICODE;
}
#endif
303
304
305	static int add_collation(struct charset_info_st *cs)
306	{
307	if (cs->name && (cs->number \|\|
308	(cs->number=get_collation_number_internal(cs->name))) &&
309	cs->number < array_elements(all_charsets))
310	{
311	struct charset_info_st *newcs;
312	if (!(newcs= (struct charset_info_st*) all_charsets[cs->number]))
313	{
314	if (!(all_charsets[cs->number]= newcs=
315	(struct charset_info_st) my_once_alloc(sizeof*(CHARSET_INFO),MYF(`0`))))
316	return MY_XML_ERROR;
317	bzero(newcs,sizeof(CHARSET_INFO));
318	}
319
320	if (cs->primary_number == cs->number)
321	cs->state \|= MY_CS_PRIMARY;
322
323	if (cs->binary_number == cs->number)
324	cs->state \|= MY_CS_BINSORT;
325
326	newcs->state\|= cs->state;
327
328	if (!(newcs->state & MY_CS_COMPILED))
329	{
330	if (cs_copy_data(newcs,cs))
331	return MY_XML_ERROR;
332
333	newcs->caseup_multiply= newcs->casedn_multiply= `1`;
334	newcs->levels_for_order= `1`;
335
336	if (!strcmp(cs->csname,"ucs2") )
337	{
338	#if defined(HAVE_CHARSET_ucs2) && defined(HAVE_UCA_COLLATIONS)
339	copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
340	&my_charset_ucs2_unicode_nopad_ci :
341	&my_charset_ucs2_unicode_ci,
342	cs);
343	newcs->state\|= MY_CS_AVAILABLE \| MY_CS_LOADED \| MY_CS_NONASCII;
344	#endif
345	}
346	else if (!strcmp(cs->csname, "utf8") \|\| !strcmp(cs->csname, "utf8mb3"))
347	{
348	#if defined (HAVE_CHARSET_utf8) && defined(HAVE_UCA_COLLATIONS)
349	copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
350	&my_charset_utf8_unicode_nopad_ci :
351	&my_charset_utf8_unicode_ci,
352	cs);
353	newcs->ctype= my_charset_utf8_unicode_ci.ctype;
354	if (init_state_maps(newcs))
355	return MY_XML_ERROR;
356	#endif
357	}
358	else if (!strcmp(cs->csname, "utf8mb4"))
359	{
360	#if defined (HAVE_CHARSET_utf8mb4) && defined(HAVE_UCA_COLLATIONS)
361	copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
362	&my_charset_utf8mb4_unicode_nopad_ci :
363	&my_charset_utf8mb4_unicode_ci,
364	cs);
365	newcs->ctype= my_charset_utf8mb4_unicode_ci.ctype;
366	newcs->state\|= MY_CS_AVAILABLE \| MY_CS_LOADED;
367	#endif
368	}
369	else if (!strcmp(cs->csname, "utf16"))
370	{
371	#if defined (HAVE_CHARSET_utf16) && defined(HAVE_UCA_COLLATIONS)
372	copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
373	&my_charset_utf16_unicode_nopad_ci :
374	&my_charset_utf16_unicode_ci,
375	cs);
376	newcs->state\|= MY_CS_AVAILABLE \| MY_CS_LOADED \| MY_CS_NONASCII;
377	#endif
378	}
379	else if (!strcmp(cs->csname, "utf32"))
380	{
381	#if defined (HAVE_CHARSET_utf32) && defined(HAVE_UCA_COLLATIONS)
382	copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
383	&my_charset_utf32_unicode_nopad_ci :
384	&my_charset_utf32_unicode_ci,
385	cs);
386	newcs->state\|= MY_CS_AVAILABLE \| MY_CS_LOADED \| MY_CS_NONASCII;
387	#endif
388	}
389	else
390	{
391	simple_cs_init_functions(newcs);
392	newcs->mbminlen= `1`;
393	newcs->mbmaxlen= `1`;
394	newcs->strxfrm_multiply= `1`;
395	if (simple_cs_is_full(newcs))
396	{
397	newcs->state \|= MY_CS_LOADED;
398	}
399	newcs->state\|= MY_CS_AVAILABLE;
400	}
401	}
402	else
403	{
404	/*
405	We need the below to make get_charset_name()
406	and get_charset_number() working even if a
407	character set has not been really incompiled.
408	The above functions are used for example
409	in error message compiler extra/comp_err.c.
410	If a character set was compiled, this information
411	will get lost and overwritten in add_compiled_collation().
412	*/
413	newcs->number= cs->number;
414	if (cs->comment)
415	if (!(newcs->comment= my_once_strdup(cs->comment,MYF(MY_WME))))
416	return MY_XML_ERROR;
417	if (cs->csname)
418	if (!(newcs->csname= my_once_strdup(cs->csname,MYF(MY_WME))))
419	return MY_XML_ERROR;
420	if (cs->name)
421	if (!(newcs->name= my_once_strdup(cs->name,MYF(MY_WME))))
422	return MY_XML_ERROR;
423	}
424	cs->number= `0`;
425	cs->primary_number= `0`;
426	cs->binary_number= `0`;
427	cs->name= NULL;
428	cs->state= `0`;
429	cs->sort_order= NULL;
430	cs->tailoring= NULL;
431	}
432	return MY_XML_OK;
433	}
434
435
436	/**
437	Report character set initialization errors and warnings.
438	Be silent by default: no warnings on the client side.
439	*/
440	static void
441	default_reporter(enum loglevel level __attribute__ ((unused)),
442	const char format __attribute__* ((unused)),
443	...)
444	{
445	}
446	my_error_reporter my_charset_error_reporter= default_reporter;
447
448
449	/**
450	Wrappers for memory functions my_malloc (and friends)
451	with C-compatbile API without extra "myf" argument.
452	*/
453	static void *
454	my_once_alloc_c(size_t size)
455	{ return my_once_alloc(size, MYF(MY_WME)); }
456
457
458	static void *
459	my_malloc_c(size_t size)
460	{ return my_malloc(size, MYF(MY_WME)); }
461
462
463	static void *
464	my_realloc_c(void *old, size_t size)
465	{ return my_realloc(old, size, MYF(MY_WME\|MY_ALLOW_ZERO_PTR)); }
466
467
468	/**
469	Initialize character set loader to use mysys memory management functions.
470	@param loader Loader to initialize
471	*/
472	void
473	my_charset_loader_init_mysys(MY_CHARSET_LOADER *loader)
474	{
475	loader->error[`0`]= `'\0'`;
476	loader->once_alloc= my_once_alloc_c;
477	loader->malloc= my_malloc_c;
478	loader->realloc= my_realloc_c;
479	loader->free= my_free;
480	loader->reporter= my_charset_error_reporter;
481	loader->add_collation= add_collation;
482	}
483
484
/*
  Largest charset definition file that will be read, in bytes.
  Parenthesized so that the macro expands safely inside any expression.
*/
#define MY_MAX_ALLOWED_BUF (1024*1024)
/* Name of the index file inside the charsets directory */
#define MY_CHARSET_INDEX "Index.xml"

/* When not NULL, used by get_charsets_dir() instead of the built-in path */
const char *charsets_dir= NULL;
489
490
491	static my_bool
492	my_read_charset_file(MY_CHARSET_LOADER *loader,
493	const char *filename,
494	myf myflags)
495	{
496	uchar *buf;
497	int fd;
498	size_t len, tmp_len;
499	MY_STAT stat_info;
500
501	if (!my_stat(filename, &stat_info, MYF(myflags)) \|\|
502	((len= (uint)stat_info.st_size) > MY_MAX_ALLOWED_BUF) \|\|
503	!(buf= (uchar*) my_malloc(len,myflags)))
504	return TRUE;
505
506	if ((fd= mysql_file_open(key_file_charset, filename, O_RDONLY, myflags)) < `0`)
507	goto error;
508	tmp_len= mysql_file_read(fd, buf, len, myflags);
509	mysql_file_close(fd, myflags);
510	if (tmp_len != len)
511	goto error;
512
513	if (my_parse_charset_xml(loader, (char *) buf, len))
514	{
515	my_printf_error(EE_UNKNOWN_CHARSET, "Error while parsing '%s': %s\n",
516	MYF(`0`), filename, loader->error);
517	goto error;
518	}
519
520	my_free(buf);
521	return FALSE;
522
523	error:
524	my_free(buf);
525	return TRUE;
526	}
527
528
529	char get_charsets_dir(char* *buf)
530	{
531	const char *sharedir= SHAREDIR;
532	char *res;
533	DBUG_ENTER("get_charsets_dir");
534
535	if (charsets_dir != NULL)
536	strmake(buf, charsets_dir, FN_REFLEN-`1`);
537	else
538	{
539	if (test_if_hard_path(sharedir) \|\|
540	is_prefix(sharedir, DEFAULT_CHARSET_HOME))
541	strxmov(buf, sharedir, "/", CHARSET_DIR, NullS);
542	else
543	strxmov(buf, DEFAULT_CHARSET_HOME, "/", sharedir, "/", CHARSET_DIR,
544	NullS);
545	}
546	res= convert_dirname(buf,buf,NullS);
547	DBUG_PRINT("info",("charsets dir: '%s'", buf));
548	DBUG_RETURN(res);
549	}
550
551	CHARSET_INFO *all_charsets[MY_ALL_CHARSETS_SIZE]={NULL};
552	CHARSET_INFO *default_charset_info = &my_charset_latin1;
553
554	void add_compiled_collation(struct charset_info_st *cs)
555	{
556	DBUG_ASSERT(cs->number < array_elements(all_charsets));
557	all_charsets[cs->number]= cs;
558	cs->state\|= MY_CS_AVAILABLE;
559	}
560
561
562	static my_pthread_once_t charsets_initialized= MY_PTHREAD_ONCE_INIT;
563	static my_pthread_once_t charsets_template= MY_PTHREAD_ONCE_INIT;
564
565	typedef struct
566	{
567	ulonglong use_count;
568	} MY_COLLATION_STATISTICS;
569
570
571	static MY_COLLATION_STATISTICS my_collation_statistics[MY_ALL_CHARSETS_SIZE];
572
573
574	my_bool my_collation_is_known_id(uint id)
575	{
576	return id > `0` && id < array_elements(all_charsets) && all_charsets[id] ?
577	TRUE : FALSE;
578	}
579
580
581	/*
582	Collation use statistics functions do not lock
583	counters to avoid mutex contention. This can lose
584	some counter increments with high thread concurrency.
585	But this should be Ok, as we don't need exact numbers.
586	*/
587	static inline void my_collation_statistics_inc_use_count(uint id)
588	{
589	DBUG_ASSERT(my_collation_is_known_id(id));
590	my_collation_statistics[id].use_count++;
591	}
592
593
594	ulonglong my_collation_statistics_get_use_count(uint id)
595	{
596	DBUG_ASSERT(my_collation_is_known_id(id));
597	return my_collation_statistics[id].use_count;
598	}
599
600
601	const char *my_collation_get_tailoring(uint id)
602	{
603	/ all_charsets[id]->tailoring is never changed after server startup. /
604	DBUG_ASSERT(my_collation_is_known_id(id));
605	return all_charsets[id]->tailoring;
606	}
607
608
609	static void init_available_charsets(void)
610	{
611	char fname[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
612	struct charset_info_st **cs;
613	MY_CHARSET_LOADER loader;
614
615	bzero((char) &all_charsets,sizeof*(all_charsets));
616	bzero((char) &my_collation_statistics, sizeof*(my_collation_statistics));
617	init_compiled_charsets(MYF(`0`));
618
619	/ Copy compiled charsets /
620	for (cs= (struct charset_info_st**) all_charsets;
621	cs < (struct charset_info_st**) all_charsets +
622	array_elements(all_charsets)-`1` ;
623	cs++)
624	{
625	if (*cs)
626	{
627	DBUG_ASSERT(cs[`0`]->mbmaxlen <= MY_CS_MBMAXLEN);
628	if (cs[`0`]->ctype)
629	if (init_state_maps(*cs))
630	*cs= NULL;
631	}
632	}
633
634	my_charset_loader_init_mysys(&loader);
635	strmov(get_charsets_dir(fname), MY_CHARSET_INDEX);
636	my_read_charset_file(&loader, fname, MYF(`0`));
637	}
638
639
640	void free_charsets(void)
641	{
642	charsets_initialized= charsets_template;
643	}
644
645
/**
  Translate an aliased collation name into its primary name.

  A name starting with "utf8mb3_" (compared case-insensitively) is
  rewritten into "utf8_" followed by the rest of the name.

  @param name     Collation name to translate.
  @param buf      Buffer receiving the translated name.
  @param bufsize  Size of "buf" in bytes.

  @return "buf" if the name was translated, NULL if the name has no alias.
*/
static const char*
get_collation_name_alias(const char *name, char *buf, size_t bufsize)
{
  if (!strncasecmp(name, "utf8mb3_", 8))
  {
    my_snprintf(buf, bufsize, "utf8_%s", name + 8);
    return buf;
  }
  return NULL;
}
656
657
658	uint get_collation_number(const char *name)
659	{
660	uint id;
661	char alias[`64`];
662	my_pthread_once(&charsets_initialized, init_available_charsets);
663	if ((id= get_collation_number_internal(name)))
664	return id;
665	if ((name= get_collation_name_alias(name, alias, sizeof(alias))))
666	return get_collation_number_internal(name);
667	return `0`;
668	}
669
670
671	static uint
672	get_charset_number_internal(const char *charset_name, uint cs_flags)
673	{
674	CHARSET_INFO **cs;
675
676	for (cs= all_charsets;
677	cs < all_charsets + array_elements(all_charsets);
678	cs++)
679	{
680	if ( cs[`0`] && cs[`0`]->csname && (cs[`0`]->state & cs_flags) &&
681	!my_strcasecmp(&my_charset_latin1, cs[`0`]->csname, charset_name))
682	return cs[`0`]->number;
683	}
684	return `0`;
685	}
686
687
688	static const char*
689	get_charset_name_alias(const char *name)
690	{
691	if (!my_strcasecmp(&my_charset_latin1, name, "utf8mb3"))
692	return "utf8";
693	return NULL;
694	}
695
696
697	uint get_charset_number(const char *charset_name, uint cs_flags)
698	{
699	uint id;
700	my_pthread_once(&charsets_initialized, init_available_charsets);
701	if ((id= get_charset_number_internal(charset_name, cs_flags)))
702	return id;
703	if ((charset_name= get_charset_name_alias(charset_name)))
704	return get_charset_number_internal(charset_name, cs_flags);
705	return `0`;
706	}
707
708
709	const char *get_charset_name(uint charset_number)
710	{
711	my_pthread_once(&charsets_initialized, init_available_charsets);
712
713	if (charset_number < array_elements(all_charsets))
714	{
715	CHARSET_INFO *cs= all_charsets[charset_number];
716
717	if (cs && (cs->number == charset_number) && cs->name)
718	return (char*) cs->name;
719	}
720
721	return "?"; / this mimics find_type() /
722	}
723
724
725	static CHARSET_INFO inheritance_source_by_id(CHARSET_INFO cs, uint refid)
726	{
727	CHARSET_INFO *refcs;
728	return refid && refid != cs->number &&
729	(refcs= all_charsets[refid]) &&
730	(refcs->state & MY_CS_AVAILABLE) ? refcs : NULL;
731	}
732
733
734	static CHARSET_INFO find_collation_data_inheritance_source(CHARSET_INFO cs)
735	{
736	const char beg, end;
737	if (cs->tailoring &&
738	!strncmp(cs->tailoring, "[import ", `8`) &&
739	(end= strchr(cs->tailoring + `8`, `']'`)) &&
740	(beg= cs->tailoring + `8`) + MY_CS_NAME_SIZE > end)
741	{
742	char name[MY_CS_NAME_SIZE + `1`];
743	memcpy(name, beg, end - beg);
744	name[end - beg]= `'\0'`;
745	return inheritance_source_by_id(cs, get_collation_number(name));
746	}
747	return NULL;
748	}
749
750
751	static CHARSET_INFO find_charset_data_inheritance_source(CHARSET_INFO cs)
752	{
753	uint refid= get_charset_number_internal(cs->csname, MY_CS_PRIMARY);
754	return inheritance_source_by_id(cs, refid);
755	}
756
757
758	static CHARSET_INFO *
759	get_internal_charset(MY_CHARSET_LOADER *loader, uint cs_number, myf flags)
760	{
761	char buf[FN_REFLEN];
762	struct charset_info_st *cs;
763
764	DBUG_ASSERT(cs_number < array_elements(all_charsets));
765
766	if ((cs= (struct charset_info_st*) all_charsets[cs_number]))
767	{
768	if (cs->state & MY_CS_READY) / if CS is already initialized /
769	{
770	my_collation_statistics_inc_use_count(cs_number);
771	return cs;
772	}
773
774	/*
775	To make things thread safe we are not allowing other threads to interfere
776	while we may changing the cs_info_table
777	*/
778	mysql_mutex_lock(&THR_LOCK_charset);
779
780	if (!(cs->state & (MY_CS_COMPILED\|MY_CS_LOADED))) / if CS is not in memory /
781	{
782	MY_CHARSET_LOADER loader;
783	strxmov(get_charsets_dir(buf), cs->csname, ".xml", NullS);
784	my_charset_loader_init_mysys(&loader);
785	my_read_charset_file(&loader, buf, flags);
786	}
787
788	if (cs->state & MY_CS_AVAILABLE)
789	{
790	if (!(cs->state & MY_CS_READY))
791	{
792	if (!simple_8bit_charset_data_is_full(cs))
793	{
794	CHARSET_INFO *refcs= find_charset_data_inheritance_source(cs);
795	if (refcs)
796	inherit_charset_data(cs, refcs);
797	}
798	if (!simple_8bit_collation_data_is_full(cs))
799	{
800	CHARSET_INFO *refcl= find_collation_data_inheritance_source(cs);
801	if (refcl)
802	inherit_collation_data(cs, refcl);
803	}
804
805	if ((cs->cset->init && cs->cset->init(cs, loader)) \|\|
806	(cs->coll->init && cs->coll->init(cs, loader)))
807	{
808	cs= NULL;
809	}
810	else
811	cs->state\|= MY_CS_READY;
812	}
813	my_collation_statistics_inc_use_count(cs_number);
814	}
815	else
816	cs= NULL;
817
818	mysql_mutex_unlock(&THR_LOCK_charset);
819	}
820	return cs;
821	}
822
823
824	CHARSET_INFO *get_charset(uint cs_number, myf flags)
825	{
826	CHARSET_INFO *cs= NULL;
827
828	if (cs_number == default_charset_info->number)
829	return default_charset_info;
830
831	my_pthread_once(&charsets_initialized, init_available_charsets);
832
833	if (cs_number < array_elements(all_charsets))
834	{
835	MY_CHARSET_LOADER loader;
836	my_charset_loader_init_mysys(&loader);
837	cs= get_internal_charset(&loader, cs_number, flags);
838	}
839
840	if (!cs && (flags & MY_WME))
841	{
842	char index_file[FN_REFLEN + sizeof(MY_CHARSET_INDEX)], cs_string[`23`];
843	strmov(get_charsets_dir(index_file),MY_CHARSET_INDEX);
844	cs_string[`0`]=`'#'`;
845	int10_to_str(cs_number, cs_string+`1`, `10`);
846	my_error(EE_UNKNOWN_CHARSET, MYF(ME_BELL), cs_string, index_file);
847	}
848	return cs;
849	}
850
851
852	/**
853	Find collation by name: extended version of get_charset_by_name()
854	to return error messages to the caller.
855	@param loader Character set loader
856	@param name Collation name
857	@param flags Flags
858	@return NULL on error, pointer to collation on success
859	*/
860
861	CHARSET_INFO *
862	my_collation_get_by_name(MY_CHARSET_LOADER *loader,
863	const char *name, myf flags)
864	{
865	uint cs_number;
866	CHARSET_INFO *cs;
867	my_pthread_once(&charsets_initialized, init_available_charsets);
868
869	cs_number= get_collation_number(name);
870	my_charset_loader_init_mysys(loader);
871	cs= cs_number ? get_internal_charset(loader, cs_number, flags) : NULL;
872
873	if (!cs && (flags & MY_WME))
874	{
875	char index_file[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
876	strmov(get_charsets_dir(index_file),MY_CHARSET_INDEX);
877	my_error(EE_UNKNOWN_COLLATION, MYF(ME_BELL), name, index_file);
878	}
879	return cs;
880	}
881
882
883	CHARSET_INFO get_charset_by_name(const* char *cs_name, myf flags)
884	{
885	MY_CHARSET_LOADER loader;
886	my_charset_loader_init_mysys(&loader);
887	return my_collation_get_by_name(&loader, cs_name, flags);
888	}
889
890
891	/**
892	Find character set by name: extended version of get_charset_by_csname()
893	to return error messages to the caller.
894	@param loader Character set loader
895	@param name Collation name
896	@param cs_flags Character set flags (e.g. default or binary collation)
897	@param flags Flags
898	@return NULL on error, pointer to collation on success
899	*/
900	CHARSET_INFO *
901	my_charset_get_by_name(MY_CHARSET_LOADER *loader,
902	const char *cs_name, uint cs_flags, myf flags)
903	{
904	uint cs_number;
905	CHARSET_INFO *cs;
906	DBUG_ENTER("get_charset_by_csname");
907	DBUG_PRINT("enter",("name: '%s'", cs_name));
908
909	my_pthread_once(&charsets_initialized, init_available_charsets);
910
911	cs_number= get_charset_number(cs_name, cs_flags);
912	cs= cs_number ? get_internal_charset(loader, cs_number, flags) : NULL;
913
914	if (!cs && (flags & MY_WME))
915	{
916	char index_file[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
917	strmov(get_charsets_dir(index_file),MY_CHARSET_INDEX);
918	my_error(EE_UNKNOWN_CHARSET, MYF(ME_BELL), cs_name, index_file);
919	}
920
921	DBUG_RETURN(cs);
922	}
923
924
925	CHARSET_INFO *
926	get_charset_by_csname(const char *cs_name, uint cs_flags, myf flags)
927	{
928	MY_CHARSET_LOADER loader;
929	my_charset_loader_init_mysys(&loader);
930	return my_charset_get_by_name(&loader, cs_name, cs_flags, flags);
931	}
932
933
934	/**
935	Resolve character set by the character set name (utf8, latin1, ...).
936
937	The function tries to resolve character set by the specified name. If
938	there is character set with the given name, it is assigned to the "cs"
939	parameter and FALSE is returned. If there is no such character set,
940	"default_cs" is assigned to the "cs" and TRUE is returned.
941
942	@param[in] cs_name Character set name.
943	@param[in] default_cs Default character set.
944	@param[out] cs Variable to store character set.
945
946	@return FALSE if character set was resolved successfully; TRUE if there
947	is no character set with given name.
948	*/
949
950	my_bool resolve_charset(const char *cs_name,
951	CHARSET_INFO *default_cs,
952	CHARSET_INFO **cs)
953	{
954	*cs= get_charset_by_csname(cs_name, MY_CS_PRIMARY, MYF(`0`));
955
956	if (*cs == NULL)
957	{
958	*cs= default_cs;
959	return TRUE;
960	}
961
962	return FALSE;
963	}
964
965
966	/**
967	Resolve collation by the collation name (utf8_general_ci, ...).
968
969	The function tries to resolve collation by the specified name. If there
970	is collation with the given name, it is assigned to the "cl" parameter
971	and FALSE is returned. If there is no such collation, "default_cl" is
972	assigned to the "cl" and TRUE is returned.
973
974	@param[out] cl Variable to store collation.
975	@param[in] cl_name Collation name.
976	@param[in] default_cl Default collation.
977
978	@return FALSE if collation was resolved successfully; TRUE if there is no
979	collation with given name.
980	*/
981
982	my_bool resolve_collation(const char *cl_name,
983	CHARSET_INFO *default_cl,
984	CHARSET_INFO **cl)
985	{
986	*cl= get_charset_by_name(cl_name, MYF(`0`));
987
988	if (*cl == NULL)
989	{
990	*cl= default_cl;
991	return TRUE;
992	}
993
994	return FALSE;
995	}
996
997
998	/*
999	Escape string with backslashes (\)
1000
1001	SYNOPSIS
1002	escape_string_for_mysql()
1003	charset_info Charset of the strings
1004	to Buffer for escaped string
1005	to_length Length of destination buffer, or 0
1006	from The string to escape
1007	length The length of the string to escape
1008
1009	DESCRIPTION
1010	This escapes the contents of a string by adding backslashes before special
1011	characters, and turning others into specific escape sequences, such as
1012	turning newlines into \n and null bytes into \0.
1013
1014	NOTE
1015	To maintain compatibility with the old C API, to_length may be 0 to mean
1016	"big enough"
1017
1018	RETURN VALUES
1019	(size_t) -1 The escaped string did not fit in the to buffer
1020	# The length of the escaped string
1021	*/
1022
1023	size_t escape_string_for_mysql(CHARSET_INFO *charset_info,
1024	char *to, size_t to_length,
1025	const char *from, size_t length)
1026	{
1027	const char *to_start= to;
1028	const char end, to_end=to_start + (to_length ? to_length-`1` : `2`*length);
1029	my_bool overflow= FALSE;
1030	for (end= from + length; from < end; from++)
1031	{
1032	char escape= `0`;
1033	#ifdef USE_MB
1034	int tmp_length= use_mb(charset_info) ? my_charlen(charset_info, from, end) :
1035	`1`;
1036	if (tmp_length > `1`)
1037	{
1038	if (to + tmp_length > to_end)
1039	{
1040	overflow= TRUE;
1041	break;
1042	}
1043	while (tmp_length--)
1044	to++= from++;
1045	from--;
1046	continue;
1047	}
1048	/*
1049	If the next character appears to begin a multi-byte character, we
1050	escape that first byte of that apparent multi-byte character. (The
1051	character just looks like a multi-byte character -- if it were actually
1052	a multi-byte character, it would have been passed through in the test
1053	above.)
1054
1055	Without this check, we can create a problem by converting an invalid
1056	multi-byte character into a valid one. For example, 0xbf27 is not
1057	a valid GBK character, but 0xbf5c is. (0x27 = ', 0x5c = \)
1058	*/
1059	if (tmp_length < `1`) / Bad byte sequence /
1060	escape= *from;
1061	else
1062	#endif
1063	switch (*from) {
1064	case `0`: / Must be escaped for 'mysql' /
1065	escape= `'0'`;
1066	break;
1067	case `'\n'`: / Must be escaped for logs /
1068	escape= `'n'`;
1069	break;
1070	case `'\r'`:
1071	escape= `'r'`;
1072	break;
1073	case `'\\'`:
1074	escape= `'\\'`;
1075	break;
1076	case `'\''`:
1077	escape= `'\''`;
1078	break;
1079	case `'"'`: / Better safe than sorry /
1080	escape= `'"'`;
1081	break;
1082	case `'\032'`: / This gives problems on Win32 /
1083	escape= `'Z'`;
1084	break;
1085	}
1086	if (escape)
1087	{
1088	if (to + `2` > to_end)
1089	{
1090	overflow= TRUE;
1091	break;
1092	}
1093	*to++= `'\\'`;
1094	*to++= escape;
1095	}
1096	else
1097	{
1098	if (to + `1` > to_end)
1099	{
1100	overflow= TRUE;
1101	break;
1102	}
1103	to++= from;
1104	}
1105	}
1106	*to= `0`;
1107	return overflow ? (size_t) -`1` : (size_t) (to - to_start);
1108	}
1109
1110
#ifdef BACKSLASH_MBTAIL
/* Result of the first fs_character_set() call, reused afterwards */
static CHARSET_INFO *fs_cset_cache= NULL;

/**
  Return the character set used for file system names.

  The system default ANSI code page is queried once with
  GetLocaleInfo() and the answer is cached. Only code page 932 is
  recognized (when cp932 support is compiled in); every other code
  page yields my_charset_bin.

  @return Pointer to the cached character set.
*/
CHARSET_INFO *fs_character_set()
{
  if (!fs_cset_cache)
  {
    char buf[10]= "cp";
    /* The code page number is written right after the "cp" prefix */
    GetLocaleInfo(LOCALE_SYSTEM_DEFAULT, LOCALE_IDEFAULTANSICODEPAGE,
                  buf+2, sizeof(buf)-3);
    /*
      We cannot call get_charset_by_name here
      because fs_character_set() is executed before
      the initialization of the charset mutex (THR_LOCK_charset),
      which is used inside get_charset_by_name.
      As we're now interested in cp932 only,
      let's just detect it using strcmp().
    */
    fs_cset_cache=
#ifdef HAVE_CHARSET_cp932
        !strcmp(buf, "cp932") ? &my_charset_cp932_japanese_ci :
#endif
        &my_charset_bin;
  }
  return fs_cset_cache;
}
#endif
1138
1139	/*
1140	Escape apostrophes by doubling them up
1141
1142	SYNOPSIS
1143	escape_quotes_for_mysql()
1144	charset_info Charset of the strings
1145	to Buffer for escaped string
1146	to_length Length of destination buffer, or 0
1147	from The string to escape
1148	length The length of the string to escape
1149
1150	DESCRIPTION
1151	This escapes the contents of a string by doubling up any apostrophes that
1152	it contains. This is used when the NO_BACKSLASH_ESCAPES SQL_MODE is in
1153	effect on the server.
1154
1155	NOTE
1156	To be consistent with escape_string_for_mysql(), to_length may be 0 to
1157	mean "big enough"
1158
1159	RETURN VALUES
1160	~0 The escaped string did not fit in the to buffer
1161	>=0 The length of the escaped string
1162	*/
1163
1164	size_t escape_quotes_for_mysql(CHARSET_INFO *charset_info,
1165	char *to, size_t to_length,
1166	const char *from, size_t length)
1167	{
1168	const char *to_start= to;
1169	const char end, to_end=to_start + (to_length ? to_length-`1` : `2`*length);
1170	my_bool overflow= FALSE;
1171	#ifdef USE_MB
1172	my_bool use_mb_flag= use_mb(charset_info);
1173	#endif
1174	for (end= from + length; from < end; from++)
1175	{
1176	#ifdef USE_MB
1177	int tmp_length;
1178	if (use_mb_flag && (tmp_length= my_ismbchar(charset_info, from, end)))
1179	{
1180	if (to + tmp_length > to_end)
1181	{
1182	overflow= TRUE;
1183	break;
1184	}
1185	while (tmp_length--)
1186	to++= from++;
1187	from--;
1188	continue;
1189	}
1190	/*
1191	We don't have the same issue here with a non-multi-byte character being
1192	turned into a multi-byte character by the addition of an escaping
1193	character, because we are only escaping the ' character with itself.
1194	*/
1195	#endif
1196	if (*from == `'\''`)
1197	{
1198	if (to + `2` > to_end)
1199	{
1200	overflow= TRUE;
1201	break;
1202	}
1203	*to++= `'\''`;
1204	*to++= `'\''`;
1205	}
1206	else
1207	{
1208	if (to + `1` > to_end)
1209	{
1210	overflow= TRUE;
1211	break;
1212	}
1213	to++= from;
1214	}
1215	}
1216	*to= `0`;
1217	return overflow ? (ulong)~`0` : (ulong) (to - to_start);
1218	}
1219

Browse the source code of MariaDB/mysys/charset.c