ctype.c source code [MariaDB/strings/ctype.c]

/* Copyright (c) 2000, 2013, Oracle and/or its affiliates.
   Copyright (c) 2009, 2014, SkySQL Ab.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
16
17	#include "strings_def.h"
18	#include <m_ctype.h>
19	#include <my_xml.h>
20
/*

  This file implements routines which parse XML based
  character set and collation description files.

  Unicode collations are encoded according to

    Unicode Technical Standard #35
    Locale Data Markup Language (LDML)
    http://www.unicode.org/reports/tr35/

  and converted into ICU string according to

    Collation Customization
    http://oss.software.ibm.com/icu/userguide/Collate_Customization.html

*/
38
39
/*
  Avoid using my_snprintf
  We cannot use my_snprintf() here, because ctype.o is
  used to build conf_to_src, which must require minimum
  dependency.

  The name is redefined to a string literal so that any accidental
  my_snprintf(...) call in this file is rejected by the compiler.
*/

#undef my_snprintf
#define my_snprintf "We cannot use my_snprintf in this file"
49
50
/*
  Optional hook, unset (NULL) by default.
  NOTE(review): presumably installed by the embedding program as a
  stack-overrun check taking a size/margin argument -- confirm against
  the declaration in m_ctype.h.
*/
int (*my_string_stack_guard)(int)= NULL;
52
/*
  Copy a length-delimited string into a buffer, truncating it to at most
  l2 bytes, and NUL-terminate the result.

  @param str  Destination; must have room for MIN(l1,l2) + 1 bytes
  @param src  Source bytes (need not be NUL-terminated)
  @param l1   Number of bytes available at src
  @param l2   Maximum number of bytes to copy

  @return     str
*/
static char *mstr(char *str, const char *src, size_t l1, size_t l2)
{
  l1= l1 < l2 ? l1 : l2;
  memcpy(str, src, l1);
  str[l1]= '\0';
  return str;
}
60
/*
  One entry of the XML path lookup table: maps a slash-separated
  element path to the parser state id (_CS_xxx) handled for it.
*/
struct my_cs_file_section_st
{
  int        state;   /* _CS_xxx state id                       */
  const char *str;    /* element path; NULL terminates the table */
};
66
/*
  Parser state ids. Each id is associated with one or more XML element
  paths through the sec[] table and dispatched on in the
  enter/leave/value handlers.
*/
#define _CS_MISC        1
#define _CS_ID          2
#define _CS_CSNAME      3
#define _CS_FAMILY      4
#define _CS_ORDER       5
#define _CS_COLNAME     6
#define _CS_FLAG        7
#define _CS_CHARSET     8
#define _CS_COLLATION   9
#define _CS_UPPERMAP    10
#define _CS_LOWERMAP    11
#define _CS_UNIMAP      12
#define _CS_COLLMAP     13
#define _CS_CTYPEMAP    14
#define _CS_PRIMARY_ID  15
#define _CS_BINARY_ID   16
#define _CS_CSDESCRIPT  17


/* Special purpose commands */
#define _CS_UCA_VERSION                 100
#define _CS_CL_SUPPRESS_CONTRACTIONS    101
#define _CS_CL_OPTIMIZE                 102
#define _CS_CL_SHIFT_AFTER_METHOD       103
#define _CS_CL_RULES_IMPORT             104
#define _CS_CL_RULES_IMPORT_SOURCE      105


/* Collation Settings */
#define _CS_ST_SETTINGS                 200
#define _CS_ST_STRENGTH                 201
#define _CS_ST_ALTERNATE                202
#define _CS_ST_BACKWARDS                203
#define _CS_ST_NORMALIZATION            204
#define _CS_ST_CASE_LEVEL               205
#define _CS_ST_CASE_FIRST               206
#define _CS_ST_HIRAGANA_QUATERNARY      207
#define _CS_ST_NUMERIC                  208
#define _CS_ST_VARIABLE_TOP             209
#define _CS_ST_MATCH_BOUNDARIES         210
#define _CS_ST_MATCH_STYLE              211


/* Rules */
#define _CS_RULES                       300
#define _CS_RESET                       301
#define _CS_DIFF1                       302
#define _CS_DIFF2                       303
#define _CS_DIFF3                       304
#define _CS_DIFF4                       305
#define _CS_IDENTICAL                   306

/* Rules: Expansions */
#define _CS_EXP_X                       320
#define _CS_EXP_EXTEND                  321
#define _CS_EXP_DIFF1                   322
#define _CS_EXP_DIFF2                   323
#define _CS_EXP_DIFF3                   324
#define _CS_EXP_DIFF4                   325
#define _CS_EXP_IDENTICAL               326

/* Rules: Abbreviating Ordering Specifications */
#define _CS_A_DIFF1                     351
#define _CS_A_DIFF2                     352
#define _CS_A_DIFF3                     353
#define _CS_A_DIFF4                     354
#define _CS_A_IDENTICAL                 355

/* Rules: previous context */
#define _CS_CONTEXT                     370

/* Rules: Placing Characters Before Others */
#define _CS_RESET_BEFORE                380

/* Rules: Logical Reset Positions */
#define _CS_RESET_FIRST_PRIMARY_IGNORABLE     401
#define _CS_RESET_LAST_PRIMARY_IGNORABLE      402
#define _CS_RESET_FIRST_SECONDARY_IGNORABLE   403
#define _CS_RESET_LAST_SECONDARY_IGNORABLE    404
#define _CS_RESET_FIRST_TERTIARY_IGNORABLE    405
#define _CS_RESET_LAST_TERTIARY_IGNORABLE     406
#define _CS_RESET_FIRST_TRAILING              407
#define _CS_RESET_LAST_TRAILING               408
#define _CS_RESET_FIRST_VARIABLE              409
#define _CS_RESET_LAST_VARIABLE               410
#define _CS_RESET_FIRST_NON_IGNORABLE         411
#define _CS_RESET_LAST_NON_IGNORABLE          412
154
155
156
157	static const struct my_cs_file_section_st sec[] =
158	{
159	{_CS_MISC, "xml"},
160	{_CS_MISC, "xml/version"},
161	{_CS_MISC, "xml/encoding"},
162	{_CS_MISC, "charsets"},
163	{_CS_MISC, "charsets/max-id"},
164	{_CS_MISC, "charsets/copyright"},
165	{_CS_MISC, "charsets/description"},
166	{_CS_CHARSET, "charsets/charset"},
167	{_CS_PRIMARY_ID, "charsets/charset/primary-id"},
168	{_CS_BINARY_ID, "charsets/charset/binary-id"},
169	{_CS_CSNAME, "charsets/charset/name"},
170	{_CS_FAMILY, "charsets/charset/family"},
171	{_CS_CSDESCRIPT, "charsets/charset/description"},
172	{_CS_MISC, "charsets/charset/alias"},
173	{_CS_MISC, "charsets/charset/ctype"},
174	{_CS_CTYPEMAP, "charsets/charset/ctype/map"},
175	{_CS_MISC, "charsets/charset/upper"},
176	{_CS_UPPERMAP, "charsets/charset/upper/map"},
177	{_CS_MISC, "charsets/charset/lower"},
178	{_CS_LOWERMAP, "charsets/charset/lower/map"},
179	{_CS_MISC, "charsets/charset/unicode"},
180	{_CS_UNIMAP, "charsets/charset/unicode/map"},
181	{_CS_COLLATION, "charsets/charset/collation"},
182	{_CS_COLNAME, "charsets/charset/collation/name"},
183	{_CS_ID, "charsets/charset/collation/id"},
184	{_CS_ORDER, "charsets/charset/collation/order"},
185	{_CS_FLAG, "charsets/charset/collation/flag"},
186	{_CS_COLLMAP, "charsets/charset/collation/map"},
187
188	/ Special purpose commands /
189	{_CS_UCA_VERSION, "charsets/charset/collation/version"},
190	{_CS_CL_SUPPRESS_CONTRACTIONS, "charsets/charset/collation/suppress_contractions"},
191	{_CS_CL_OPTIMIZE, "charsets/charset/collation/optimize"},
192	{_CS_CL_SHIFT_AFTER_METHOD, "charsets/charset/collation/shift-after-method"},
193	{_CS_CL_RULES_IMPORT, "charsets/charset/collation/rules/import"},
194	{_CS_CL_RULES_IMPORT_SOURCE, "charsets/charset/collation/rules/import/source"},
195
196	/ Collation Settings /
197	{_CS_ST_SETTINGS, "charsets/charset/collation/settings"},
198	{_CS_ST_STRENGTH, "charsets/charset/collation/settings/strength"},
199	{_CS_ST_ALTERNATE, "charsets/charset/collation/settings/alternate"},
200	{_CS_ST_BACKWARDS, "charsets/charset/collation/settings/backwards"},
201	{_CS_ST_NORMALIZATION, "charsets/charset/collation/settings/normalization"},
202	{_CS_ST_CASE_LEVEL, "charsets/charset/collation/settings/caseLevel"},
203	{_CS_ST_CASE_FIRST, "charsets/charset/collation/settings/caseFirst"},
204	{_CS_ST_HIRAGANA_QUATERNARY, "charsets/charset/collation/settings/hiraganaQuaternary"},
205	{_CS_ST_NUMERIC, "charsets/charset/collation/settings/numeric"},
206	{_CS_ST_VARIABLE_TOP, "charsets/charset/collation/settings/variableTop"},
207	{_CS_ST_MATCH_BOUNDARIES, "charsets/charset/collation/settings/match-boundaries"},
208	{_CS_ST_MATCH_STYLE, "charsets/charset/collation/settings/match-style"},
209
210	/ Rules /
211	{_CS_RULES, "charsets/charset/collation/rules"},
212	{_CS_RESET, "charsets/charset/collation/rules/reset"},
213	{_CS_DIFF1, "charsets/charset/collation/rules/p"},
214	{_CS_DIFF2, "charsets/charset/collation/rules/s"},
215	{_CS_DIFF3, "charsets/charset/collation/rules/t"},
216	{_CS_DIFF4, "charsets/charset/collation/rules/q"},
217	{_CS_IDENTICAL, "charsets/charset/collation/rules/i"},
218
219	/ Rules: expansions /
220	{_CS_EXP_X, "charsets/charset/collation/rules/x"},
221	{_CS_EXP_EXTEND, "charsets/charset/collation/rules/x/extend"},
222	{_CS_EXP_DIFF1, "charsets/charset/collation/rules/x/p"},
223	{_CS_EXP_DIFF2, "charsets/charset/collation/rules/x/s"},
224	{_CS_EXP_DIFF3, "charsets/charset/collation/rules/x/t"},
225	{_CS_EXP_DIFF4, "charsets/charset/collation/rules/x/q"},
226	{_CS_EXP_IDENTICAL, "charsets/charset/collation/rules/x/i"},
227
228	/ Rules: previous context /
229	{_CS_CONTEXT, "charsets/charset/collation/rules/x/context"},
230
231	/ Rules: Abbreviating Ordering Specifications /
232	{_CS_A_DIFF1, "charsets/charset/collation/rules/pc"},
233	{_CS_A_DIFF2, "charsets/charset/collation/rules/sc"},
234	{_CS_A_DIFF3, "charsets/charset/collation/rules/tc"},
235	{_CS_A_DIFF4, "charsets/charset/collation/rules/qc"},
236	{_CS_A_IDENTICAL, "charsets/charset/collation/rules/ic"},
237
238	/ Rules: Placing Characters Before Others/
239	{_CS_RESET_BEFORE, "charsets/charset/collation/rules/reset/before"},
240
241	/ Rules: Logical Reset Positions /
242	{_CS_RESET_FIRST_NON_IGNORABLE, "charsets/charset/collation/rules/reset/first_non_ignorable"},
243	{_CS_RESET_LAST_NON_IGNORABLE, "charsets/charset/collation/rules/reset/last_non_ignorable"},
244	{_CS_RESET_FIRST_PRIMARY_IGNORABLE, "charsets/charset/collation/rules/reset/first_primary_ignorable"},
245	{_CS_RESET_LAST_PRIMARY_IGNORABLE, "charsets/charset/collation/rules/reset/last_primary_ignorable"},
246	{_CS_RESET_FIRST_SECONDARY_IGNORABLE, "charsets/charset/collation/rules/reset/first_secondary_ignorable"},
247	{_CS_RESET_LAST_SECONDARY_IGNORABLE, "charsets/charset/collation/rules/reset/last_secondary_ignorable"},
248	{_CS_RESET_FIRST_TERTIARY_IGNORABLE, "charsets/charset/collation/rules/reset/first_tertiary_ignorable"},
249	{_CS_RESET_LAST_TERTIARY_IGNORABLE, "charsets/charset/collation/rules/reset/last_tertiary_ignorable"},
250	{_CS_RESET_FIRST_TRAILING, "charsets/charset/collation/rules/reset/first_trailing"},
251	{_CS_RESET_LAST_TRAILING, "charsets/charset/collation/rules/reset/last_trailing"},
252	{_CS_RESET_FIRST_VARIABLE, "charsets/charset/collation/rules/reset/first_variable"},
253	{_CS_RESET_LAST_VARIABLE, "charsets/charset/collation/rules/reset/last_variable"},
254
255	{`0`, NULL}
256	};
257
258	static const struct my_cs_file_section_st
259	cs_file_sec(const* char *attr, size_t len)
260	{
261	const struct my_cs_file_section_st *s;
262	for (s=sec; s->str; s++)
263	{
264	if (!strncmp(attr, s->str, len) && s->str[len] == `0`)
265	return s;
266	}
267	return NULL;
268	}
269
270	#define MY_CS_CSDESCR_SIZE 64
271	#define MY_CS_TAILORING_SIZE (32*1024)
272	#define MY_CS_UCA_VERSION_SIZE 64
273	#define MY_CS_CONTEXT_SIZE 64
274
275	typedef struct my_cs_file_info
276	{
277	char csname[MY_CS_NAME_SIZE];
278	char name[MY_CS_NAME_SIZE];
279	uchar ctype[MY_CS_CTYPE_TABLE_SIZE];
280	uchar to_lower[MY_CS_TO_LOWER_TABLE_SIZE];
281	uchar to_upper[MY_CS_TO_UPPER_TABLE_SIZE];
282	uchar sort_order[MY_CS_SORT_ORDER_TABLE_SIZE];
283	uint16 tab_to_uni[MY_CS_TO_UNI_TABLE_SIZE];
284	char comment[MY_CS_CSDESCR_SIZE];
285	char *tailoring;
286	size_t tailoring_length;
287	size_t tailoring_alloced_length;
288	char context[MY_CS_CONTEXT_SIZE];
289	struct charset_info_st cs;
290	MY_CHARSET_LOADER *loader;
291	} MY_CHARSET_FILE;
292
293
294	static void
295	my_charset_file_reset_charset(MY_CHARSET_FILE *i)
296	{
297	memset(&i->cs, `0`, sizeof(i->cs));
298	}
299
300
301	static void
302	my_charset_file_reset_collation(MY_CHARSET_FILE *i)
303	{
304	i->tailoring_length= `0`;
305	i->context[`0`]= `'\0'`;
306	}
307
308
309	static void
310	my_charset_file_init(MY_CHARSET_FILE *i)
311	{
312	my_charset_file_reset_charset(i);
313	my_charset_file_reset_collation(i);
314	i->tailoring= NULL;
315	i->tailoring_alloced_length= `0`;
316	}
317
318
319	static void
320	my_charset_file_free(MY_CHARSET_FILE *i)
321	{
322	i->loader->free(i->tailoring);
323	}
324
325
326	static int
327	my_charset_file_tailoring_realloc(MY_CHARSET_FILE *i, size_t newlen)
328	{
329	if (i->tailoring_alloced_length > newlen \|\|
330	(i->tailoring= i->loader->realloc(i->tailoring,
331	(i->tailoring_alloced_length=
332	(newlen + `32`*`1024`)))))
333	{
334	return MY_XML_OK;
335	}
336	return MY_XML_ERROR;
337	}
338
339
340	static int fill_uchar(uchar a,uint size,const* char *str, size_t len)
341	{
342	uint i= `0`;
343	const char s, b, *e=str+len;
344
345	for (s=str ; s < e ; i++)
346	{
347	for ( ; (s < e) && strchr(" \t\r\n",s[`0`]); s++) ;
348	b=s;
349	for ( ; (s < e) && !strchr(" \t\r\n",s[`0`]); s++) ;
350	if (s == b \|\| i > size)
351	break;
352	a[i]= (uchar) strtoul(b,NULL,`16`);
353	}
354	return `0`;
355	}
356
357	static int fill_uint16(uint16 a,uint size,const* char *str, size_t len)
358	{
359	uint i= `0`;
360
361	const char s, b, *e=str+len;
362	for (s=str ; s < e ; i++)
363	{
364	for ( ; (s < e) && strchr(" \t\r\n",s[`0`]); s++) ;
365	b=s;
366	for ( ; (s < e) && !strchr(" \t\r\n",s[`0`]); s++) ;
367	if (s == b \|\| i > size)
368	break;
369	a[i]= (uint16) strtol(b,NULL,`16`);
370	}
371	return `0`;
372	}
373
374
375
376
377	static int
378	tailoring_append(MY_XML_PARSER *st,
379	const char fmt, size_t len, const* char *attr)
380	{
381	struct my_cs_file_info i= (struct* my_cs_file_info *) st->user_data;
382	size_t newlen= i->tailoring_length + len + `64`; / 64 for format /
383	if (MY_XML_OK == my_charset_file_tailoring_realloc(i, newlen))
384	{
385	char *dst= i->tailoring + i->tailoring_length;
386	sprintf(dst, fmt, (int) len, attr);
387	i->tailoring_length+= strlen(dst);
388	return MY_XML_OK;
389	}
390	return MY_XML_ERROR;
391	}
392
393
394	static int
395	tailoring_append2(MY_XML_PARSER *st,
396	const char *fmt,
397	size_t len1, const char *attr1,
398	size_t len2, const char *attr2)
399	{
400	struct my_cs_file_info i= (struct* my_cs_file_info *) st->user_data;
401	size_t newlen= i->tailoring_length + len1 + len2 + `64`; / 64 for format /
402	if (MY_XML_OK == my_charset_file_tailoring_realloc(i, newlen))
403	{
404	char *dst= i->tailoring + i->tailoring_length;
405	sprintf(dst, fmt, (int) len1, attr1, (int) len2, attr2);
406	i->tailoring_length+= strlen(dst);
407	return MY_XML_OK;
408	}
409	return MY_XML_ERROR;
410	}
411
412
413	static size_t
414	scan_one_character(const char s, const* char e, my_wc_t wc)
415	{
416	CHARSET_INFO *cs= &my_charset_utf8_general_ci;
417	if (s >= e)
418	return `0`;
419
420	/ Escape sequence: \uXXXX /
421	if (s[`0`] == `'\\'` && s + `2` < e && s[`1`] == `'u'` && my_isxdigit(cs, s[`2`]))
422	{
423	size_t len= `3`; / We have at least one digit /
424	for (s+= `3`; s < e && my_isxdigit(cs, s[`0`]); s++, len++)
425	{
426	}
427	wc[`0`]= `0`;
428	return len;
429	}
430	else if ((int8) s[`0`] > `0`) / 7-bit character /
431	{
432	wc[`0`]= `0`;
433	return `1`;
434	}
435	else / Non-escaped character /
436	{
437	int rc= cs->cset->mb_wc(cs, wc, (uchar ) s, (uchar ) e);
438	if (rc > `0`)
439	return (size_t) rc;
440	}
441	return `0`;
442	}
443
444
445	static int
446	tailoring_append_abbreviation(MY_XML_PARSER *st,
447	const char fmt, size_t len, const* char *attr)
448	{
449	size_t clen;
450	const char *attrend= attr + len;
451	my_wc_t wc;
452
453	for ( ; (clen= scan_one_character(attr, attrend, &wc)) > `0`; attr+= clen)
454	{
455	DBUG_ASSERT(attr < attrend);
456	if (tailoring_append(st, fmt, clen, attr) != MY_XML_OK)
457	return MY_XML_ERROR;
458	}
459	return MY_XML_OK;
460	}
461
462
463	static int cs_enter(MY_XML_PARSER st,const* char *attr, size_t len)
464	{
465	struct my_cs_file_info i= (struct* my_cs_file_info *)st->user_data;
466	const struct my_cs_file_section_st *s= cs_file_sec(attr,len);
467	int state= s ? s->state : `0`;
468
469	switch (state) {
470	case `0`:
471	i->loader->reporter(WARNING_LEVEL, "Unknown LDML tag: '%.*s'", len, attr);
472	break;
473
474	case _CS_CHARSET:
475	my_charset_file_reset_charset(i);
476	break;
477
478	case _CS_COLLATION:
479	my_charset_file_reset_collation(i);
480	break;
481
482	case _CS_RESET:
483	return tailoring_append(st, " &", `0`, NULL);
484
485	default:
486	break;
487	}
488	return MY_XML_OK;
489	}
490
491
492	static int cs_leave(MY_XML_PARSER st,const* char *attr, size_t len)
493	{
494	struct my_cs_file_info i= (struct* my_cs_file_info *)st->user_data;
495	const struct my_cs_file_section_st *s= cs_file_sec(attr,len);
496	int state= s ? s->state : `0`;
497	int rc;
498
499	switch(state){
500	case _CS_COLLATION:
501	if (i->tailoring_length)
502	i->cs.tailoring= i->tailoring;
503	rc= i->loader->add_collation ? i->loader->add_collation(&i->cs) : MY_XML_OK;
504	break;
505
506	/ Rules: Logical Reset Positions /
507	case _CS_RESET_FIRST_NON_IGNORABLE:
508	rc= tailoring_append(st, "[first non-ignorable]", `0`, NULL);
509	break;
510
511	case _CS_RESET_LAST_NON_IGNORABLE:
512	rc= tailoring_append(st, "[last non-ignorable]", `0`, NULL);
513	break;
514
515	case _CS_RESET_FIRST_PRIMARY_IGNORABLE:
516	rc= tailoring_append(st, "[first primary ignorable]", `0`, NULL);
517	break;
518
519	case _CS_RESET_LAST_PRIMARY_IGNORABLE:
520	rc= tailoring_append(st, "[last primary ignorable]", `0`, NULL);
521	break;
522
523	case _CS_RESET_FIRST_SECONDARY_IGNORABLE:
524	rc= tailoring_append(st, "[first secondary ignorable]", `0`, NULL);
525	break;
526
527	case _CS_RESET_LAST_SECONDARY_IGNORABLE:
528	rc= tailoring_append(st, "[last secondary ignorable]", `0`, NULL);
529	break;
530
531	case _CS_RESET_FIRST_TERTIARY_IGNORABLE:
532	rc= tailoring_append(st, "[first tertiary ignorable]", `0`, NULL);
533	break;
534
535	case _CS_RESET_LAST_TERTIARY_IGNORABLE:
536	rc= tailoring_append(st, "[last tertiary ignorable]", `0`, NULL);
537	break;
538
539	case _CS_RESET_FIRST_TRAILING:
540	rc= tailoring_append(st, "[first trailing]", `0`, NULL);
541	break;
542
543	case _CS_RESET_LAST_TRAILING:
544	rc= tailoring_append(st, "[last trailing]", `0`, NULL);
545	break;
546
547	case _CS_RESET_FIRST_VARIABLE:
548	rc= tailoring_append(st, "[first variable]", `0`, NULL);
549	break;
550
551	case _CS_RESET_LAST_VARIABLE:
552	rc= tailoring_append(st, "[last variable]", `0`, NULL);
553	break;
554
555	default:
556	rc=MY_XML_OK;
557	}
558	return rc;
559	}
560
561
/*
  Tailoring formats for the five difference levels, in the order
  primary, secondary, tertiary, quaternary, identical.
  Indexed by (state - first state of the group).
*/
static const char *diff_fmt[5]=
{
  "<%.*s",
  "<<%.*s",
  "<<<%.*s",
  "<<<<%.*s",
  "=%.*s"
};
570
571
/*
  Same as diff_fmt, but for rules with a preceding context:
  the first "%.*s" is the context, the second is the value.
  Each entry consumes two (int length, const char *) pairs.
*/
static const char *context_diff_fmt[5]=
{
  "<%.*s|%.*s",
  "<<%.*s|%.*s",
  "<<<%.*s|%.*s",
  "<<<<%.*s|%.*s",
  "=%.*s|%.*s"
};
580
581
582	static int cs_value(MY_XML_PARSER st,const* char *attr, size_t len)
583	{
584	struct my_cs_file_info i= (struct* my_cs_file_info *)st->user_data;
585	const struct my_cs_file_section_st *s;
586	int state= (int)((s= cs_file_sec(st->attr.start,
587	st->attr.end - st->attr.start)) ?
588	s->state : `0`);
589	int rc= MY_XML_OK;
590
591	switch (state) {
592	case _CS_MISC:
593	case _CS_FAMILY:
594	case _CS_ORDER:
595	break;
596	case _CS_ID:
597	i->cs.number= strtol(attr,(char**)NULL,`10`);
598	break;
599	case _CS_BINARY_ID:
600	i->cs.binary_number= strtol(attr,(char**)NULL,`10`);
601	break;
602	case _CS_PRIMARY_ID:
603	i->cs.primary_number= strtol(attr,(char**)NULL,`10`);
604	break;
605	case _CS_COLNAME:
606	i->cs.name=mstr(i->name,attr,len,MY_CS_NAME_SIZE-`1`);
607	break;
608	case _CS_CSNAME:
609	i->cs.csname=mstr(i->csname,attr,len,MY_CS_NAME_SIZE-`1`);
610	break;
611	case _CS_CSDESCRIPT:
612	i->cs.comment=mstr(i->comment,attr,len,MY_CS_CSDESCR_SIZE-`1`);
613	break;
614	case _CS_FLAG:
615	if (!strncmp("primary",attr,len))
616	i->cs.state\|= MY_CS_PRIMARY;
617	else if (!strncmp("binary",attr,len))
618	i->cs.state\|= MY_CS_BINSORT;
619	else if (!strncmp("compiled",attr,len))
620	i->cs.state\|= MY_CS_COMPILED;
621	else if (!strncmp("nopad",attr,len))
622	i->cs.state\|= MY_CS_NOPAD;
623	break;
624	case _CS_UPPERMAP:
625	fill_uchar(i->to_upper,MY_CS_TO_UPPER_TABLE_SIZE,attr,len);
626	i->cs.to_upper=i->to_upper;
627	break;
628	case _CS_LOWERMAP:
629	fill_uchar(i->to_lower,MY_CS_TO_LOWER_TABLE_SIZE,attr,len);
630	i->cs.to_lower=i->to_lower;
631	break;
632	case _CS_UNIMAP:
633	fill_uint16(i->tab_to_uni,MY_CS_TO_UNI_TABLE_SIZE,attr,len);
634	i->cs.tab_to_uni=i->tab_to_uni;
635	break;
636	case _CS_COLLMAP:
637	fill_uchar(i->sort_order,MY_CS_SORT_ORDER_TABLE_SIZE,attr,len);
638	i->cs.sort_order=i->sort_order;
639	break;
640	case _CS_CTYPEMAP:
641	fill_uchar(i->ctype,MY_CS_CTYPE_TABLE_SIZE,attr,len);
642	i->cs.ctype=i->ctype;
643	break;
644
645	/ Special purpose commands /
646	case _CS_UCA_VERSION:
647	rc= tailoring_append(st, "[version %.*s]", len, attr);
648	break;
649
650	case _CS_CL_RULES_IMPORT_SOURCE:
651	rc= tailoring_append(st, "[import %.*s]", len, attr);
652	break;
653
654	case _CS_CL_SUPPRESS_CONTRACTIONS:
655	rc= tailoring_append(st, "[suppress contractions %.*s]", len, attr);
656	break;
657
658	case _CS_CL_OPTIMIZE:
659	rc= tailoring_append(st, "[optimize %.*s]", len, attr);
660	break;
661
662	case _CS_CL_SHIFT_AFTER_METHOD:
663	rc= tailoring_append(st, "[shift-after-method %.*s]", len, attr);
664	break;
665
666	/ Collation Settings /
667	case _CS_ST_STRENGTH:
668	/ 1, 2, 3, 4, 5, or primary, secondary, tertiary, quaternary, identical /
669	rc= tailoring_append(st, "[strength %.*s]", len, attr);
670	if (len && attr[`0`] >= `'1'` && attr[`0`] <= `'9'`)
671	i->cs.levels_for_order= attr[`0`] - `'0'`;
672	break;
673
674	case _CS_ST_ALTERNATE:
675	/ non-ignorable, shifted /
676	rc= tailoring_append(st, "[alternate %.*s]", len, attr);
677	break;
678
679	case _CS_ST_BACKWARDS:
680	/ on, off, 2 /
681	rc= tailoring_append(st, "[backwards %.*s]", len, attr);
682	break;
683
684	case _CS_ST_NORMALIZATION:
685	/*
686	TODO for WL#896: check collations for normalization: vi.xml
687	We want precomposed characters work well at this point.
688	*/
689	/ on, off /
690	rc= tailoring_append(st, "[normalization %.*s]", len, attr);
691	break;
692
693	case _CS_ST_CASE_LEVEL:
694	/ on, off /
695	rc= tailoring_append(st, "[caseLevel %.*s]", len, attr);
696	break;
697
698	case _CS_ST_CASE_FIRST:
699	/ upper, lower, off /
700	rc= tailoring_append(st, "[caseFirst %.*s]", len, attr);
701	break;
702
703	case _CS_ST_HIRAGANA_QUATERNARY:
704	/ on, off /
705	rc= tailoring_append(st, "[hiraganaQ %.*s]", len, attr);
706	break;
707
708	case _CS_ST_NUMERIC:
709	/ on, off /
710	rc= tailoring_append(st, "[numeric %.*s]", len, attr);
711	break;
712
713	case _CS_ST_VARIABLE_TOP:
714	/ TODO for WL#896: check value format /
715	rc= tailoring_append(st, "[variableTop %.*s]", len, attr);
716	break;
717
718	case _CS_ST_MATCH_BOUNDARIES:
719	/ none, whole-character, whole-word /
720	rc= tailoring_append(st, "[match-boundaries %.*s]", len, attr);
721	break;
722
723	case _CS_ST_MATCH_STYLE:
724	/ minimal, medial, maximal /
725	rc= tailoring_append(st, "[match-style %.*s]", len, attr);
726	break;
727
728
729	/ Rules /
730	case _CS_RESET:
731	rc= tailoring_append(st, "%.*s", len, attr);
732	break;
733
734	case _CS_DIFF1:
735	case _CS_DIFF2:
736	case _CS_DIFF3:
737	case _CS_DIFF4:
738	case _CS_IDENTICAL:
739	rc= tailoring_append(st, diff_fmt[state - _CS_DIFF1], len, attr);
740	break;
741
742
743	/ Rules: Expansion /
744	case _CS_EXP_EXTEND:
745	rc= tailoring_append(st, " / %.*s", len, attr);
746	break;
747
748	case _CS_EXP_DIFF1:
749	case _CS_EXP_DIFF2:
750	case _CS_EXP_DIFF3:
751	case _CS_EXP_DIFF4:
752	case _CS_EXP_IDENTICAL:
753	if (i->context[`0`])
754	{
755	rc= tailoring_append2(st, context_diff_fmt[state - _CS_EXP_DIFF1],
756	strlen(i->context), i->context, len, attr);
757	i->context[`0`]= `0`;
758	}
759	else
760	rc= tailoring_append(st, diff_fmt[state - _CS_EXP_DIFF1], len, attr);
761	break;
762
763	/ Rules: Context /
764	case _CS_CONTEXT:
765	if (len < sizeof(i->context))
766	{
767	memcpy(i->context, attr, len);
768	i->context[len]= `'\0'`;
769	}
770	break;
771
772	/ Rules: Abbreviating Ordering Specifications /
773	case _CS_A_DIFF1:
774	case _CS_A_DIFF2:
775	case _CS_A_DIFF3:
776	case _CS_A_DIFF4:
777	case _CS_A_IDENTICAL:
778	rc= tailoring_append_abbreviation(st, diff_fmt[state - _CS_A_DIFF1], len, attr);
779	break;
780
781	/ Rules: Placing Characters Before Others /
782	case _CS_RESET_BEFORE:
783	/*
784	TODO for WL#896: Add this check into text customization parser:
785	It is an error if the strength of the before relation is not identical
786	to the relation after the reset. We'll need this for WL#896.
787	*/
788	rc= tailoring_append(st, "[before %.*s]", len, attr);
789	break;
790
791
792	default:
793	break;
794	}
795
796	return rc;
797	}
798
799
800	my_bool
801	my_parse_charset_xml(MY_CHARSET_LOADER loader, const* char *buf, size_t len)
802	{
803	MY_XML_PARSER p;
804	struct my_cs_file_info info;
805	my_bool rc;
806
807	my_charset_file_init(&info);
808	my_xml_parser_create(&p);
809	my_xml_set_enter_handler(&p,cs_enter);
810	my_xml_set_value_handler(&p,cs_value);
811	my_xml_set_leave_handler(&p,cs_leave);
812	info.loader= loader;
813	my_xml_set_user_data(&p, (void *) &info);
814	rc= (my_xml_parse(&p,buf,len) == MY_XML_OK) ? FALSE : TRUE;
815	my_xml_parser_free(&p);
816	my_charset_file_free(&info);
817	if (rc != MY_XML_OK)
818	{
819	const char *errstr= my_xml_error_string(&p);
820	if (sizeof(loader->error) > `32` + strlen(errstr))
821	{
822	/ We cannot use my_snprintf() here. See previous comment. /
823	sprintf(loader->error, "at line %d pos %d: %s",
824	my_xml_error_lineno(&p)+`1`,
825	(int) my_xml_error_pos(&p),
826	my_xml_error_string(&p));
827	}
828	}
829	return rc;
830	}
831
832
833	uint
834	my_string_repertoire_8bit(CHARSET_INFO cs, const* char *str, size_t length)
835	{
836	const char *strend;
837	if ((cs->state & MY_CS_NONASCII) && length > `0`)
838	return MY_REPERTOIRE_UNICODE30;
839	for (strend= str + length; str < strend; str++)
840	{
841	if (((uchar) *str) > `0x7F`)
842	return MY_REPERTOIRE_UNICODE30;
843	}
844	return MY_REPERTOIRE_ASCII;
845	}
846
847
848	static void
849	my_string_metadata_init(MY_STRING_METADATA *metadata)
850	{
851	metadata->repertoire= MY_REPERTOIRE_ASCII;
852	metadata->char_length= `0`;
853	}
854
855
856	/**
857	This should probably eventually go as a virtual function into
858	MY_CHARSET_HANDLER or MY_COLLATION_HANDLER.
859	*/
860	static void
861	my_string_metadata_get_mb(MY_STRING_METADATA *metadata,
862	CHARSET_INFO cs, const* char *str, ulong length)
863	{
864	const char *strend= str + length;
865	for (my_string_metadata_init(metadata) ;
866	str < strend;
867	metadata->char_length++)
868	{
869	my_wc_t wc;
870	int mblen= cs->cset->mb_wc(cs, &wc, (const uchar *) str,
871	(const uchar *) strend);
872	if (mblen > `0`) / Assigned character /
873	{
874	if (wc > `0x7F`)
875	metadata->repertoire\|= MY_REPERTOIRE_EXTENDED;
876	str+= mblen;
877	}
878	else if (mblen == MY_CS_ILSEQ) / Bad byte sequence /
879	{
880	metadata->repertoire\|= MY_REPERTOIRE_EXTENDED;
881	str++;
882	}
883	else if (mblen > MY_CS_TOOSMALL) / Unassigned character /
884	{
885	metadata->repertoire\|= MY_REPERTOIRE_EXTENDED;
886	str+= (-mblen);
887	}
888	else / Incomplete character, premature end-of-line /
889	{
890	metadata->repertoire\|= MY_REPERTOIRE_EXTENDED; / Just in case /
891	break;
892	}
893	}
894	}
895
896
897	/**
898	Collect string metadata: length in characters and repertoire.
899	*/
900	void
901	my_string_metadata_get(MY_STRING_METADATA *metadata,
902	CHARSET_INFO cs, const* char *str, size_t length)
903	{
904	if (cs->mbmaxlen == `1` && !(cs->state & MY_CS_NONASCII))
905	{
906	metadata->char_length= length;
907	metadata->repertoire= my_string_repertoire_8bit(cs, str, (ulong)length);
908	}
909	else
910	{
911	my_string_metadata_get_mb(metadata, cs, str, (ulong)length);
912	}
913	}
914
915
916	/*
917	Check repertoire: detect pure ascii strings
918	*/
919	uint
920	my_string_repertoire(CHARSET_INFO cs, const* char *str, size_t length)
921	{
922	if (cs->mbminlen == `1` && !(cs->state & MY_CS_NONASCII))
923	{
924	return my_string_repertoire_8bit(cs, str, length);
925	}
926	else
927	{
928	const char *strend= str + length;
929	my_wc_t wc;
930	int chlen;
931	for (;
932	(chlen= cs->cset->mb_wc(cs, &wc, (uchar) str, (uchar) strend)) > `0`;
933	str+= chlen)
934	{
935	if (wc > `0x7F`)
936	return MY_REPERTOIRE_UNICODE30;
937	}
938	}
939	return MY_REPERTOIRE_ASCII;
940	}
941
942
943	/*
944	Returns repertoire for charset
945	*/
946	uint my_charset_repertoire(CHARSET_INFO *cs)
947	{
948	return cs->state & MY_CS_PUREASCII ?
949	MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30;
950	}
951
952
953	/*
954	Detect whether a character set is ASCII compatible.
955
956	Returns TRUE for:
957
958	- all 8bit character sets whose Unicode mapping of 0x7B is '{'
959	(ignores swe7 which maps 0x7B to "LATIN LETTER A WITH DIAERESIS")
960
961	- all multi-byte character sets having mbminlen == 1
962	(ignores ucs2 whose mbminlen is 2)
963
964	TODO:
965
966	When merging to 5.2, this function should be changed
967	to check a new flag MY_CS_NONASCII,
968
969	return (cs->flag & MY_CS_NONASCII) ? 0 : 1;
970
971	This flag was previously added into 5.2 under terms
972	of WL#3759 "Optimize identifier conversion in client-server protocol"
973	especially to mark character sets not compatible with ASCII.
974
975	We won't backport this flag to 5.0 or 5.1.
976	This function is Ok for 5.0 and 5.1, because we're not going
977	to introduce new tricky character sets between 5.0 and 5.2.
978	*/
979	my_bool
980	my_charset_is_ascii_based(CHARSET_INFO *cs)
981	{
982	return
983	(cs->mbmaxlen == `1` && cs->tab_to_uni && cs->tab_to_uni[`'{'`] == `'{'`) \|\|
984	(cs->mbminlen == `1` && cs->mbmaxlen > `1`);
985	}
986
987
988	/*
989	Convert a string between two character sets.
990	'to' must be large enough to store (form_length to_cs->mbmaxlen) bytes.*
991
992	@param to[OUT] Store result here
993	@param to_length Size of "to" buffer
994	@param to_cs Character set of result string
995	@param from Copy from here
996	@param from_length Length of the "from" string
997	@param from_cs Character set of the "from" string
998	@param errors[OUT] Number of conversion errors
999
1000	@return Number of bytes copied to 'to' string
1001	*/
1002
1003	uint32
1004	my_convert_using_func(char *to, size_t to_length,
1005	CHARSET_INFO *to_cs, my_charset_conv_wc_mb wc_mb,
1006	const char *from, size_t from_length,
1007	CHARSET_INFO *from_cs, my_charset_conv_mb_wc mb_wc,
1008	uint *errors)
1009	{
1010	int cnvres;
1011	my_wc_t wc;
1012	const uchar from_end= (const* uchar*) from + from_length;
1013	char *to_start= to;
1014	uchar to_end= (uchar) to + to_length;
1015	uint error_count= `0`;
1016
1017	while (`1`)
1018	{
1019	if ((cnvres= (mb_wc)(from_cs, &wc, (uchar) from, from_end)) > `0`)
1020	from+= cnvres;
1021	else if (cnvres == MY_CS_ILSEQ)
1022	{
1023	error_count++;
1024	from++;
1025	wc= `'?'`;
1026	}
1027	else if (cnvres > MY_CS_TOOSMALL)
1028	{
1029	/*
1030	A correct multibyte sequence detected
1031	But it doesn't have Unicode mapping.
1032	*/
1033	error_count++;
1034	from+= (-cnvres);
1035	wc= `'?'`;
1036	}
1037	else
1038	{
1039	if ((uchar *) from >= from_end)
1040	break; / End of line /
1041	/ Incomplete byte sequence /
1042	error_count++;
1043	from++;
1044	wc= `'?'`;
1045	}
1046
1047	outp:
1048	if ((cnvres= (wc_mb)(to_cs, wc, (uchar) to, to_end)) > `0`)
1049	to+= cnvres;
1050	else if (cnvres == MY_CS_ILUNI && wc != `'?'`)
1051	{
1052	error_count++;
1053	wc= `'?'`;
1054	goto outp;
1055	}
1056	else
1057	break;
1058	}
1059	*errors= error_count;
1060	return (uint32) (to - to_start);
1061	}
1062
1063
1064	/*
1065	Convert a string between two character sets.
1066	Optimized for quick copying of ASCII characters in the range 0x00..0x7F.
1067	'to' must be large enough to store (form_length to_cs->mbmaxlen) bytes.*
1068
1069	@param to[OUT] Store result here
1070	@param to_length Size of "to" buffer
1071	@param to_cs Character set of result string
1072	@param from Copy from here
1073	@param from_length Length of the "from" string
1074	@param from_cs Character set of the "from" string
1075	@param errors[OUT] Number of conversion errors
1076
1077	@return Number of bytes copied to 'to' string
1078	*/
1079
1080	uint32
1081	my_convert(char to, uint32 to_length, CHARSET_INFO to_cs,
1082	const char *from, uint32 from_length,
1083	CHARSET_INFO from_cs, uint errors)
1084	{
1085	uint32 length, length2;
1086	/*
1087	If any of the character sets is not ASCII compatible,
1088	immediately switch to slow mb_wc->wc_mb method.
1089	*/
1090	if ((to_cs->state \| from_cs->state) & MY_CS_NONASCII)
1091	return my_convert_using_func(to, to_length,
1092	to_cs, to_cs->cset->wc_mb,
1093	from, from_length,
1094	from_cs, from_cs->cset->mb_wc,
1095	errors);
1096
1097	length= length2= MY_MIN(to_length, from_length);
1098
1099	#if defined(__i386__) \|\| defined(__x86_64__)
1100	/*
1101	Special loop for i386, it allows to refer to a
1102	non-aligned memory block as UINT32, which makes
1103	it possible to copy four bytes at once. This
1104	gives about 10% performance improvement comparing
1105	to byte-by-byte loop.
1106	*/
1107	for ( ; length >= `4`; length-= `4`, from+= `4`, to+= `4`)
1108	{
1109	if (((uint32)from) & `0x80808080`)
1110	break;
1111	((uint32) to)= ((const* uint32*) from);
1112	}
1113	#endif /* __i386__ */
1114
1115	for (; ; to++= from++, length--)
1116	{
1117	if (!length)
1118	{
1119	*errors= `0`;
1120	return length2;
1121	}
1122	if (((unsigned* char) from) > `0x7F`) /* A non-ASCII character /
1123	{
1124	uint32 copied_length= length2 - length;
1125	to_length-= copied_length;
1126	from_length-= copied_length;
1127	return copied_length + my_convert_using_func(to, to_length, to_cs,
1128	to_cs->cset->wc_mb,
1129	from, from_length, from_cs,
1130	from_cs->cset->mb_wc,
1131	errors);
1132	}
1133	}
1134
1135	DBUG_ASSERT(FALSE); // Should never get to here
1136	return `0`; // Make compiler happy
1137	}
1138
1139
1140	size_t
1141	my_convert_fix(CHARSET_INFO to_cs, char* *to, size_t to_length,
1142	CHARSET_INFO from_cs, const* char *from, size_t from_length,
1143	size_t nchars,
1144	MY_STRCOPY_STATUS *copy_status,
1145	MY_STRCONV_STATUS *conv_status)
1146	{
1147	int cnvres;
1148	my_wc_t wc;
1149	my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
1150	my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
1151	const uchar from_end= (const* uchar*) from + from_length;
1152	uchar to_end= (uchar) to + to_length;
1153	char *to_start= to;
1154
1155	DBUG_ASSERT(to_cs != &my_charset_bin);
1156	DBUG_ASSERT(from_cs != &my_charset_bin);
1157
1158	copy_status->m_well_formed_error_pos= NULL;
1159	conv_status->m_cannot_convert_error_pos= NULL;
1160
1161	for ( ; nchars; nchars--)
1162	{
1163	const char *from_prev= from;
1164	if ((cnvres= (mb_wc)(from_cs, &wc, (uchar) from, from_end)) > `0`)
1165	from+= cnvres;
1166	else if (cnvres == MY_CS_ILSEQ)
1167	{
1168	if (!copy_status->m_well_formed_error_pos)
1169	copy_status->m_well_formed_error_pos= from;
1170	from++;
1171	wc= `'?'`;
1172	}
1173	else if (cnvres > MY_CS_TOOSMALL)
1174	{
1175	/*
1176	A correct multibyte sequence detected
1177	But it doesn't have Unicode mapping.
1178	*/
1179	if (!conv_status->m_cannot_convert_error_pos)
1180	conv_status->m_cannot_convert_error_pos= from;
1181	from+= (-cnvres);
1182	wc= `'?'`;
1183	}
1184	else
1185	{
1186	if ((uchar *) from >= from_end)
1187	break; // End of line
1188	// Incomplete byte sequence
1189	if (!copy_status->m_well_formed_error_pos)
1190	copy_status->m_well_formed_error_pos= from;
1191	from++;
1192	wc= `'?'`;
1193	}
1194	outp:
1195	if ((cnvres= (wc_mb)(to_cs, wc, (uchar) to, to_end)) > `0`)
1196	to+= cnvres;
1197	else if (cnvres == MY_CS_ILUNI && wc != `'?'`)
1198	{
1199	if (!conv_status->m_cannot_convert_error_pos)
1200	conv_status->m_cannot_convert_error_pos= from_prev;
1201	wc= `'?'`;
1202	goto outp;
1203	}
1204	else
1205	{
1206	from= from_prev;
1207	break;
1208	}
1209	}
1210	copy_status->m_source_end_pos= from;
1211	return to - to_start;
1212	}
1213

Browse the source code of MariaDB/strings/ctype.c