1 | /* Copyright (c) 2000-2003, 2005-2007 MySQL AB, 2009 Sun Microsystems, Inc. |
2 | Copyright (c) 2009-2011, Monty Program Ab |
3 | Use is subject to license terms. |
4 | Copyright (c) 2009-2011, Monty Program Ab |
5 | |
6 | This program is free software; you can redistribute it and/or modify |
7 | it under the terms of the GNU General Public License as published by |
8 | the Free Software Foundation; version 2 of the License. |
9 | |
10 | This program is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | GNU General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU General Public License |
16 | along with this program; if not, write to the Free Software |
17 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ |
18 | |
19 | #include "strings_def.h" |
20 | #include <m_ctype.h> |
21 | #include <fcntl.h> |
22 | #include <my_xml.h> |
23 | |
/* Row length (values per output line) used by print_array(). */
#define ROW_LEN 16
/* Row length (values per output line) used by print_array16(). */
#define ROW16_LEN 8
/* Size in bytes of the stack buffer a charset XML file is read into. */
#define MAX_BUF (64*1024)


/* Number of slots in the all_charsets[] and refids[] tables. */
#define MY_ALL_CHARSETS_SIZE 2048

/* Collation table; add_collation() stores each collation at index cs->number. */
static struct charset_info_st all_charsets[MY_ALL_CHARSETS_SIZE];
/*
  refids[id] is the number of the collation whose arrays collation "id"
  reuses; main() fills it in, 0 means nothing was recorded.
*/
static uint refids[MY_ALL_CHARSETS_SIZE];
33 | |
34 | static CHARSET_INFO *inheritance_source(uint id) |
35 | { |
36 | return &all_charsets[refids[id]]; |
37 | } |
38 | |
39 | |
40 | void |
41 | print_array(FILE *f, const char *set, const char *name, const uchar *a, int n) |
42 | { |
43 | int i; |
44 | |
45 | fprintf(f,"static const uchar %s_%s[] = {\n" , name, set); |
46 | |
47 | for (i=0 ;i<n ; i++) |
48 | { |
49 | fprintf(f,"0x%02X" ,a[i]); |
50 | fprintf(f, (i+1<n) ? "," :"" ); |
51 | fprintf(f, ((i+1) % ROW_LEN == n % ROW_LEN) ? "\n" : "" ); |
52 | } |
53 | fprintf(f,"};\n\n" ); |
54 | } |
55 | |
56 | |
57 | void |
58 | print_array16(FILE *f, const char *set, const char *name, const uint16 *a, int n) |
59 | { |
60 | int i; |
61 | |
62 | fprintf(f,"static const uint16 %s_%s[] = {\n" , name, set); |
63 | |
64 | for (i=0 ;i<n ; i++) |
65 | { |
66 | fprintf(f,"0x%04X" ,a[i]); |
67 | fprintf(f, (i+1<n) ? "," :"" ); |
68 | fprintf(f, ((i+1) % ROW16_LEN == n % ROW16_LEN) ? "\n" : "" ); |
69 | } |
70 | fprintf(f,"};\n\n" ); |
71 | } |
72 | |
73 | |
74 | static uint get_collation_number(const char *name) |
75 | { |
76 | CHARSET_INFO *cs; |
77 | for (cs= all_charsets; |
78 | cs < all_charsets + array_elements(all_charsets); |
79 | cs++) |
80 | { |
81 | if (cs->name && !strcmp(cs->name, name)) |
82 | return cs->number; |
83 | } |
84 | return 0; |
85 | } |
86 | |
87 | |
88 | static uint |
89 | get_charset_number_internal(const char *charset_name, uint cs_flags) |
90 | { |
91 | CHARSET_INFO *cs; |
92 | for (cs= all_charsets; |
93 | cs < all_charsets + array_elements(all_charsets); |
94 | cs++) |
95 | { |
96 | if (cs->csname && (cs->state & cs_flags) && |
97 | !strcmp(cs->csname, charset_name)) |
98 | return cs->number; |
99 | } |
100 | return 0; |
101 | } |
102 | |
103 | char *mdup(const char *src, uint len) |
104 | { |
105 | char *dst=(char*)malloc(len); |
106 | if (!dst) |
107 | exit(1); |
108 | memcpy(dst,src,len); |
109 | return dst; |
110 | } |
111 | |
112 | static void simple_cs_copy_data(struct charset_info_st *to, CHARSET_INFO *from) |
113 | { |
114 | to->number= from->number ? from->number : to->number; |
115 | to->state|= from->state; |
116 | |
117 | if (from->csname) |
118 | to->csname= strdup(from->csname); |
119 | |
120 | if (from->name) |
121 | to->name= strdup(from->name); |
122 | |
123 | if (from->tailoring) |
124 | to->tailoring= strdup(from->tailoring); |
125 | |
126 | if (from->ctype) |
127 | to->ctype= (uchar*) mdup((char*) from->ctype, MY_CS_CTYPE_TABLE_SIZE); |
128 | if (from->to_lower) |
129 | to->to_lower= (uchar*) mdup((char*) from->to_lower, MY_CS_TO_LOWER_TABLE_SIZE); |
130 | if (from->to_upper) |
131 | to->to_upper= (uchar*) mdup((char*) from->to_upper, MY_CS_TO_UPPER_TABLE_SIZE); |
132 | if (from->sort_order) |
133 | { |
134 | to->sort_order= (uchar*) mdup((char*) from->sort_order, MY_CS_SORT_ORDER_TABLE_SIZE); |
135 | /* |
136 | set_max_sort_char(to); |
137 | */ |
138 | } |
139 | if (from->tab_to_uni) |
140 | { |
141 | uint sz= MY_CS_TO_UNI_TABLE_SIZE*sizeof(uint16); |
142 | to->tab_to_uni= (uint16*) mdup((char*)from->tab_to_uni, sz); |
143 | /* |
144 | create_fromuni(to); |
145 | */ |
146 | } |
147 | } |
148 | |
149 | |
150 | /* |
151 | cs->xxx arrays can be NULL in case when a collation has an entry only |
152 | in Index.xml and has no entry in csname.xml (e.g. in case of a binary |
153 | collation or a collation using <import> command). |
154 | |
155 | refcs->xxx arrays can be NULL if <import> refers to a collation |
156 | which is not defined in csname.xml, e.g. an always compiled collation |
157 | such as latin1_swedish_ci. |
158 | */ |
159 | static void inherit_charset_data(struct charset_info_st *cs, |
160 | CHARSET_INFO *refcs) |
161 | { |
162 | cs->state|= (refcs->state & (MY_CS_PUREASCII|MY_CS_NONASCII)); |
163 | if (refcs->ctype && cs->ctype && |
164 | !memcmp(cs->ctype, refcs->ctype, MY_CS_CTYPE_TABLE_SIZE)) |
165 | cs->ctype= NULL; |
166 | if (refcs->to_lower && cs->to_lower && |
167 | !memcmp(cs->to_lower, refcs->to_lower, MY_CS_TO_LOWER_TABLE_SIZE)) |
168 | cs->to_lower= NULL; |
169 | if (refcs->to_upper && cs->to_upper && |
170 | !memcmp(cs->to_upper, refcs->to_upper, MY_CS_TO_LOWER_TABLE_SIZE)) |
171 | cs->to_upper= NULL; |
172 | if (refcs->tab_to_uni && cs->tab_to_uni && |
173 | !memcmp(cs->tab_to_uni, refcs->tab_to_uni, |
174 | MY_CS_TO_UNI_TABLE_SIZE * sizeof(uint16))) |
175 | cs->tab_to_uni= NULL; |
176 | } |
177 | |
178 | |
179 | static CHARSET_INFO *find_charset_data_inheritance_source(CHARSET_INFO *cs) |
180 | { |
181 | CHARSET_INFO *refcs; |
182 | uint refid= get_charset_number_internal(cs->csname, MY_CS_PRIMARY); |
183 | return refid && refid != cs->number && |
184 | (refcs= &all_charsets[refid]) && |
185 | (refcs->state & MY_CS_LOADED) ? refcs : NULL; |
186 | } |
187 | |
188 | |
189 | /** |
190 | Detect if "cs" needs further loading from csname.xml |
191 | @param cs - the character set pointer |
192 | @retval FALSE - if the current data (e.g. loaded from from Index.xml) |
193 | is not enough to dump the character set and requires |
194 | further reading from the csname.xml file. |
195 | @retval TRUE - if the current data is enough to dump, |
196 | no reading of csname.xml is needed. |
197 | */ |
198 | static my_bool simple_cs_is_full(CHARSET_INFO *cs) |
199 | { |
200 | return ((cs->csname && cs->tab_to_uni && cs->ctype && cs->to_upper && |
201 | cs->to_lower) && |
202 | (cs->number && cs->name && |
203 | (cs->sort_order || cs->tailoring || (cs->state & MY_CS_BINSORT)))); |
204 | } |
205 | |
206 | static int add_collation(struct charset_info_st *cs) |
207 | { |
208 | if (cs->name && |
209 | (cs->number || (cs->number= get_collation_number(cs->name)))) |
210 | { |
211 | if (!(all_charsets[cs->number].state & MY_CS_COMPILED)) |
212 | { |
213 | simple_cs_copy_data(&all_charsets[cs->number],cs); |
214 | |
215 | } |
216 | |
217 | cs->number= 0; |
218 | cs->name= NULL; |
219 | cs->tailoring= NULL; |
220 | cs->state= 0; |
221 | cs->sort_order= NULL; |
222 | cs->state= 0; |
223 | } |
224 | return MY_XML_OK; |
225 | } |
226 | |
227 | |
228 | static void |
229 | default_reporter(enum loglevel level __attribute__ ((unused)), |
230 | const char *format __attribute__ ((unused)), |
231 | ...) |
232 | { |
233 | } |
234 | |
235 | |
236 | static void |
237 | my_charset_loader_init(MY_CHARSET_LOADER *loader) |
238 | { |
239 | loader->error[0]= '\0'; |
240 | loader->once_alloc= malloc; |
241 | loader->malloc= malloc; |
242 | loader->realloc= realloc; |
243 | loader->free= free; |
244 | loader->reporter= default_reporter; |
245 | loader->add_collation= add_collation; |
246 | } |
247 | |
248 | |
249 | static int my_read_charset_file(const char *filename) |
250 | { |
251 | char buf[MAX_BUF]; |
252 | int fd; |
253 | uint len; |
254 | MY_CHARSET_LOADER loader; |
255 | |
256 | my_charset_loader_init(&loader); |
257 | if ((fd=open(filename,O_RDONLY)) < 0) |
258 | { |
259 | fprintf(stderr,"Can't open '%s'\n" ,filename); |
260 | return 1; |
261 | } |
262 | |
263 | len=read(fd,buf,MAX_BUF); |
264 | DBUG_ASSERT(len < MAX_BUF); |
265 | close(fd); |
266 | |
267 | if (my_parse_charset_xml(&loader, buf, len)) |
268 | { |
269 | fprintf(stderr, "Error while parsing '%s': %s\n" , filename, loader.error); |
270 | exit(1); |
271 | } |
272 | |
273 | return FALSE; |
274 | } |
275 | |
276 | |
277 | void print_arrays(FILE *f, CHARSET_INFO *cs) |
278 | { |
279 | if (cs->ctype) |
280 | print_array(f, cs->name, "ctype" , cs->ctype, MY_CS_CTYPE_TABLE_SIZE); |
281 | if (cs->to_lower) |
282 | print_array(f, cs->name, "to_lower" , cs->to_lower, MY_CS_TO_LOWER_TABLE_SIZE); |
283 | if (cs->to_upper) |
284 | print_array(f, cs->name, "to_upper" , cs->to_upper, MY_CS_TO_UPPER_TABLE_SIZE); |
285 | if (cs->sort_order) |
286 | print_array(f, cs->name, "sort_order" , cs->sort_order, MY_CS_SORT_ORDER_TABLE_SIZE); |
287 | if (cs->tab_to_uni) |
288 | print_array16(f, cs->name, "to_uni" , cs->tab_to_uni, MY_CS_TO_UNI_TABLE_SIZE); |
289 | } |
290 | |
291 | |
292 | /** |
293 | Print an array member of a CHARSET_INFO. |
294 | @param f - the file to print into |
295 | @param cs0 - reference to the CHARSET_INFO to print |
296 | @param array0 - pointer to the array data (can be NULL) |
297 | @param cs1 - reference to the CHARSET_INFO that the data |
298 | can be inherited from (e.g. primary collation) |
299 | @param array1 - pointer to the array data in cs1 (can be NULL) |
300 | @param name - name of the member |
301 | |
302 | If array0 is not null, then the CHARSET_INFO being dumped has its |
303 | own array (e.g. the default collation for the character set). |
304 | We print the name of this array using cs0->name and return. |
305 | |
306 | If array1 is not null, then the CHARSET_INFO being dumpled reuses |
307 | the array from another collation. We print the name of the array of |
308 | the referenced collation using cs1->name and return. |
309 | |
310 | Otherwise (if both array0 and array1 are NULL), we have a collation |
311 | of a character set whose primary collation is not available now, |
312 | and which does not have its own entry in csname.xml file. |
313 | |
314 | For example, Index.xml has this entry: |
315 | <collation name="latin1_swedish_ci_copy"> |
316 | <rules> |
317 | <import source="latin1_swedish_ci"/> |
318 | </rules> |
319 | </collation> |
320 | and latin1.xml does not have entries for latin1_swedish_ci_copy. |
321 | |
322 | In such cases we print NULL as a pointer to the array. |
323 | It will be set to a not-null data during the first initialization |
324 | by the inherit_charset_data() call (see mysys/charset.c for details). |
325 | */ |
326 | static void |
327 | print_array_ref(FILE *f, |
328 | CHARSET_INFO *cs0, const void *array0, |
329 | CHARSET_INFO *cs1, const void *array1, |
330 | const char *name) |
331 | { |
332 | CHARSET_INFO *cs= array0 ? cs0 : array1 ? cs1 : NULL; |
333 | if (cs) |
334 | fprintf(f," %s_%s, /* %s */\n" , |
335 | name, cs->name, name); |
336 | else |
337 | fprintf(f," NULL, /* %s */\n" , name); |
338 | } |
339 | |
340 | |
341 | static const char *nopad_infix(CHARSET_INFO *cs) |
342 | { |
343 | return (cs->state & MY_CS_NOPAD) ? "_nopad" : "" ; |
344 | } |
345 | |
346 | |
347 | void dispcset(FILE *f,CHARSET_INFO *cs) |
348 | { |
349 | fprintf(f,"{\n" ); |
350 | fprintf(f," %d,%d,%d,\n" ,cs->number,0,0); |
351 | fprintf(f," MY_CS_COMPILED%s%s%s%s%s%s,\n" , |
352 | cs->state & MY_CS_BINSORT ? "|MY_CS_BINSORT" : "" , |
353 | cs->state & MY_CS_PRIMARY ? "|MY_CS_PRIMARY" : "" , |
354 | cs->state & MY_CS_CSSORT ? "|MY_CS_CSSORT" : "" , |
355 | cs->state & MY_CS_PUREASCII ? "|MY_CS_PUREASCII" : "" , |
356 | cs->state & MY_CS_NONASCII ? "|MY_CS_NONASCII" : "" , |
357 | cs->state & MY_CS_NOPAD ? "|MY_CS_NOPAD" : "" ); |
358 | |
359 | if (cs->name) |
360 | { |
361 | CHARSET_INFO *srccs= inheritance_source(cs->number); |
362 | fprintf(f," \"%s\", /* cset name */\n" ,cs->csname); |
363 | fprintf(f," \"%s\", /* coll name */\n" ,cs->name); |
364 | fprintf(f," \"\", /* comment */\n" ); |
365 | if (cs->tailoring) |
366 | fprintf(f, " \"%s\", /* tailoring */\n" , cs->tailoring); |
367 | else |
368 | fprintf(f," NULL, /* tailoring */\n" ); |
369 | |
370 | print_array_ref(f, cs, cs->ctype, srccs, srccs->ctype, "ctype" ); |
371 | print_array_ref(f, cs, cs->to_lower, srccs, srccs->to_lower, "to_lower" ); |
372 | print_array_ref(f, cs, cs->to_upper, srccs, srccs->to_upper, "to_upper" ); |
373 | |
374 | if (cs->sort_order) |
375 | fprintf(f," sort_order_%s, /* sort_order */\n" ,cs->name); |
376 | else |
377 | fprintf(f," NULL, /* sort_order */\n" ); |
378 | |
379 | fprintf(f," NULL, /* uca */\n" ); |
380 | |
381 | print_array_ref(f, cs, cs->tab_to_uni, srccs, srccs->tab_to_uni, "to_uni" ); |
382 | } |
383 | else |
384 | { |
385 | fprintf(f," NULL, /* cset name */\n" ); |
386 | fprintf(f," NULL, /* coll name */\n" ); |
387 | fprintf(f," NULL, /* comment */\n" ); |
388 | fprintf(f," NULL, /* tailoging */\n" ); |
389 | fprintf(f," NULL, /* ctype */\n" ); |
390 | fprintf(f," NULL, /* lower */\n" ); |
391 | fprintf(f," NULL, /* upper */\n" ); |
392 | fprintf(f," NULL, /* sort order */\n" ); |
393 | fprintf(f," NULL, /* uca */\n" ); |
394 | fprintf(f," NULL, /* to_uni */\n" ); |
395 | } |
396 | |
397 | fprintf(f," NULL, /* from_uni */\n" ); |
398 | fprintf(f," &my_unicase_default, /* caseinfo */\n" ); |
399 | fprintf(f," NULL, /* state map */\n" ); |
400 | fprintf(f," NULL, /* ident map */\n" ); |
401 | fprintf(f," 1, /* strxfrm_multiply*/\n" ); |
402 | fprintf(f," 1, /* caseup_multiply*/\n" ); |
403 | fprintf(f," 1, /* casedn_multiply*/\n" ); |
404 | fprintf(f," 1, /* mbminlen */\n" ); |
405 | fprintf(f," 1, /* mbmaxlen */\n" ); |
406 | fprintf(f," 0, /* min_sort_char */\n" ); |
407 | fprintf(f," 255, /* max_sort_char */\n" ); |
408 | fprintf(f," ' ', /* pad_char */\n" ); |
409 | fprintf(f," 0, /* escape_with_backslash_is_dangerous */\n" ); |
410 | fprintf(f," 1, /* levels_for_order */\n" ); |
411 | fprintf(f," &my_charset_8bit_handler,\n" ); |
412 | |
413 | if (cs->state & MY_CS_BINSORT) |
414 | fprintf(f," &my_collation_8bit%s_bin_handler,\n" , nopad_infix(cs)); |
415 | else |
416 | fprintf(f," &my_collation_8bit_simple%s_ci_handler,\n" , nopad_infix(cs)); |
417 | fprintf(f,"}\n" ); |
418 | } |
419 | |
420 | |
/*
  Write the copyright and license header of the generated source file
  to "file".
*/
static void
fprint_copyright(FILE *file)
{
  static const char notice[]=
    "/* Copyright 2000-2008 MySQL AB, 2008 Sun Microsystems, Inc.\n"
    " Copyright (c) 2000, 2011, Oracle and/or its affiliates.\n"
    " Copyright 2008-2016 MariaDB Corporation\n"
    "\n"
    " This program is free software; you can redistribute it and/or modify\n"
    " it under the terms of the GNU General Public License as published by\n"
    " the Free Software Foundation; version 2 of the License.\n"
    "\n"
    " This program is distributed in the hope that it will be useful,\n"
    " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
    " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"
    " GNU General Public License for more details.\n"
    "\n"
    " You should have received a copy of the GNU General Public License\n"
    " along with this program; if not, write to the Free Software\n"
    " Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */\n"
    "\n";

  /* The text contains no conversion specifications, so fputs() is enough */
  fputs(notice, file);
}
443 | |
444 | |
445 | int |
446 | main(int argc, char **argv __attribute__((unused))) |
447 | { |
448 | struct charset_info_st ncs, *cs; |
449 | char filename[256]; |
450 | FILE *f= stdout; |
451 | |
452 | if (argc < 2) |
453 | { |
454 | fprintf(stderr, "usage: %s source-dir\n" , argv[0]); |
455 | exit(EXIT_FAILURE); |
456 | } |
457 | |
458 | bzero((void*)&ncs,sizeof(ncs)); |
459 | bzero((void*)&all_charsets,sizeof(all_charsets)); |
460 | bzero((void*) refids, sizeof(refids)); |
461 | |
462 | sprintf(filename,"%s/%s" ,argv[1],"Index.xml" ); |
463 | my_read_charset_file(filename); |
464 | |
465 | for (cs= all_charsets; |
466 | cs < all_charsets + array_elements(all_charsets); |
467 | cs++) |
468 | { |
469 | if (cs->number && !(cs->state & MY_CS_COMPILED)) |
470 | { |
471 | if ( (!simple_cs_is_full(cs)) && (cs->csname)) |
472 | { |
473 | sprintf(filename,"%s/%s.xml" ,argv[1],cs->csname); |
474 | my_read_charset_file(filename); |
475 | } |
476 | cs->state|= MY_CS_LOADED; |
477 | } |
478 | } |
479 | |
480 | fprintf(f, "/*\n" ); |
481 | fprintf(f, " This file was generated by the conf_to_src utility. " |
482 | "Do not edit it directly,\n" ); |
483 | fprintf(f, " edit the XML definitions in sql/share/charsets/ instead.\n\n" ); |
484 | fprintf(f, " To re-generate, run the following in the strings/ " |
485 | "directory:\n" ); |
486 | fprintf(f, " ./conf_to_src ../sql/share/charsets/ > FILE\n" ); |
487 | fprintf(f, "*/\n\n" ); |
488 | fprint_copyright(f); |
489 | fprintf(f,"#include \"strings_def.h\"\n" ); |
490 | fprintf(f,"#include <m_ctype.h>\n\n" ); |
491 | |
492 | |
493 | for (cs= all_charsets; |
494 | cs < all_charsets + array_elements(all_charsets); |
495 | cs++) |
496 | { |
497 | if (cs->state & MY_CS_LOADED) |
498 | { |
499 | CHARSET_INFO *refcs= find_charset_data_inheritance_source(cs); |
500 | cs->state|= my_8bit_charset_flags_from_data(cs) | |
501 | my_8bit_collation_flags_from_data(cs); |
502 | if (refcs) |
503 | { |
504 | refids[cs->number]= refcs->number; |
505 | inherit_charset_data(cs, refcs); |
506 | } |
507 | fprintf(f,"#ifdef HAVE_CHARSET_%s\n" ,cs->csname); |
508 | print_arrays(f, cs); |
509 | fprintf(f,"#endif\n" ); |
510 | fprintf(f,"\n" ); |
511 | } |
512 | } |
513 | |
514 | fprintf(f,"struct charset_info_st compiled_charsets[] = {\n" ); |
515 | for (cs= all_charsets; |
516 | cs < all_charsets + array_elements(all_charsets); |
517 | cs++) |
518 | { |
519 | if (cs->state & MY_CS_LOADED) |
520 | { |
521 | fprintf(f,"#ifdef HAVE_CHARSET_%s\n" ,cs->csname); |
522 | dispcset(f,cs); |
523 | fprintf(f,",\n" ); |
524 | fprintf(f,"#endif\n" ); |
525 | } |
526 | } |
527 | |
528 | dispcset(f,&ncs); |
529 | fprintf(f,"};\n" ); |
530 | |
531 | return 0; |
532 | } |
533 | |