1/* Copyright (c) 2003, 2011, Oracle and/or its affiliates.
2 Copyright (c) 2011 Monty Program Ab
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; version 2 of the License.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */
16
17#include "strings_def.h"
18#include "m_string.h"
19#include "my_xml.h"
20#include "my_sys.h"
21
22
/*
  Token codes returned by my_xml_scan().
*/

/* Punctuation tokens: the code is the punctuation character itself */
#define MY_XML_EQ        '='
#define MY_XML_LT        '<'
#define MY_XML_GT        '>'
#define MY_XML_SLASH     '/'
#define MY_XML_QUESTION  '?'
#define MY_XML_EXCLAM    '!'

/* Tokens that span several input characters */
#define MY_XML_STRING    'S'
#define MY_XML_IDENT     'I'
#define MY_XML_COMMENT   'C'
#define MY_XML_CDATA     'D'
#define MY_XML_TEXT      'T'

/* Special tokens */
#define MY_XML_EOF       'E'
#define MY_XML_UNKNOWN   'U'
36
/*
  A fragment of the parsed input (token, attribute value or text),
  described by a pair of pointers: the fragment length is end - beg.
*/
struct xml_attr_st
{
  const char *beg;   /* First character of the fragment     */
  const char *end;   /* One past the last fragment character */
};

typedef struct xml_attr_st MY_XML_ATTR;
42
43
44/*
45 XML ctype:
46*/
/* Bit flags stored in my_xml_ctype[] */
#define MY_XML_ID0  (1 << 0) /* Identifier initial character */
#define MY_XML_ID1  (1 << 1) /* Identifier medial character  */
#define MY_XML_SPC  (1 << 3) /* Spacing character            */
50
51
52/*
53 http://www.w3.org/TR/REC-xml/
54 [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' |
55 CombiningChar | Extender
56 [5] Name ::= (Letter | '_' | ':') (NameChar)*
57*/
58
/*
  Character classification table, indexed by unsigned byte value.

  Entry values are combinations of the XML ctype bits:
    8 - spacing character (TAB, LF, CR, SPACE)
    2 - may appear inside an identifier, but not start it ('-', '.', digits)
    3 - may both start and continue an identifier (letters, ':', '_')
    0 - none of the above

  All bytes in the range 0x80..0xFF are classified as identifier
  characters (presumably so that names in multi-byte encodings pass
  through the scanner unchanged -- TODO confirm).

  The table is read-only, hence "const".
*/
static const char my_xml_ctype[256]=
{
/*00*/ 0,0,0,0,0,0,0,0,0,8,8,0,0,8,0,0,
/*10*/ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
/*20*/ 8,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0, /*  !"#$%&'()*+,-./ */
/*30*/ 2,2,2,2,2,2,2,2,2,2,3,0,0,0,0,0, /* 0123456789:;<=>? */
/*40*/ 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* @ABCDEFGHIJKLMNO */
/*50*/ 3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,3, /* PQRSTUVWXYZ[\]^_ */
/*60*/ 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* `abcdefghijklmno */
/*70*/ 3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0, /* pqrstuvwxyz{|}~  */
/*80*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*90*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*A0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*B0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*C0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*D0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*E0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*F0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
};
78
/*
  Character class tests. Each macro yields a non-zero value when the
  character has the corresponding flag set in my_xml_ctype[]; the
  argument is converted to uchar before being used as the table index.
*/
#define my_xml_is_id0(c)    (my_xml_ctype[(uchar) (c)] & MY_XML_ID0)
#define my_xml_is_id1(c)    (my_xml_ctype[(uchar) (c)] & MY_XML_ID1)
#define my_xml_is_space(c)  (my_xml_ctype[(uchar) (c)] & MY_XML_SPC)
82
83
84static const char *lex2str(int lex)
85{
86 switch(lex)
87 {
88 case MY_XML_EOF: return "END-OF-INPUT";
89 case MY_XML_STRING: return "STRING";
90 case MY_XML_IDENT: return "IDENT";
91 case MY_XML_CDATA: return "CDATA";
92 case MY_XML_EQ: return "'='";
93 case MY_XML_LT: return "'<'";
94 case MY_XML_GT: return "'>'";
95 case MY_XML_SLASH: return "'/'";
96 case MY_XML_COMMENT: return "COMMENT";
97 case MY_XML_TEXT: return "TEXT";
98 case MY_XML_QUESTION: return "'?'";
99 case MY_XML_EXCLAM: return "'!'";
100 }
101 return "unknown token";
102}
103
104static void my_xml_norm_text(MY_XML_ATTR *a)
105{
106 for ( ; (a->beg < a->end) && my_xml_is_space(a->beg[0]) ; a->beg++ );
107 for ( ; (a->beg < a->end) && my_xml_is_space(a->end[-1]) ; a->end-- );
108}
109
110
111static inline my_bool
112my_xml_parser_prefix_cmp(MY_XML_PARSER *p, const char *s, size_t slen)
113{
114 return (p->cur + slen > p->end) || memcmp(p->cur, s, slen);
115}
116
117
118static int my_xml_scan(MY_XML_PARSER *p,MY_XML_ATTR *a)
119{
120 int lex;
121
122 for (; ( p->cur < p->end) && my_xml_is_space(p->cur[0]) ; p->cur++);
123
124 if (p->cur >= p->end)
125 {
126 a->beg=p->end;
127 a->end=p->end;
128 lex=MY_XML_EOF;
129 goto ret;
130 }
131
132 a->beg=p->cur;
133 a->end=p->cur;
134
135 if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("<!--")))
136 {
137 for (; p->cur < p->end; p->cur++)
138 {
139 if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("-->")))
140 {
141 p->cur+= 3;
142 break;
143 }
144 }
145 a->end=p->cur;
146 lex=MY_XML_COMMENT;
147 }
148 else if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("<![CDATA[")))
149 {
150 p->cur+= 9;
151 for (; p->cur < p->end - 2 ; p->cur++)
152 {
153 if (p->cur[0] == ']' && p->cur[1] == ']' && p->cur[2] == '>')
154 {
155 p->cur+= 3;
156 a->end= p->cur;
157 break;
158 }
159 }
160 lex= MY_XML_CDATA;
161 }
162 else if (strchr("?=/<>!",p->cur[0]))
163 {
164 p->cur++;
165 a->end=p->cur;
166 lex=a->beg[0];
167 }
168 else if ( (p->cur[0] == '"') || (p->cur[0] == '\'') )
169 {
170 /*
171 "string" or 'string' found.
172 Scan until the closing quote/doublequote, or until the END-OF-INPUT.
173 */
174 p->cur++;
175 for (; ( p->cur < p->end ) && (p->cur[0] != a->beg[0]); p->cur++)
176 {}
177 a->end=p->cur;
178 if (p->cur < p->end) /* Closing quote or doublequote has been found */
179 p->cur++;
180 a->beg++;
181 if (!(p->flags & MY_XML_FLAG_SKIP_TEXT_NORMALIZATION))
182 my_xml_norm_text(a);
183 lex=MY_XML_STRING;
184 }
185 else if (my_xml_is_id0(p->cur[0]))
186 {
187 p->cur++;
188 while (p->cur < p->end && my_xml_is_id1(p->cur[0]))
189 p->cur++;
190 a->end=p->cur;
191 my_xml_norm_text(a);
192 lex=MY_XML_IDENT;
193 }
194 else
195 lex= MY_XML_UNKNOWN;
196
197#if 0
198 printf("LEX=%s[%d]\n",lex2str(lex),a->end-a->beg);
199#endif
200
201ret:
202 return lex;
203}
204
205
206static int my_xml_value(MY_XML_PARSER *st, const char *str, size_t len)
207{
208 return (st->value) ? (st->value)(st,str,len) : MY_XML_OK;
209}
210
211
212/**
213 Ensure the attr buffer is wide enough to hold the new value
214
215 Expand and/or allocate dynamic buffer as needed to hold the concatenated
216 path and the terminating zero.
217
218 @attr st the parser instance
219 @attr len the length of the attribute to be added
220 @return state
221 @retval 1 failed
222 @retval 0 success
223*/
224static int my_xml_attr_ensure_space(MY_XML_PARSER *st, size_t len)
225{
226 size_t ofs= st->attr.end - st->attr.start;
227 len++; // Add terminating zero.
228 if (ofs + len > st->attr.buffer_size)
229 {
230 st->attr.buffer_size= (SIZE_T_MAX - len) / 2 > st->attr.buffer_size ?
231 st->attr.buffer_size * 2 + len : SIZE_T_MAX;
232
233 if (!st->attr.buffer)
234 {
235 st->attr.buffer= (char *) my_malloc(st->attr.buffer_size, MYF(0));
236 if (st->attr.buffer)
237 memcpy(st->attr.buffer, st->attr.static_buffer, ofs + 1 /*term. zero */);
238 }
239 else
240 st->attr.buffer= (char *) my_realloc(st->attr.buffer,
241 st->attr.buffer_size, MYF(0));
242 st->attr.start= st->attr.buffer;
243 st->attr.end= st->attr.start + ofs;
244
245 return st->attr.buffer ? MY_XML_OK : MY_XML_ERROR;
246 }
247 return MY_XML_OK;
248}
249
250
251/** rewind the attr buffer to initial state */
252static void my_xml_attr_rewind(MY_XML_PARSER *p)
253{
254 /* keep the buffer already allocated */
255 p->attr.end= p->attr.start;
256}
257
258
259static int my_xml_enter(MY_XML_PARSER *st, const char *str, size_t len)
260{
261 if (my_xml_attr_ensure_space(st, len + 1 /* the separator char */))
262 return MY_XML_ERROR;
263
264 if (st->attr.end > st->attr.start)
265 {
266 st->attr.end[0]= '/';
267 st->attr.end++;
268 }
269 memcpy(st->attr.end, str, len);
270 st->attr.end+= len;
271 st->attr.end[0]= '\0';
272 if (st->flags & MY_XML_FLAG_RELATIVE_NAMES)
273 return st->enter ? st->enter(st, str, len) : MY_XML_OK;
274 else
275 return st->enter ?
276 st->enter(st, st->attr.start, st->attr.end - st->attr.start) : MY_XML_OK;
277}
278
279
/**
  Copy at most min(l1, l2) bytes and zero-terminate the result.

  @param s    destination, must have room for min(l1, l2) + 1 bytes
  @param src  source bytes, not necessarily zero-terminated
  @param l1   first length limit
  @param l2   second length limit
*/
static void mstr(char *s,const char *src,size_t l1, size_t l2)
{
  size_t n= l2;

  if (l1 < l2)
    n= l1;
  memcpy(s, src, n);
  s[n]= '\0';
}
286
287
288static int my_xml_leave(MY_XML_PARSER *p, const char *str, size_t slen)
289{
290 char *e, *tag;
291 size_t glen;
292 char s[32];
293 char g[32];
294 int rc;
295
296 /* Find previous '/' or beginning */
297 for (e= p->attr.end; (e > p->attr.start) && (e[0] != '/') ; e--);
298 glen= (size_t) ((e[0] == '/') ? (p->attr.end - e - 1) : p->attr.end - e);
299 tag= e[0] == '/' ? e + 1 : e;
300
301 if (str && (slen != glen || memcmp(str, tag, slen)))
302 {
303 mstr(s,str,sizeof(s)-1,slen);
304 if (glen)
305 {
306 mstr(g, tag, sizeof(g)-1, glen);
307 sprintf(p->errstr,"'</%s>' unexpected ('</%s>' wanted)",s,g);
308 }
309 else
310 sprintf(p->errstr,"'</%s>' unexpected (END-OF-INPUT wanted)", s);
311 return MY_XML_ERROR;
312 }
313
314 if (p->flags & MY_XML_FLAG_RELATIVE_NAMES)
315 rc= p->leave_xml ? p->leave_xml(p, str, slen) : MY_XML_OK;
316 else
317 rc= (p->leave_xml ?
318 p->leave_xml(p, p->attr.start, p->attr.end - p->attr.start) :
319 MY_XML_OK);
320
321 *e='\0';
322 p->attr.end= e;
323
324 return rc;
325}
326
327
328int my_xml_parse(MY_XML_PARSER *p,const char *str, size_t len)
329{
330
331 my_xml_attr_rewind(p);
332
333 p->beg=str;
334 p->cur=str;
335 p->end=str+len;
336
337 while ( p->cur < p->end )
338 {
339 MY_XML_ATTR a;
340 if (p->cur[0] == '<')
341 {
342 int lex;
343 int question=0;
344 int exclam=0;
345
346 lex=my_xml_scan(p,&a);
347
348 if (MY_XML_COMMENT == lex)
349 continue;
350
351 if (lex == MY_XML_CDATA)
352 {
353 a.beg+= 9;
354 a.end-= 3;
355 my_xml_value(p, a.beg, (size_t) (a.end-a.beg));
356 continue;
357 }
358
359 lex=my_xml_scan(p,&a);
360
361 if (MY_XML_SLASH == lex)
362 {
363 if (MY_XML_IDENT != (lex=my_xml_scan(p,&a)))
364 {
365 sprintf(p->errstr,"%s unexpected (ident wanted)",lex2str(lex));
366 return MY_XML_ERROR;
367 }
368 if (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg)))
369 return MY_XML_ERROR;
370 lex=my_xml_scan(p,&a);
371 goto gt;
372 }
373
374 if (MY_XML_EXCLAM == lex)
375 {
376 lex=my_xml_scan(p,&a);
377 exclam=1;
378 }
379 else if (MY_XML_QUESTION == lex)
380 {
381 lex=my_xml_scan(p,&a);
382 question=1;
383 }
384
385 if (MY_XML_IDENT == lex)
386 {
387 p->current_node_type= MY_XML_NODE_TAG;
388 if (MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg)))
389 return MY_XML_ERROR;
390 }
391 else
392 {
393 sprintf(p->errstr,"%s unexpected (ident or '/' wanted)",
394 lex2str(lex));
395 return MY_XML_ERROR;
396 }
397
398 while ((MY_XML_IDENT == (lex=my_xml_scan(p,&a))) ||
399 ((MY_XML_STRING == lex && exclam)))
400 {
401 MY_XML_ATTR b;
402 if (MY_XML_EQ == (lex=my_xml_scan(p,&b)))
403 {
404 lex=my_xml_scan(p,&b);
405 if ( (lex == MY_XML_IDENT) || (lex == MY_XML_STRING) )
406 {
407 p->current_node_type= MY_XML_NODE_ATTR;
408 if ((MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg))) ||
409 (MY_XML_OK != my_xml_value(p,b.beg,(size_t) (b.end-b.beg))) ||
410 (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg))))
411 return MY_XML_ERROR;
412 }
413 else
414 {
415 sprintf(p->errstr,"%s unexpected (ident or string wanted)",
416 lex2str(lex));
417 return MY_XML_ERROR;
418 }
419 }
420 else if (MY_XML_IDENT == lex)
421 {
422 p->current_node_type= MY_XML_NODE_ATTR;
423 if ((MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg))) ||
424 (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg))))
425 return MY_XML_ERROR;
426 }
427 else if ((MY_XML_STRING == lex) && exclam)
428 {
429 /*
430 We are in <!DOCTYPE>, e.g.
431 <!DOCTYPE name SYSTEM "SystemLiteral">
432 <!DOCTYPE name PUBLIC "PublidLiteral" "SystemLiteral">
433 Just skip "SystemLiteral" and "PublicidLiteral"
434 */
435 }
436 else
437 break;
438 }
439
440 if (lex == MY_XML_SLASH)
441 {
442 if (MY_XML_OK != my_xml_leave(p,NULL,0))
443 return MY_XML_ERROR;
444 lex=my_xml_scan(p,&a);
445 }
446
447gt:
448 if (question)
449 {
450 if (lex != MY_XML_QUESTION)
451 {
452 sprintf(p->errstr,"%s unexpected ('?' wanted)",lex2str(lex));
453 return MY_XML_ERROR;
454 }
455 if (MY_XML_OK != my_xml_leave(p,NULL,0))
456 return MY_XML_ERROR;
457 lex=my_xml_scan(p,&a);
458 }
459
460 if (exclam)
461 {
462 if (MY_XML_OK != my_xml_leave(p,NULL,0))
463 return MY_XML_ERROR;
464 }
465
466 if (lex != MY_XML_GT)
467 {
468 sprintf(p->errstr,"%s unexpected ('>' wanted)",lex2str(lex));
469 return MY_XML_ERROR;
470 }
471 }
472 else
473 {
474 a.beg=p->cur;
475 for ( ; (p->cur < p->end) && (p->cur[0] != '<') ; p->cur++);
476 a.end=p->cur;
477
478 if (!(p->flags & MY_XML_FLAG_SKIP_TEXT_NORMALIZATION))
479 my_xml_norm_text(&a);
480 if (a.beg != a.end)
481 {
482 my_xml_value(p,a.beg,(size_t) (a.end-a.beg));
483 }
484 }
485 }
486
487 if (p->attr.start[0])
488 {
489 sprintf(p->errstr,"unexpected END-OF-INPUT");
490 return MY_XML_ERROR;
491 }
492 return MY_XML_OK;
493}
494
495
496void my_xml_parser_create(MY_XML_PARSER *p)
497{
498 memset(p, 0, sizeof(p[0]));
499 /*
500 Use static buffer while it's sufficient.
501 */
502 p->attr.start= p->attr.end= p->attr.static_buffer;
503 p->attr.buffer_size= sizeof(p->attr.static_buffer);
504}
505
506
507void my_xml_parser_free(MY_XML_PARSER *p)
508{
509 if (p->attr.buffer)
510 {
511 my_free(p->attr.buffer);
512 p->attr.buffer= NULL;
513 }
514}
515
516
517void my_xml_set_value_handler(MY_XML_PARSER *p,
518 int (*action)(MY_XML_PARSER *p, const char *s,
519 size_t l))
520{
521 p->value=action;
522}
523
524void my_xml_set_enter_handler(MY_XML_PARSER *p,
525 int (*action)(MY_XML_PARSER *p, const char *s,
526 size_t l))
527{
528 p->enter=action;
529}
530
531
532void my_xml_set_leave_handler(MY_XML_PARSER *p,
533 int (*action)(MY_XML_PARSER *p, const char *s,
534 size_t l))
535{
536 p->leave_xml=action;
537}
538
539
540void my_xml_set_user_data(MY_XML_PARSER *p, void *user_data)
541{
542 p->user_data=user_data;
543}
544
545
546const char *my_xml_error_string(MY_XML_PARSER *p)
547{
548 return p->errstr;
549}
550
551
552size_t my_xml_error_pos(MY_XML_PARSER *p)
553{
554 const char *beg=p->beg;
555 const char *s;
556 for ( s=p->beg ; s<p->cur; s++)
557 {
558 if (s[0] == '\n')
559 beg=s;
560 }
561 return (size_t) (p->cur-beg);
562}
563
564uint my_xml_error_lineno(MY_XML_PARSER *p)
565{
566 uint res=0;
567 const char *s;
568 for (s=p->beg ; s<p->cur; s++)
569 {
570 if (s[0] == '\n')
571 res++;
572 }
573 return res;
574}
575