1/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
2 See the file COPYING for copying permission.
3*/
4
5/* This file is included! */
6#ifdef XML_TOK_IMPL_C
7
/* Fallback used when the including file has not supplied its own
   IS_INVALID_CHAR: every n-byte sequence is then treated as valid
   (the macro always evaluates to 0). */
#ifndef IS_INVALID_CHAR
#define IS_INVALID_CHAR(enc, ptr, n) (0)
#endif
11
/* Expands to the `case BT_LEADn:` arm of a switch over a byte type.
   - fewer than n bytes before `end`: return XML_TOK_PARTIAL_CHAR;
   - IS_INVALID_CHAR rejects the sequence: store ptr in *nextTokPtr
     and return XML_TOK_INVALID;
   - otherwise: step ptr over the n bytes and break out of the switch.
   Note that `enc` and `end` are not macro parameters; they are taken
   from the scope in which the macro is expanded. */
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
    case BT_LEAD ## n: \
      if (end - ptr < n) \
        return XML_TOK_PARTIAL_CHAR; \
      if (IS_INVALID_CHAR(enc, ptr, n)) { \
        *(nextTokPtr) = (ptr); \
        return XML_TOK_INVALID; \
      } \
      ptr += n; \
      break;
22
/* Expands to the switch arms that reject bad input: the 2-, 3- and
   4-byte lead cases are handled through INVALID_LEAD_CASE, and
   BT_NONXML, BT_MALFORM and BT_TRAIL store ptr in *nextTokPtr and
   return XML_TOK_INVALID.  Like INVALID_LEAD_CASE, the expansion uses
   `enc` and `end` from the surrounding scope. */
#define INVALID_CASES(ptr, nextTokPtr) \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  case BT_NONXML: \
  case BT_MALFORM: \
  case BT_TRAIL: \
    *(nextTokPtr) = (ptr); \
    return XML_TOK_INVALID;
32
/* Expands to the `case BT_LEADn:` arm used while scanning the body of
   a name.
   - fewer than n bytes before `end`: return XML_TOK_PARTIAL_CHAR;
   - the n-byte sequence is malformed (IS_INVALID_CHAR) or is not a
     name character (IS_NAME_CHAR): store ptr in *nextTokPtr and
     return XML_TOK_INVALID;
   - otherwise: step ptr over the n bytes and break out of the switch.
   The validity test must come first: IS_NAME_CHAR only classifies the
   sequence and must not be trusted to reject malformed input. */
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if (end - ptr < n) \
       return XML_TOK_PARTIAL_CHAR; \
     if (IS_INVALID_CHAR(enc, ptr, n) || !IS_NAME_CHAR(enc, ptr, n)) { \
       *nextTokPtr = ptr; \
       return XML_TOK_INVALID; \
     } \
     ptr += n; \
     break;
43
/* Expands to every switch arm that accepts one name character.
   BT_NONASCII is first checked with IS_NAME_CHAR_MINBPC (returning
   XML_TOK_INVALID with *nextTokPtr = ptr on failure) and then shares
   the single-unit advance with BT_NMSTRT, BT_HEX, BT_DIGIT, BT_NAME
   and BT_MINUS.  The 2-, 3- and 4-byte lead cases are delegated to
   CHECK_NAME_CASE.  Each accepting arm ends in `break`, so the macro
   must be expanded directly inside a switch statement. */
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
  case BT_DIGIT: \
  case BT_NAME: \
  case BT_MINUS: \
    ptr += MINBPC(enc); \
    break; \
  CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
60
/* Expands to the `case BT_LEADn:` arm used for the first character of
   a name.
   - fewer than n bytes before `end`: return XML_TOK_PARTIAL_CHAR;
   - the n-byte sequence is malformed (IS_INVALID_CHAR) or is not a
     name-start character (IS_NMSTRT_CHAR): store ptr in *nextTokPtr
     and return XML_TOK_INVALID;
   - otherwise: step ptr over the n bytes and break out of the switch.
   The validity test must come first: IS_NMSTRT_CHAR only classifies
   the sequence and must not be trusted to reject malformed input. */
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if (end - ptr < n) \
       return XML_TOK_PARTIAL_CHAR; \
     if (IS_INVALID_CHAR(enc, ptr, n) || !IS_NMSTRT_CHAR(enc, ptr, n)) { \
       *nextTokPtr = ptr; \
       return XML_TOK_INVALID; \
     } \
     ptr += n; \
     break;
71
/* Expands to every switch arm that accepts one name-start character.
   BT_NONASCII is first checked with IS_NMSTRT_CHAR_MINBPC (returning
   XML_TOK_INVALID with *nextTokPtr = ptr on failure) and then shares
   the single-unit advance with BT_NMSTRT and BT_HEX.  The 2-, 3- and
   4-byte lead cases are delegated to CHECK_NMSTRT_CASE.  Each
   accepting arm ends in `break`, so the macro must be expanded
   directly inside a switch statement. */
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
    ptr += MINBPC(enc); \
    break; \
  CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
85
/* PREFIX wraps every function name defined below.  When the including
   file has not defined it, names are used unchanged. */
#ifndef PREFIX
#define PREFIX(ident) ident
#endif
89
90
/* True when at least `count` minimum-size characters (count *
   MINBPC(enc) bytes) lie between ptr and end.  All arguments are
   parenthesized so that expressions such as `a + b` can be passed
   safely. */
#define HAS_CHARS(enc, ptr, end, count) \
    ((end) - (ptr) >= ((count) * MINBPC(enc)))

/* True when at least one minimum-size character lies between ptr and
   end. */
#define HAS_CHAR(enc, ptr, end) \
    HAS_CHARS(enc, ptr, end, 1)

/* Returns XML_TOK_PARTIAL from the enclosing function unless at least
   `count` minimum-size characters are available.  Expands to a brace
   block; call sites follow it with a semicolon. */
#define REQUIRE_CHARS(enc, ptr, end, count) \
    { \
      if (! HAS_CHARS(enc, ptr, end, count)) { \
        return XML_TOK_PARTIAL; \
      } \
    }

/* Single-character form of REQUIRE_CHARS. */
#define REQUIRE_CHAR(enc, ptr, end) \
    REQUIRE_CHARS(enc, ptr, end, 1)
106
107
108/* ptr points to character following "<!-" */
109
110static int PTRCALL
111PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
112 const char *end, const char **nextTokPtr)
113{
114 if (HAS_CHAR(enc, ptr, end)) {
115 if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
116 *nextTokPtr = ptr;
117 return XML_TOK_INVALID;
118 }
119 ptr += MINBPC(enc);
120 while (HAS_CHAR(enc, ptr, end)) {
121 switch (BYTE_TYPE(enc, ptr)) {
122 INVALID_CASES(ptr, nextTokPtr)
123 case BT_MINUS:
124 ptr += MINBPC(enc);
125 REQUIRE_CHAR(enc, ptr, end);
126 if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
127 ptr += MINBPC(enc);
128 REQUIRE_CHAR(enc, ptr, end);
129 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
130 *nextTokPtr = ptr;
131 return XML_TOK_INVALID;
132 }
133 *nextTokPtr = ptr + MINBPC(enc);
134 return XML_TOK_COMMENT;
135 }
136 break;
137 default:
138 ptr += MINBPC(enc);
139 break;
140 }
141 }
142 }
143 return XML_TOK_PARTIAL;
144}
145
146/* ptr points to character following "<!" */
147
148static int PTRCALL
149PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
150 const char *end, const char **nextTokPtr)
151{
152 REQUIRE_CHAR(enc, ptr, end);
153 switch (BYTE_TYPE(enc, ptr)) {
154 case BT_MINUS:
155 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
156 case BT_LSQB:
157 *nextTokPtr = ptr + MINBPC(enc);
158 return XML_TOK_COND_SECT_OPEN;
159 case BT_NMSTRT:
160 case BT_HEX:
161 ptr += MINBPC(enc);
162 break;
163 default:
164 *nextTokPtr = ptr;
165 return XML_TOK_INVALID;
166 }
167 while (HAS_CHAR(enc, ptr, end)) {
168 switch (BYTE_TYPE(enc, ptr)) {
169 case BT_PERCNT:
170 REQUIRE_CHARS(enc, ptr, end, 2);
171 /* don't allow <!ENTITY% foo "whatever"> */
172 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
173 case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
174 *nextTokPtr = ptr;
175 return XML_TOK_INVALID;
176 }
177 /* fall through */
178 case BT_S: case BT_CR: case BT_LF:
179 *nextTokPtr = ptr;
180 return XML_TOK_DECL_OPEN;
181 case BT_NMSTRT:
182 case BT_HEX:
183 ptr += MINBPC(enc);
184 break;
185 default:
186 *nextTokPtr = ptr;
187 return XML_TOK_INVALID;
188 }
189 }
190 return XML_TOK_PARTIAL;
191}
192
193static int PTRCALL
194PREFIX(checkPiTarget)(const ENCODING *UNUSED_P(enc), const char *ptr,
195 const char *end, int *tokPtr)
196{
197 int upper = 0;
198 *tokPtr = XML_TOK_PI;
199 if (end - ptr != MINBPC(enc)*3)
200 return 1;
201 switch (BYTE_TO_ASCII(enc, ptr)) {
202 case ASCII_x:
203 break;
204 case ASCII_X:
205 upper = 1;
206 break;
207 default:
208 return 1;
209 }
210 ptr += MINBPC(enc);
211 switch (BYTE_TO_ASCII(enc, ptr)) {
212 case ASCII_m:
213 break;
214 case ASCII_M:
215 upper = 1;
216 break;
217 default:
218 return 1;
219 }
220 ptr += MINBPC(enc);
221 switch (BYTE_TO_ASCII(enc, ptr)) {
222 case ASCII_l:
223 break;
224 case ASCII_L:
225 upper = 1;
226 break;
227 default:
228 return 1;
229 }
230 if (upper)
231 return 0;
232 *tokPtr = XML_TOK_XML_DECL;
233 return 1;
234}
235
/* Scans a processing instruction.  On entry ptr points to the
   character following "<?".

   The target name is scanned first and classified by checkPiTarget
   once it is terminated by whitespace or '?'.  On success the token
   chosen by checkPiTarget is returned with *nextTokPtr just past the
   closing "?>".  XML_TOK_INVALID is returned (with *nextTokPtr at the
   offending position) for a bad name character, a target rejected by
   checkPiTarget, an invalid character in the instruction data, or a
   '?' directly after the target that is not followed by '>'.
   XML_TOK_PARTIAL is returned when the input ends first. */

static int PTRCALL
PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
               const char *end, const char **nextTokPtr)
{
  int tok;
  const char *target = ptr; /* start of the target name */
  REQUIRE_CHAR(enc, ptr, end);
  /* first character of the target must be a name-start character */
  switch (BYTE_TYPE(enc, ptr)) {
  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    case BT_S: case BT_CR: case BT_LF:
      /* whitespace ends the target; [target, ptr) is the name */
      if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      ptr += MINBPC(enc);
      /* skip the instruction data up to the first "?>" */
      while (HAS_CHAR(enc, ptr, end)) {
        switch (BYTE_TYPE(enc, ptr)) {
        INVALID_CASES(ptr, nextTokPtr)
        case BT_QUEST:
          ptr += MINBPC(enc);
          REQUIRE_CHAR(enc, ptr, end);
          if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
            *nextTokPtr = ptr + MINBPC(enc);
            return tok;
          }
          /* a '?' not followed by '>' is ordinary data; the following
             character is examined again on the next iteration */
          break;
        default:
          ptr += MINBPC(enc);
          break;
        }
      }
      return XML_TOK_PARTIAL;
    case BT_QUEST:
      /* '?' directly after the target: only "?>" is acceptable */
      if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
        *nextTokPtr = ptr + MINBPC(enc);
        return tok;
      }
      /* fall through */
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
296
297static int PTRCALL
298PREFIX(scanCdataSection)(const ENCODING *UNUSED_P(enc), const char *ptr,
299 const char *end, const char **nextTokPtr)
300{
301 static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
302 ASCII_T, ASCII_A, ASCII_LSQB };
303 int i;
304 /* CDATA[ */
305 REQUIRE_CHARS(enc, ptr, end, 6);
306 for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
307 if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
308 *nextTokPtr = ptr;
309 return XML_TOK_INVALID;
310 }
311 }
312 *nextTokPtr = ptr;
313 return XML_TOK_CDATA_SECT_OPEN;
314}
315
/* Returns the next token inside a CDATA section.

   - XML_TOK_NONE when ptr >= end.
   - XML_TOK_CDATA_SECT_CLOSE for "]]>", with *nextTokPtr past the '>'.
   - XML_TOK_DATA_NEWLINE for CR, CR LF or LF.
   - XML_TOK_INVALID (via INVALID_CASES) when the first character is
     rejected.
   - XML_TOK_DATA_CHARS for a run of other characters; the run stops
     before a line break, a ']', or a character that cannot be
     consumed.
   - XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when more input is needed
     to decide. */
static int PTRCALL
PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
                        const char *end, const char **nextTokPtr)
{
  if (ptr >= end)
    return XML_TOK_NONE;
  if (MINBPC(enc) > 1) {
    /* Round the input length down to a multiple of MINBPC(enc) so a
       trailing incomplete unit is never examined.  The mask arithmetic
       presumes MINBPC(enc) is a power of two. */
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  /* The first character decides whether this is a special token or the
     start of a data run. */
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_RSQB:
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
      break;
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
      /* "]]" not followed by '>': step back so that the second ']' is
         looked at again by the data loop below */
      ptr -= MINBPC(enc);
      break;
    }
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_CDATA_SECT_CLOSE;
  case BT_CR:
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    /* CR LF is reported as a single newline */
    if (BYTE_TYPE(enc, ptr) == BT_LF)
      ptr += MINBPC(enc);
    *nextTokPtr = ptr;
    return XML_TOK_DATA_NEWLINE;
  case BT_LF:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DATA_NEWLINE;
  INVALID_CASES(ptr, nextTokPtr)
  default:
    ptr += MINBPC(enc);
    break;
  }
  /* Extend the data run for as long as plain characters follow. */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
/* A truncated or invalid multi-byte sequence ends the run here rather
   than being reported as an error. */
#define LEAD_CASE(n) \
    case BT_LEAD ## n: \
      if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
        *nextTokPtr = ptr; \
        return XML_TOK_DATA_CHARS; \
      } \
      ptr += n; \
      break;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
    case BT_NONXML:
    case BT_MALFORM:
    case BT_TRAIL:
    case BT_CR:
    case BT_LF:
    case BT_RSQB:
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}
388
/* Scans an end tag.  On entry ptr points to the character following
   "</".

   Returns XML_TOK_END_TAG with *nextTokPtr just past the closing '>'.
   The name may be followed by whitespace before the '>'.  Returns
   XML_TOK_INVALID with *nextTokPtr at the offending character, or
   XML_TOK_PARTIAL when the input ends before the '>'. */

static int PTRCALL
PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
                   const char *end, const char **nextTokPtr)
{
  REQUIRE_CHAR(enc, ptr, end);
  /* first character of the name */
  switch (BYTE_TYPE(enc, ptr)) {
  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  /* rest of the name */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    case BT_S: case BT_CR: case BT_LF:
      /* after the name only whitespace and the closing '>' may follow */
      for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
        switch (BYTE_TYPE(enc, ptr)) {
        case BT_S: case BT_CR: case BT_LF:
          break;
        case BT_GT:
          *nextTokPtr = ptr + MINBPC(enc);
          return XML_TOK_END_TAG;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
      }
      return XML_TOK_PARTIAL;
#ifdef XML_NS
    case BT_COLON:
      /* no need to check qname syntax here,
         since end-tag must match exactly */
      ptr += MINBPC(enc);
      break;
#endif
    case BT_GT:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_END_TAG;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
436
437/* ptr points to character following "&#X" */
438
439static int PTRCALL
440PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
441 const char *end, const char **nextTokPtr)
442{
443 if (HAS_CHAR(enc, ptr, end)) {
444 switch (BYTE_TYPE(enc, ptr)) {
445 case BT_DIGIT:
446 case BT_HEX:
447 break;
448 default:
449 *nextTokPtr = ptr;
450 return XML_TOK_INVALID;
451 }
452 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
453 switch (BYTE_TYPE(enc, ptr)) {
454 case BT_DIGIT:
455 case BT_HEX:
456 break;
457 case BT_SEMI:
458 *nextTokPtr = ptr + MINBPC(enc);
459 return XML_TOK_CHAR_REF;
460 default:
461 *nextTokPtr = ptr;
462 return XML_TOK_INVALID;
463 }
464 }
465 }
466 return XML_TOK_PARTIAL;
467}
468
469/* ptr points to character following "&#" */
470
471static int PTRCALL
472PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
473 const char *end, const char **nextTokPtr)
474{
475 if (HAS_CHAR(enc, ptr, end)) {
476 if (CHAR_MATCHES(enc, ptr, ASCII_x))
477 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
478 switch (BYTE_TYPE(enc, ptr)) {
479 case BT_DIGIT:
480 break;
481 default:
482 *nextTokPtr = ptr;
483 return XML_TOK_INVALID;
484 }
485 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
486 switch (BYTE_TYPE(enc, ptr)) {
487 case BT_DIGIT:
488 break;
489 case BT_SEMI:
490 *nextTokPtr = ptr + MINBPC(enc);
491 return XML_TOK_CHAR_REF;
492 default:
493 *nextTokPtr = ptr;
494 return XML_TOK_INVALID;
495 }
496 }
497 }
498 return XML_TOK_PARTIAL;
499}
500
501/* ptr points to character following "&" */
502
503static int PTRCALL
504PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
505 const char **nextTokPtr)
506{
507 REQUIRE_CHAR(enc, ptr, end);
508 switch (BYTE_TYPE(enc, ptr)) {
509 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
510 case BT_NUM:
511 return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
512 default:
513 *nextTokPtr = ptr;
514 return XML_TOK_INVALID;
515 }
516 while (HAS_CHAR(enc, ptr, end)) {
517 switch (BYTE_TYPE(enc, ptr)) {
518 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
519 case BT_SEMI:
520 *nextTokPtr = ptr + MINBPC(enc);
521 return XML_TOK_ENTITY_REF;
522 default:
523 *nextTokPtr = ptr;
524 return XML_TOK_INVALID;
525 }
526 }
527 return XML_TOK_PARTIAL;
528}
529
/* Scans the attributes of a start tag.  On entry ptr points to the
   character following the first character of an attribute name.

   Returns XML_TOK_START_TAG_WITH_ATTS with *nextTokPtr past '>',
   XML_TOK_EMPTY_ELEMENT_WITH_ATTS with *nextTokPtr past "/>",
   XML_TOK_INVALID with *nextTokPtr at the offending character, or
   XML_TOK_PARTIAL when the input ends first.  A non-positive result
   from scanRef inside an attribute value is passed through unchanged.
   When XML_NS is defined, at most one ':' is accepted per attribute
   name and it must be followed by a name-start character. */

static int PTRCALL
PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
                 const char **nextTokPtr)
{
#ifdef XML_NS
  int hadColon = 0; /* set once the current attribute name has a ':' */
#endif
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
#ifdef XML_NS
    case BT_COLON:
      if (hadColon) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      hadColon = 1;
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      /* the local part must begin with a name-start character */
      switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      break;
#endif
    case BT_S: case BT_CR: case BT_LF:
      /* whitespace after the name: skip ahead to the '=' */
      for (;;) {
        int t;

        ptr += MINBPC(enc);
        REQUIRE_CHAR(enc, ptr, end);
        t = BYTE_TYPE(enc, ptr);
        if (t == BT_EQUALS)
          break;
        switch (t) {
        case BT_S:
        case BT_LF:
        case BT_CR:
          break;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
      }
      /* fall through */
    case BT_EQUALS:
      {
        int open; /* byte type of the opening quote */
#ifdef XML_NS
        hadColon = 0;
#endif
        /* skip whitespace between '=' and the opening quote */
        for (;;) {
          ptr += MINBPC(enc);
          REQUIRE_CHAR(enc, ptr, end);
          open = BYTE_TYPE(enc, ptr);
          if (open == BT_QUOT || open == BT_APOS)
            break;
          switch (open) {
          case BT_S:
          case BT_LF:
          case BT_CR:
            break;
          default:
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
        }
        ptr += MINBPC(enc);
        /* in attribute value */
        for (;;) {
          int t;
          REQUIRE_CHAR(enc, ptr, end);
          t = BYTE_TYPE(enc, ptr);
          if (t == open)
            break;
          switch (t) {
          INVALID_CASES(ptr, nextTokPtr)
          case BT_AMP:
            {
              /* scanRef stores the position after the reference (or
                 the error position) directly into ptr */
              int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
              if (tok <= 0) {
                if (tok == XML_TOK_INVALID)
                  *nextTokPtr = ptr;
                return tok;
              }
              break;
            }
          case BT_LT:
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          default:
            ptr += MINBPC(enc);
            break;
          }
        }
        /* step over the closing quote and look at what follows it */
        ptr += MINBPC(enc);
        REQUIRE_CHAR(enc, ptr, end);
        switch (BYTE_TYPE(enc, ptr)) {
        case BT_S:
        case BT_CR:
        case BT_LF:
          break;
        case BT_SOL:
          goto sol;
        case BT_GT:
          goto gt;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
        /* ptr points to the whitespace character that follows the
           closing quote */
        for (;;) {
          ptr += MINBPC(enc);
          REQUIRE_CHAR(enc, ptr, end);
          switch (BYTE_TYPE(enc, ptr)) {
          /* a name-start character begins the next attribute; its
             `break` leaves the switch and the `break` after the switch
             leaves this loop */
          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
          case BT_S: case BT_CR: case BT_LF:
            continue;
          case BT_GT:
          gt:
            *nextTokPtr = ptr + MINBPC(enc);
            return XML_TOK_START_TAG_WITH_ATTS;
          case BT_SOL:
          sol:
            ptr += MINBPC(enc);
            REQUIRE_CHAR(enc, ptr, end);
            if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
              *nextTokPtr = ptr;
              return XML_TOK_INVALID;
            }
            *nextTokPtr = ptr + MINBPC(enc);
            return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
          default:
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
          break;
        }
        break;
      }
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
681
/* Scans markup in content.  On entry ptr points to the character
   following "<".

   "<!-" is handed to scanComment, "<![" to scanCdataSection, "<?" to
   scanPi and "</" to scanEndTag.  Otherwise a start tag is scanned:
   returns XML_TOK_START_TAG_NO_ATTS (past '>'),
   XML_TOK_EMPTY_ELEMENT_NO_ATTS (past "/>"), or the result of scanAtts
   once an attribute name begins.  Returns XML_TOK_INVALID with
   *nextTokPtr at the offending character, or XML_TOK_PARTIAL when the
   input ends first. */

static int PTRCALL
PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
               const char **nextTokPtr)
{
#ifdef XML_NS
  int hadColon;
#endif
  REQUIRE_CHAR(enc, ptr, end);
  switch (BYTE_TYPE(enc, ptr)) {
  /* a name-start character begins a start tag; fall out of the switch */
  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  case BT_EXCL:
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_MINUS:
      return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    case BT_LSQB:
      return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
                                      end, nextTokPtr);
    }
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  case BT_QUEST:
    return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_SOL:
    return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
#ifdef XML_NS
  hadColon = 0;
#endif
  /* we have a start-tag */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
#ifdef XML_NS
    case BT_COLON:
      /* at most one ':' and it must be followed by a name-start
         character */
      if (hadColon) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      hadColon = 1;
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      break;
#endif
    case BT_S: case BT_CR: case BT_LF:
      {
        /* whitespace after the element name: skip it and decide
           between '>', "/>" and the start of an attribute */
        ptr += MINBPC(enc);
        while (HAS_CHAR(enc, ptr, end)) {
          switch (BYTE_TYPE(enc, ptr)) {
          /* consumes the first character of an attribute name, then
             breaks out of the switch to the scanAtts call below */
          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
          case BT_GT:
            goto gt;
          case BT_SOL:
            goto sol;
          case BT_S: case BT_CR: case BT_LF:
            ptr += MINBPC(enc);
            continue;
          default:
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
          return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
        }
        return XML_TOK_PARTIAL;
      }
    case BT_GT:
    gt:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_START_TAG_NO_ATTS;
    case BT_SOL:
    sol:
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
780
/* Returns the next token in element content.

   - XML_TOK_NONE when ptr >= end.
   - '<' is handed to scanLt and '&' to scanRef.
   - XML_TOK_DATA_NEWLINE for CR, CR LF or LF; XML_TOK_TRAILING_CR when
     a CR is the last available character.
   - XML_TOK_TRAILING_RSQB when the input ends after "]" or "]]".
   - XML_TOK_INVALID for "]]>" and for characters rejected by
     INVALID_CASES.
   - XML_TOK_DATA_CHARS for a run of other characters, with *nextTokPtr
     at the character that stopped the run.
   - XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when more input is needed. */
static int PTRCALL
PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
                   const char **nextTokPtr)
{
  if (ptr >= end)
    return XML_TOK_NONE;
  if (MINBPC(enc) > 1) {
    /* Round the input length down to a multiple of MINBPC(enc) so a
       trailing incomplete unit is never examined.  The mask arithmetic
       presumes MINBPC(enc) is a power of two. */
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  /* The first character decides whether this is markup, a newline, or
     the start of a data run. */
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_LT:
    return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_AMP:
    return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_CR:
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return XML_TOK_TRAILING_CR;
    /* CR LF is reported as a single newline */
    if (BYTE_TYPE(enc, ptr) == BT_LF)
      ptr += MINBPC(enc);
    *nextTokPtr = ptr;
    return XML_TOK_DATA_NEWLINE;
  case BT_LF:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DATA_NEWLINE;
  case BT_RSQB:
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return XML_TOK_TRAILING_RSQB;
    if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
      break;
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return XML_TOK_TRAILING_RSQB;
    if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
      /* "]]" not followed by '>': step back so that the second ']' is
         looked at again by the data loop below */
      ptr -= MINBPC(enc);
      break;
    }
    /* "]]>" is not allowed in content */
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  INVALID_CASES(ptr, nextTokPtr)
  default:
    ptr += MINBPC(enc);
    break;
  }
  /* Extend the data run for as long as plain characters follow. */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
/* A truncated or invalid multi-byte sequence ends the run here rather
   than being reported as an error. */
#define LEAD_CASE(n) \
    case BT_LEAD ## n: \
      if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
        *nextTokPtr = ptr; \
        return XML_TOK_DATA_CHARS; \
      } \
      ptr += n; \
      break;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
    case BT_RSQB:
      /* look ahead for "]]>" without consuming anything */
      if (HAS_CHARS(enc, ptr, end, 2)) {
         if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
           ptr += MINBPC(enc);
           break;
         }
         if (HAS_CHARS(enc, ptr, end, 3)) {
           if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
             ptr += MINBPC(enc);
             break;
           }
           *nextTokPtr = ptr + 2*MINBPC(enc);
           return XML_TOK_INVALID;
         }
      }
      /* not enough lookahead: end the data run before the ']' */
      /* fall through */
    case BT_AMP:
    case BT_LT:
    case BT_NONXML:
    case BT_MALFORM:
    case BT_TRAIL:
    case BT_CR:
    case BT_LF:
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}
877
878/* ptr points to character following "%" */
879
880static int PTRCALL
881PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
882 const char **nextTokPtr)
883{
884 REQUIRE_CHAR(enc, ptr, end);
885 switch (BYTE_TYPE(enc, ptr)) {
886 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
887 case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
888 *nextTokPtr = ptr;
889 return XML_TOK_PERCENT;
890 default:
891 *nextTokPtr = ptr;
892 return XML_TOK_INVALID;
893 }
894 while (HAS_CHAR(enc, ptr, end)) {
895 switch (BYTE_TYPE(enc, ptr)) {
896 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
897 case BT_SEMI:
898 *nextTokPtr = ptr + MINBPC(enc);
899 return XML_TOK_PARAM_ENTITY_REF;
900 default:
901 *nextTokPtr = ptr;
902 return XML_TOK_INVALID;
903 }
904 }
905 return XML_TOK_PARTIAL;
906}
907
908static int PTRCALL
909PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
910 const char **nextTokPtr)
911{
912 REQUIRE_CHAR(enc, ptr, end);
913 switch (BYTE_TYPE(enc, ptr)) {
914 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
915 default:
916 *nextTokPtr = ptr;
917 return XML_TOK_INVALID;
918 }
919 while (HAS_CHAR(enc, ptr, end)) {
920 switch (BYTE_TYPE(enc, ptr)) {
921 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
922 case BT_CR: case BT_LF: case BT_S:
923 case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
924 *nextTokPtr = ptr;
925 return XML_TOK_POUND_NAME;
926 default:
927 *nextTokPtr = ptr;
928 return XML_TOK_INVALID;
929 }
930 }
931 return -XML_TOK_POUND_NAME;
932}
933
934static int PTRCALL
935PREFIX(scanLit)(int open, const ENCODING *enc,
936 const char *ptr, const char *end,
937 const char **nextTokPtr)
938{
939 while (HAS_CHAR(enc, ptr, end)) {
940 int t = BYTE_TYPE(enc, ptr);
941 switch (t) {
942 INVALID_CASES(ptr, nextTokPtr)
943 case BT_QUOT:
944 case BT_APOS:
945 ptr += MINBPC(enc);
946 if (t != open)
947 break;
948 if (! HAS_CHAR(enc, ptr, end))
949 return -XML_TOK_LITERAL;
950 *nextTokPtr = ptr;
951 switch (BYTE_TYPE(enc, ptr)) {
952 case BT_S: case BT_CR: case BT_LF:
953 case BT_GT: case BT_PERCNT: case BT_LSQB:
954 return XML_TOK_LITERAL;
955 default:
956 return XML_TOK_INVALID;
957 }
958 default:
959 ptr += MINBPC(enc);
960 break;
961 }
962 }
963 return XML_TOK_PARTIAL;
964}
965
966static int PTRCALL
967PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
968 const char **nextTokPtr)
969{
970 int tok;
971 if (ptr >= end)
972 return XML_TOK_NONE;
973 if (MINBPC(enc) > 1) {
974 size_t n = end - ptr;
975 if (n & (MINBPC(enc) - 1)) {
976 n &= ~(MINBPC(enc) - 1);
977 if (n == 0)
978 return XML_TOK_PARTIAL;
979 end = ptr + n;
980 }
981 }
982 switch (BYTE_TYPE(enc, ptr)) {
983 case BT_QUOT:
984 return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
985 case BT_APOS:
986 return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
987 case BT_LT:
988 {
989 ptr += MINBPC(enc);
990 REQUIRE_CHAR(enc, ptr, end);
991 switch (BYTE_TYPE(enc, ptr)) {
992 case BT_EXCL:
993 return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
994 case BT_QUEST:
995 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
996 case BT_NMSTRT:
997 case BT_HEX:
998 case BT_NONASCII:
999 case BT_LEAD2:
1000 case BT_LEAD3:
1001 case BT_LEAD4:
1002 *nextTokPtr = ptr - MINBPC(enc);
1003 return XML_TOK_INSTANCE_START;
1004 }
1005 *nextTokPtr = ptr;
1006 return XML_TOK_INVALID;
1007 }
1008 case BT_CR:
1009 if (ptr + MINBPC(enc) == end) {
1010 *nextTokPtr = end;
1011 /* indicate that this might be part of a CR/LF pair */
1012 return -XML_TOK_PROLOG_S;
1013 }
1014 /* fall through */
1015 case BT_S: case BT_LF:
1016 for (;;) {
1017 ptr += MINBPC(enc);
1018 if (! HAS_CHAR(enc, ptr, end))
1019 break;
1020 switch (BYTE_TYPE(enc, ptr)) {
1021 case BT_S: case BT_LF:
1022 break;
1023 case BT_CR:
1024 /* don't split CR/LF pair */
1025 if (ptr + MINBPC(enc) != end)
1026 break;
1027 /* fall through */
1028 default:
1029 *nextTokPtr = ptr;
1030 return XML_TOK_PROLOG_S;
1031 }
1032 }
1033 *nextTokPtr = ptr;
1034 return XML_TOK_PROLOG_S;
1035 case BT_PERCNT:
1036 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1037 case BT_COMMA:
1038 *nextTokPtr = ptr + MINBPC(enc);
1039 return XML_TOK_COMMA;
1040 case BT_LSQB:
1041 *nextTokPtr = ptr + MINBPC(enc);
1042 return XML_TOK_OPEN_BRACKET;
1043 case BT_RSQB:
1044 ptr += MINBPC(enc);
1045 if (! HAS_CHAR(enc, ptr, end))
1046 return -XML_TOK_CLOSE_BRACKET;
1047 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1048 REQUIRE_CHARS(enc, ptr, end, 2);
1049 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1050 *nextTokPtr = ptr + 2*MINBPC(enc);
1051 return XML_TOK_COND_SECT_CLOSE;
1052 }
1053 }
1054 *nextTokPtr = ptr;
1055 return XML_TOK_CLOSE_BRACKET;
1056 case BT_LPAR:
1057 *nextTokPtr = ptr + MINBPC(enc);
1058 return XML_TOK_OPEN_PAREN;
1059 case BT_RPAR:
1060 ptr += MINBPC(enc);
1061 if (! HAS_CHAR(enc, ptr, end))
1062 return -XML_TOK_CLOSE_PAREN;
1063 switch (BYTE_TYPE(enc, ptr)) {
1064 case BT_AST:
1065 *nextTokPtr = ptr + MINBPC(enc);
1066 return XML_TOK_CLOSE_PAREN_ASTERISK;
1067 case BT_QUEST:
1068 *nextTokPtr = ptr + MINBPC(enc);
1069 return XML_TOK_CLOSE_PAREN_QUESTION;
1070 case BT_PLUS:
1071 *nextTokPtr = ptr + MINBPC(enc);
1072 return XML_TOK_CLOSE_PAREN_PLUS;
1073 case BT_CR: case BT_LF: case BT_S:
1074 case BT_GT: case BT_COMMA: case BT_VERBAR:
1075 case BT_RPAR:
1076 *nextTokPtr = ptr;
1077 return XML_TOK_CLOSE_PAREN;
1078 }
1079 *nextTokPtr = ptr;
1080 return XML_TOK_INVALID;
1081 case BT_VERBAR:
1082 *nextTokPtr = ptr + MINBPC(enc);
1083 return XML_TOK_OR;
1084 case BT_GT:
1085 *nextTokPtr = ptr + MINBPC(enc);
1086 return XML_TOK_DECL_CLOSE;
1087 case BT_NUM:
1088 return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1089#define LEAD_CASE(n) \
1090 case BT_LEAD ## n: \
1091 if (end - ptr < n) \
1092 return XML_TOK_PARTIAL_CHAR; \
1093 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1094 ptr += n; \
1095 tok = XML_TOK_NAME; \
1096 break; \
1097 } \
1098 if (IS_NAME_CHAR(enc, ptr, n)) { \
1099 ptr += n; \
1100 tok = XML_TOK_NMTOKEN; \
1101 break; \
1102 } \
1103 *nextTokPtr = ptr; \
1104 return XML_TOK_INVALID;
1105 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1106#undef LEAD_CASE
1107 case BT_NMSTRT:
1108 case BT_HEX:
1109 tok = XML_TOK_NAME;
1110 ptr += MINBPC(enc);
1111 break;
1112 case BT_DIGIT:
1113 case BT_NAME:
1114 case BT_MINUS:
1115#ifdef XML_NS
1116 case BT_COLON:
1117#endif
1118 tok = XML_TOK_NMTOKEN;
1119 ptr += MINBPC(enc);
1120 break;
1121 case BT_NONASCII:
1122 if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1123 ptr += MINBPC(enc);
1124 tok = XML_TOK_NAME;
1125 break;
1126 }
1127 if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1128 ptr += MINBPC(enc);
1129 tok = XML_TOK_NMTOKEN;
1130 break;
1131 }
1132 /* fall through */
1133 default:
1134 *nextTokPtr = ptr;
1135 return XML_TOK_INVALID;
1136 }
1137 while (HAS_CHAR(enc, ptr, end)) {
1138 switch (BYTE_TYPE(enc, ptr)) {
1139 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1140 case BT_GT: case BT_RPAR: case BT_COMMA:
1141 case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1142 case BT_S: case BT_CR: case BT_LF:
1143 *nextTokPtr = ptr;
1144 return tok;
1145#ifdef XML_NS
1146 case BT_COLON:
1147 ptr += MINBPC(enc);
1148 switch (tok) {
1149 case XML_TOK_NAME:
1150 REQUIRE_CHAR(enc, ptr, end);
1151 tok = XML_TOK_PREFIXED_NAME;
1152 switch (BYTE_TYPE(enc, ptr)) {
1153 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1154 default:
1155 tok = XML_TOK_NMTOKEN;
1156 break;
1157 }
1158 break;
1159 case XML_TOK_PREFIXED_NAME:
1160 tok = XML_TOK_NMTOKEN;
1161 break;
1162 }
1163 break;
1164#endif
1165 case BT_PLUS:
1166 if (tok == XML_TOK_NMTOKEN) {
1167 *nextTokPtr = ptr;
1168 return XML_TOK_INVALID;
1169 }
1170 *nextTokPtr = ptr + MINBPC(enc);
1171 return XML_TOK_NAME_PLUS;
1172 case BT_AST:
1173 if (tok == XML_TOK_NMTOKEN) {
1174 *nextTokPtr = ptr;
1175 return XML_TOK_INVALID;
1176 }
1177 *nextTokPtr = ptr + MINBPC(enc);
1178 return XML_TOK_NAME_ASTERISK;
1179 case BT_QUEST:
1180 if (tok == XML_TOK_NMTOKEN) {
1181 *nextTokPtr = ptr;
1182 return XML_TOK_INVALID;
1183 }
1184 *nextTokPtr = ptr + MINBPC(enc);
1185 return XML_TOK_NAME_QUESTION;
1186 default:
1187 *nextTokPtr = ptr;
1188 return XML_TOK_INVALID;
1189 }
1190 }
1191 return -tok;
1192}
1193
/* Returns the next token inside an attribute value.

   - XML_TOK_NONE when ptr >= end.
   - A '&' at the start is handed to scanRef.
   - XML_TOK_DATA_NEWLINE for a leading CR, CR LF or LF;
     XML_TOK_TRAILING_CR when a leading CR is the last available
     character.
   - XML_TOK_ATTRIBUTE_VALUE_S for a leading BT_S character.
   - XML_TOK_INVALID when a '<' is encountered (*nextTokPtr at the '<').
   - Otherwise XML_TOK_DATA_CHARS for the run of characters up to the
     next '&', line break or BT_S character (or the end of input), with
     *nextTokPtr at the end of the run. */
static int PTRCALL
PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
                          const char *end, const char **nextTokPtr)
{
  const char *start;
  if (ptr >= end)
    return XML_TOK_NONE;
  else if (! HAS_CHAR(enc, ptr, end)) {
    /* This line cannot be executed.  The incoming data has already
     * been tokenized once, so incomplete characters like this have
     * already been eliminated from the input.  Retaining the paranoia
     * check is still valuable, however.
     */
    return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
  }
  start = ptr; /* lets each case tell "first character" from "mid-run" */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
/* Multi-byte sequences are skipped without any length or validity
   check here.  NOTE(review): this presumably relies on the input
   having been validated by an earlier tokenizing pass - confirm. */
#define LEAD_CASE(n) \
    case BT_LEAD ## n: ptr += n; break;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
    case BT_AMP:
      if (ptr == start)
        return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    case BT_LT:
      /* this is for inside entity references */
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    case BT_LF:
      if (ptr == start) {
        *nextTokPtr = ptr + MINBPC(enc);
        return XML_TOK_DATA_NEWLINE;
      }
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    case BT_CR:
      if (ptr == start) {
        ptr += MINBPC(enc);
        if (! HAS_CHAR(enc, ptr, end))
          return XML_TOK_TRAILING_CR;
        /* CR LF is reported as a single newline */
        if (BYTE_TYPE(enc, ptr) == BT_LF)
          ptr += MINBPC(enc);
        *nextTokPtr = ptr;
        return XML_TOK_DATA_NEWLINE;
      }
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    case BT_S:
      if (ptr == start) {
        *nextTokPtr = ptr + MINBPC(enc);
        return XML_TOK_ATTRIBUTE_VALUE_S;
      }
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}
1259
/* Returns the next token inside an entity value.

   - XML_TOK_NONE when ptr >= end.
   - A '&' at the start is handed to scanRef.
   - A '%' at the start is handed to scanPercent; a bare percent
     (XML_TOK_PERCENT) is turned into XML_TOK_INVALID, any other result
     is returned unchanged.
   - XML_TOK_DATA_NEWLINE for a leading CR, CR LF or LF;
     XML_TOK_TRAILING_CR when a leading CR is the last available
     character.
   - Otherwise XML_TOK_DATA_CHARS for the run of characters up to the
     next '&', '%' or line break (or the end of input), with
     *nextTokPtr at the end of the run. */
static int PTRCALL
PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
                       const char *end, const char **nextTokPtr)
{
  const char *start;
  if (ptr >= end)
    return XML_TOK_NONE;
  else if (! HAS_CHAR(enc, ptr, end)) {
    /* This line cannot be executed.  The incoming data has already
     * been tokenized once, so incomplete characters like this have
     * already been eliminated from the input.  Retaining the paranoia
     * check is still valuable, however.
     */
    return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
  }
  start = ptr; /* lets each case tell "first character" from "mid-run" */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
/* Multi-byte sequences are skipped without any length or validity
   check here.  NOTE(review): this presumably relies on the input
   having been validated by an earlier tokenizing pass - confirm. */
#define LEAD_CASE(n) \
    case BT_LEAD ## n: ptr += n; break;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
    case BT_AMP:
      if (ptr == start)
        return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    case BT_PERCNT:
      if (ptr == start) {
        int tok =  PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
                                       end, nextTokPtr);
        return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
      }
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    case BT_LF:
      if (ptr == start) {
        *nextTokPtr = ptr + MINBPC(enc);
        return XML_TOK_DATA_NEWLINE;
      }
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    case BT_CR:
      if (ptr == start) {
        ptr += MINBPC(enc);
        if (! HAS_CHAR(enc, ptr, end))
          return XML_TOK_TRAILING_CR;
        /* CR LF is reported as a single newline */
        if (BYTE_TYPE(enc, ptr) == BT_LF)
          ptr += MINBPC(enc);
        *nextTokPtr = ptr;
        return XML_TOK_DATA_NEWLINE;
      }
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}
1322
1323#ifdef XML_DTD
1324
1325static int PTRCALL
1326PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
1327 const char *end, const char **nextTokPtr)
1328{
1329 int level = 0;
1330 if (MINBPC(enc) > 1) {
1331 size_t n = end - ptr;
1332 if (n & (MINBPC(enc) - 1)) {
1333 n &= ~(MINBPC(enc) - 1);
1334 end = ptr + n;
1335 }
1336 }
1337 while (HAS_CHAR(enc, ptr, end)) {
1338 switch (BYTE_TYPE(enc, ptr)) {
1339 INVALID_CASES(ptr, nextTokPtr)
1340 case BT_LT:
1341 ptr += MINBPC(enc);
1342 REQUIRE_CHAR(enc, ptr, end);
1343 if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1344 ptr += MINBPC(enc);
1345 REQUIRE_CHAR(enc, ptr, end);
1346 if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1347 ++level;
1348 ptr += MINBPC(enc);
1349 }
1350 }
1351 break;
1352 case BT_RSQB:
1353 ptr += MINBPC(enc);
1354 REQUIRE_CHAR(enc, ptr, end);
1355 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1356 ptr += MINBPC(enc);
1357 REQUIRE_CHAR(enc, ptr, end);
1358 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1359 ptr += MINBPC(enc);
1360 if (level == 0) {
1361 *nextTokPtr = ptr;
1362 return XML_TOK_IGNORE_SECT;
1363 }
1364 --level;
1365 }
1366 }
1367 break;
1368 default:
1369 ptr += MINBPC(enc);
1370 break;
1371 }
1372 }
1373 return XML_TOK_PARTIAL;
1374}
1375
1376#endif /* XML_DTD */
1377
1378static int PTRCALL
1379PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1380 const char **badPtr)
1381{
1382 ptr += MINBPC(enc);
1383 end -= MINBPC(enc);
1384 for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1385 switch (BYTE_TYPE(enc, ptr)) {
1386 case BT_DIGIT:
1387 case BT_HEX:
1388 case BT_MINUS:
1389 case BT_APOS:
1390 case BT_LPAR:
1391 case BT_RPAR:
1392 case BT_PLUS:
1393 case BT_COMMA:
1394 case BT_SOL:
1395 case BT_EQUALS:
1396 case BT_QUEST:
1397 case BT_CR:
1398 case BT_LF:
1399 case BT_SEMI:
1400 case BT_EXCL:
1401 case BT_AST:
1402 case BT_PERCNT:
1403 case BT_NUM:
1404#ifdef XML_NS
1405 case BT_COLON:
1406#endif
1407 break;
1408 case BT_S:
1409 if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1410 *badPtr = ptr;
1411 return 0;
1412 }
1413 break;
1414 case BT_NAME:
1415 case BT_NMSTRT:
1416 if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1417 break;
1418 default:
1419 switch (BYTE_TO_ASCII(enc, ptr)) {
1420 case 0x24: /* $ */
1421 case 0x40: /* @ */
1422 break;
1423 default:
1424 *badPtr = ptr;
1425 return 0;
1426 }
1427 break;
1428 }
1429 }
1430 return 1;
1431}
1432
1433/* This must only be called for a well-formed start-tag or empty
1434 element tag. Returns the number of attributes. Pointers to the
1435 first attsMax attributes are stored in atts.
1436*/
1437
1438static int PTRCALL
1439PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1440 int attsMax, ATTRIBUTE *atts)
1441{
1442 enum { other, inName, inValue } state = inName;
1443 int nAtts = 0;
1444 int open = 0; /* defined when state == inValue;
1445 initialization just to shut up compilers */
1446
1447 for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1448 switch (BYTE_TYPE(enc, ptr)) {
1449#define START_NAME \
1450 if (state == other) { \
1451 if (nAtts < attsMax) { \
1452 atts[nAtts].name = ptr; \
1453 atts[nAtts].normalized = 1; \
1454 } \
1455 state = inName; \
1456 }
1457#define LEAD_CASE(n) \
1458 case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1459 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1460#undef LEAD_CASE
1461 case BT_NONASCII:
1462 case BT_NMSTRT:
1463 case BT_HEX:
1464 START_NAME
1465 break;
1466#undef START_NAME
1467 case BT_QUOT:
1468 if (state != inValue) {
1469 if (nAtts < attsMax)
1470 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1471 state = inValue;
1472 open = BT_QUOT;
1473 }
1474 else if (open == BT_QUOT) {
1475 state = other;
1476 if (nAtts < attsMax)
1477 atts[nAtts].valueEnd = ptr;
1478 nAtts++;
1479 }
1480 break;
1481 case BT_APOS:
1482 if (state != inValue) {
1483 if (nAtts < attsMax)
1484 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1485 state = inValue;
1486 open = BT_APOS;
1487 }
1488 else if (open == BT_APOS) {
1489 state = other;
1490 if (nAtts < attsMax)
1491 atts[nAtts].valueEnd = ptr;
1492 nAtts++;
1493 }
1494 break;
1495 case BT_AMP:
1496 if (nAtts < attsMax)
1497 atts[nAtts].normalized = 0;
1498 break;
1499 case BT_S:
1500 if (state == inName)
1501 state = other;
1502 else if (state == inValue
1503 && nAtts < attsMax
1504 && atts[nAtts].normalized
1505 && (ptr == atts[nAtts].valuePtr
1506 || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1507 || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1508 || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1509 atts[nAtts].normalized = 0;
1510 break;
1511 case BT_CR: case BT_LF:
1512 /* This case ensures that the first attribute name is counted
1513 Apart from that we could just change state on the quote. */
1514 if (state == inName)
1515 state = other;
1516 else if (state == inValue && nAtts < attsMax)
1517 atts[nAtts].normalized = 0;
1518 break;
1519 case BT_GT:
1520 case BT_SOL:
1521 if (state != inValue)
1522 return nAtts;
1523 break;
1524 default:
1525 break;
1526 }
1527 }
1528 /* not reached */
1529}
1530
1531static int PTRFASTCALL
1532PREFIX(charRefNumber)(const ENCODING *UNUSED_P(enc), const char *ptr)
1533{
1534 int result = 0;
1535 /* skip &# */
1536 ptr += 2*MINBPC(enc);
1537 if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1538 for (ptr += MINBPC(enc);
1539 !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1540 ptr += MINBPC(enc)) {
1541 int c = BYTE_TO_ASCII(enc, ptr);
1542 switch (c) {
1543 case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1544 case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
1545 result <<= 4;
1546 result |= (c - ASCII_0);
1547 break;
1548 case ASCII_A: case ASCII_B: case ASCII_C:
1549 case ASCII_D: case ASCII_E: case ASCII_F:
1550 result <<= 4;
1551 result += 10 + (c - ASCII_A);
1552 break;
1553 case ASCII_a: case ASCII_b: case ASCII_c:
1554 case ASCII_d: case ASCII_e: case ASCII_f:
1555 result <<= 4;
1556 result += 10 + (c - ASCII_a);
1557 break;
1558 }
1559 if (result >= 0x110000)
1560 return -1;
1561 }
1562 }
1563 else {
1564 for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1565 int c = BYTE_TO_ASCII(enc, ptr);
1566 result *= 10;
1567 result += (c - ASCII_0);
1568 if (result >= 0x110000)
1569 return -1;
1570 }
1571 }
1572 return checkCharRefNumber(result);
1573}
1574
1575static int PTRCALL
1576PREFIX(predefinedEntityName)(const ENCODING *UNUSED_P(enc), const char *ptr,
1577 const char *end)
1578{
1579 switch ((end - ptr)/MINBPC(enc)) {
1580 case 2:
1581 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1582 switch (BYTE_TO_ASCII(enc, ptr)) {
1583 case ASCII_l:
1584 return ASCII_LT;
1585 case ASCII_g:
1586 return ASCII_GT;
1587 }
1588 }
1589 break;
1590 case 3:
1591 if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1592 ptr += MINBPC(enc);
1593 if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1594 ptr += MINBPC(enc);
1595 if (CHAR_MATCHES(enc, ptr, ASCII_p))
1596 return ASCII_AMP;
1597 }
1598 }
1599 break;
1600 case 4:
1601 switch (BYTE_TO_ASCII(enc, ptr)) {
1602 case ASCII_q:
1603 ptr += MINBPC(enc);
1604 if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1605 ptr += MINBPC(enc);
1606 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1607 ptr += MINBPC(enc);
1608 if (CHAR_MATCHES(enc, ptr, ASCII_t))
1609 return ASCII_QUOT;
1610 }
1611 }
1612 break;
1613 case ASCII_a:
1614 ptr += MINBPC(enc);
1615 if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1616 ptr += MINBPC(enc);
1617 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1618 ptr += MINBPC(enc);
1619 if (CHAR_MATCHES(enc, ptr, ASCII_s))
1620 return ASCII_APOS;
1621 }
1622 }
1623 break;
1624 }
1625 }
1626 return 0;
1627}
1628
1629/* This function does not appear to be called from anywhere within the
1630 * library code. It is used via the macro XmlSameName(), which is
1631 * defined but never used. Since it appears in the encoding function
1632 * table, removing it is not a thing to be undertaken lightly. For
1633 * the moment, we simply exclude it from coverage tests.
1634 *
1635 * LCOV_EXCL_START
1636 */
1637static int PTRCALL
1638PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
1639{
1640 for (;;) {
1641 switch (BYTE_TYPE(enc, ptr1)) {
1642#define LEAD_CASE(n) \
1643 case BT_LEAD ## n: \
1644 if (*ptr1++ != *ptr2++) \
1645 return 0;
1646 LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
1647#undef LEAD_CASE
1648 /* fall through */
1649 if (*ptr1++ != *ptr2++)
1650 return 0;
1651 break;
1652 case BT_NONASCII:
1653 case BT_NMSTRT:
1654#ifdef XML_NS
1655 case BT_COLON:
1656#endif
1657 case BT_HEX:
1658 case BT_DIGIT:
1659 case BT_NAME:
1660 case BT_MINUS:
1661 if (*ptr2++ != *ptr1++)
1662 return 0;
1663 if (MINBPC(enc) > 1) {
1664 if (*ptr2++ != *ptr1++)
1665 return 0;
1666 if (MINBPC(enc) > 2) {
1667 if (*ptr2++ != *ptr1++)
1668 return 0;
1669 if (MINBPC(enc) > 3) {
1670 if (*ptr2++ != *ptr1++)
1671 return 0;
1672 }
1673 }
1674 }
1675 break;
1676 default:
1677 if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
1678 return 1;
1679 switch (BYTE_TYPE(enc, ptr2)) {
1680 case BT_LEAD2:
1681 case BT_LEAD3:
1682 case BT_LEAD4:
1683 case BT_NONASCII:
1684 case BT_NMSTRT:
1685#ifdef XML_NS
1686 case BT_COLON:
1687#endif
1688 case BT_HEX:
1689 case BT_DIGIT:
1690 case BT_NAME:
1691 case BT_MINUS:
1692 return 0;
1693 default:
1694 return 1;
1695 }
1696 }
1697 }
1698 /* not reached */
1699}
1700/* LCOV_EXCL_STOP */
1701
1702static int PTRCALL
1703PREFIX(nameMatchesAscii)(const ENCODING *UNUSED_P(enc), const char *ptr1,
1704 const char *end1, const char *ptr2)
1705{
1706 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1707 if (end1 - ptr1 < MINBPC(enc)) {
1708 /* This line cannot be executed. THe incoming data has already
1709 * been tokenized once, so imcomplete characters like this have
1710 * already been eliminated from the input. Retaining the
1711 * paranoia check is still valuable, however.
1712 */
1713 return 0; /* LCOV_EXCL_LINE */
1714 }
1715 if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1716 return 0;
1717 }
1718 return ptr1 == end1;
1719}
1720
1721static int PTRFASTCALL
1722PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1723{
1724 const char *start = ptr;
1725 for (;;) {
1726 switch (BYTE_TYPE(enc, ptr)) {
1727#define LEAD_CASE(n) \
1728 case BT_LEAD ## n: ptr += n; break;
1729 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1730#undef LEAD_CASE
1731 case BT_NONASCII:
1732 case BT_NMSTRT:
1733#ifdef XML_NS
1734 case BT_COLON:
1735#endif
1736 case BT_HEX:
1737 case BT_DIGIT:
1738 case BT_NAME:
1739 case BT_MINUS:
1740 ptr += MINBPC(enc);
1741 break;
1742 default:
1743 return (int)(ptr - start);
1744 }
1745 }
1746}
1747
1748static const char * PTRFASTCALL
1749PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1750{
1751 for (;;) {
1752 switch (BYTE_TYPE(enc, ptr)) {
1753 case BT_LF:
1754 case BT_CR:
1755 case BT_S:
1756 ptr += MINBPC(enc);
1757 break;
1758 default:
1759 return ptr;
1760 }
1761 }
1762}
1763
1764static void PTRCALL
1765PREFIX(updatePosition)(const ENCODING *enc,
1766 const char *ptr,
1767 const char *end,
1768 POSITION *pos)
1769{
1770 while (HAS_CHAR(enc, ptr, end)) {
1771 switch (BYTE_TYPE(enc, ptr)) {
1772#define LEAD_CASE(n) \
1773 case BT_LEAD ## n: \
1774 ptr += n; \
1775 break;
1776 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1777#undef LEAD_CASE
1778 case BT_LF:
1779 pos->columnNumber = (XML_Size)-1;
1780 pos->lineNumber++;
1781 ptr += MINBPC(enc);
1782 break;
1783 case BT_CR:
1784 pos->lineNumber++;
1785 ptr += MINBPC(enc);
1786 if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1787 ptr += MINBPC(enc);
1788 pos->columnNumber = (XML_Size)-1;
1789 break;
1790 default:
1791 ptr += MINBPC(enc);
1792 break;
1793 }
1794 pos->columnNumber++;
1795 }
1796}
1797
1798#undef DO_LEAD_CASE
1799#undef MULTIBYTE_CASES
1800#undef INVALID_CASES
1801#undef CHECK_NAME_CASE
1802#undef CHECK_NAME_CASES
1803#undef CHECK_NMSTRT_CASE
1804#undef CHECK_NMSTRT_CASES
1805
1806#endif /* XML_TOK_IMPL_C */
1807