1 | /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd |
2 | See the file COPYING for copying permission. |
3 | */ |
4 | |
5 | /* This file is included! */ |
6 | #ifdef XML_TOK_IMPL_C |
7 | |
/* Unless the including file has already defined IS_INVALID_CHAR, no
   multi-byte sequence is treated as invalid. */
#ifndef IS_INVALID_CHAR
#define IS_INVALID_CHAR(enc, ptr, n) (0)
#endif

/* Switch case for a BT_LEADn byte (first byte of an n-byte sequence).
   - Fewer than n bytes left: returns XML_TOK_PARTIAL_CHAR.
   - IS_INVALID_CHAR rejects the sequence: stores ptr in *nextTokPtr and
     returns XML_TOK_INVALID.
   - Otherwise: advances ptr by n and leaves the switch.
   Note that `enc' and `end' are not macro parameters; they are picked up
   from the scope in which the macro is expanded. */
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
   case BT_LEAD ## n: \
     if (end - ptr < n) \
       return XML_TOK_PARTIAL_CHAR; \
     if (IS_INVALID_CHAR(enc, ptr, n)) { \
       *(nextTokPtr) = (ptr); \
       return XML_TOK_INVALID; \
     } \
     ptr += n; \
     break;

/* The full set of "bad input" switch cases: the 2-, 3- and 4-byte lead
   cases above, plus BT_NONXML, BT_MALFORM and BT_TRAIL, which always
   report XML_TOK_INVALID at ptr. */
#define INVALID_CASES(ptr, nextTokPtr) \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  case BT_NONXML: \
  case BT_MALFORM: \
  case BT_TRAIL: \
    *(nextTokPtr) = (ptr); \
    return XML_TOK_INVALID;
32 | |
/* Switch case for a BT_LEADn byte inside a name.  Returns
   XML_TOK_PARTIAL_CHAR when fewer than n bytes remain, reports
   XML_TOK_INVALID at ptr when the n-byte character is not a name
   character, and otherwise advances ptr by n and leaves the switch. */
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if (end - ptr < n) \
       return XML_TOK_PARTIAL_CHAR; \
     if (!IS_NAME_CHAR(enc, ptr, n)) { \
       *nextTokPtr = ptr; \
       return XML_TOK_INVALID; \
     } \
     ptr += n; \
     break;

/* Switch cases that consume one name character.  A BT_NONASCII character
   is first checked with IS_NAME_CHAR_MINBPC and then shares the
   "advance by MINBPC(enc)" code of the single-unit name byte types; the
   multi-byte lead cases are handled by CHECK_NAME_CASE.  Every accepted
   character ends in `break', so control continues after the enclosing
   switch. */
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
  case BT_DIGIT: \
  case BT_NAME: \
  case BT_MINUS: \
    ptr += MINBPC(enc); \
    break; \
  CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
60 | |
/* Switch case for a BT_LEADn byte at the start of a name.  Same shape as
   CHECK_NAME_CASE, but the character must satisfy IS_NMSTRT_CHAR. */
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if (end - ptr < n) \
       return XML_TOK_PARTIAL_CHAR; \
     if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
       *nextTokPtr = ptr; \
       return XML_TOK_INVALID; \
     } \
     ptr += n; \
     break;

/* Switch cases that consume one name-start character.  BT_NONASCII is
   checked with IS_NMSTRT_CHAR_MINBPC and then shares the advance code of
   BT_NMSTRT / BT_HEX.  Every accepted character ends in `break', so
   control continues after the enclosing switch with ptr just past the
   character. */
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
    ptr += MINBPC(enc); \
    break; \
  CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
85 | |
/* Naming hook for the functions below: the including file may define
   PREFIX to rename them; the default leaves the identifier unchanged. */
#ifndef PREFIX
#define PREFIX(ident) ident
#endif
89 | |
90 | /* ptr points to character following "<!-" */ |
91 | |
92 | static int PTRCALL |
93 | PREFIX(scanComment)(const ENCODING *enc, const char *ptr, |
94 | const char *end, const char **nextTokPtr) |
95 | { |
96 | if (ptr != end) { |
97 | if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) { |
98 | *nextTokPtr = ptr; |
99 | return XML_TOK_INVALID; |
100 | } |
101 | ptr += MINBPC(enc); |
102 | while (ptr != end) { |
103 | switch (BYTE_TYPE(enc, ptr)) { |
104 | INVALID_CASES(ptr, nextTokPtr) |
105 | case BT_MINUS: |
106 | if ((ptr += MINBPC(enc)) == end) |
107 | return XML_TOK_PARTIAL; |
108 | if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) { |
109 | if ((ptr += MINBPC(enc)) == end) |
110 | return XML_TOK_PARTIAL; |
111 | if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { |
112 | *nextTokPtr = ptr; |
113 | return XML_TOK_INVALID; |
114 | } |
115 | *nextTokPtr = ptr + MINBPC(enc); |
116 | return XML_TOK_COMMENT; |
117 | } |
118 | break; |
119 | default: |
120 | ptr += MINBPC(enc); |
121 | break; |
122 | } |
123 | } |
124 | } |
125 | return XML_TOK_PARTIAL; |
126 | } |
127 | |
128 | /* ptr points to character following "<!" */ |
129 | |
130 | static int PTRCALL |
131 | PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, |
132 | const char *end, const char **nextTokPtr) |
133 | { |
134 | if (ptr == end) |
135 | return XML_TOK_PARTIAL; |
136 | switch (BYTE_TYPE(enc, ptr)) { |
137 | case BT_MINUS: |
138 | return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr); |
139 | case BT_LSQB: |
140 | *nextTokPtr = ptr + MINBPC(enc); |
141 | return XML_TOK_COND_SECT_OPEN; |
142 | case BT_NMSTRT: |
143 | case BT_HEX: |
144 | ptr += MINBPC(enc); |
145 | break; |
146 | default: |
147 | *nextTokPtr = ptr; |
148 | return XML_TOK_INVALID; |
149 | } |
150 | while (ptr != end) { |
151 | switch (BYTE_TYPE(enc, ptr)) { |
152 | case BT_PERCNT: |
153 | if (ptr + MINBPC(enc) == end) |
154 | return XML_TOK_PARTIAL; |
155 | /* don't allow <!ENTITY% foo "whatever"> */ |
156 | switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) { |
157 | case BT_S: case BT_CR: case BT_LF: case BT_PERCNT: |
158 | *nextTokPtr = ptr; |
159 | return XML_TOK_INVALID; |
160 | } |
161 | /* fall through */ |
162 | case BT_S: case BT_CR: case BT_LF: |
163 | *nextTokPtr = ptr; |
164 | return XML_TOK_DECL_OPEN; |
165 | case BT_NMSTRT: |
166 | case BT_HEX: |
167 | ptr += MINBPC(enc); |
168 | break; |
169 | default: |
170 | *nextTokPtr = ptr; |
171 | return XML_TOK_INVALID; |
172 | } |
173 | } |
174 | return XML_TOK_PARTIAL; |
175 | } |
176 | |
177 | static int PTRCALL |
178 | PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, |
179 | const char *end, int *tokPtr) |
180 | { |
181 | int upper = 0; |
182 | *tokPtr = XML_TOK_PI; |
183 | if (end - ptr != MINBPC(enc)*3) |
184 | return 1; |
185 | switch (BYTE_TO_ASCII(enc, ptr)) { |
186 | case ASCII_x: |
187 | break; |
188 | case ASCII_X: |
189 | upper = 1; |
190 | break; |
191 | default: |
192 | return 1; |
193 | } |
194 | ptr += MINBPC(enc); |
195 | switch (BYTE_TO_ASCII(enc, ptr)) { |
196 | case ASCII_m: |
197 | break; |
198 | case ASCII_M: |
199 | upper = 1; |
200 | break; |
201 | default: |
202 | return 1; |
203 | } |
204 | ptr += MINBPC(enc); |
205 | switch (BYTE_TO_ASCII(enc, ptr)) { |
206 | case ASCII_l: |
207 | break; |
208 | case ASCII_L: |
209 | upper = 1; |
210 | break; |
211 | default: |
212 | return 1; |
213 | } |
214 | if (upper) |
215 | return 0; |
216 | *tokPtr = XML_TOK_XML_DECL; |
217 | return 1; |
218 | } |
219 | |
220 | /* ptr points to character following "<?" */ |
221 | |
222 | static int PTRCALL |
223 | PREFIX(scanPi)(const ENCODING *enc, const char *ptr, |
224 | const char *end, const char **nextTokPtr) |
225 | { |
226 | int tok; |
227 | const char *target = ptr; |
228 | if (ptr == end) |
229 | return XML_TOK_PARTIAL; |
230 | switch (BYTE_TYPE(enc, ptr)) { |
231 | CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) |
232 | default: |
233 | *nextTokPtr = ptr; |
234 | return XML_TOK_INVALID; |
235 | } |
236 | while (ptr != end) { |
237 | switch (BYTE_TYPE(enc, ptr)) { |
238 | CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) |
239 | case BT_S: case BT_CR: case BT_LF: |
240 | if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) { |
241 | *nextTokPtr = ptr; |
242 | return XML_TOK_INVALID; |
243 | } |
244 | ptr += MINBPC(enc); |
245 | while (ptr != end) { |
246 | switch (BYTE_TYPE(enc, ptr)) { |
247 | INVALID_CASES(ptr, nextTokPtr) |
248 | case BT_QUEST: |
249 | ptr += MINBPC(enc); |
250 | if (ptr == end) |
251 | return XML_TOK_PARTIAL; |
252 | if (CHAR_MATCHES(enc, ptr, ASCII_GT)) { |
253 | *nextTokPtr = ptr + MINBPC(enc); |
254 | return tok; |
255 | } |
256 | break; |
257 | default: |
258 | ptr += MINBPC(enc); |
259 | break; |
260 | } |
261 | } |
262 | return XML_TOK_PARTIAL; |
263 | case BT_QUEST: |
264 | if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) { |
265 | *nextTokPtr = ptr; |
266 | return XML_TOK_INVALID; |
267 | } |
268 | ptr += MINBPC(enc); |
269 | if (ptr == end) |
270 | return XML_TOK_PARTIAL; |
271 | if (CHAR_MATCHES(enc, ptr, ASCII_GT)) { |
272 | *nextTokPtr = ptr + MINBPC(enc); |
273 | return tok; |
274 | } |
275 | /* fall through */ |
276 | default: |
277 | *nextTokPtr = ptr; |
278 | return XML_TOK_INVALID; |
279 | } |
280 | } |
281 | return XML_TOK_PARTIAL; |
282 | } |
283 | |
284 | static int PTRCALL |
285 | PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, |
286 | const char *end, const char **nextTokPtr) |
287 | { |
288 | static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A, |
289 | ASCII_T, ASCII_A, ASCII_LSQB }; |
290 | int i; |
291 | /* CDATA[ */ |
292 | if (end - ptr < 6 * MINBPC(enc)) |
293 | return XML_TOK_PARTIAL; |
294 | for (i = 0; i < 6; i++, ptr += MINBPC(enc)) { |
295 | if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) { |
296 | *nextTokPtr = ptr; |
297 | return XML_TOK_INVALID; |
298 | } |
299 | } |
300 | *nextTokPtr = ptr; |
301 | return XML_TOK_CDATA_SECT_OPEN; |
302 | } |
303 | |
/* Tokenizer for the inside of a CDATA section.

   Returns XML_TOK_NONE when ptr == end.  Otherwise the token is decided by
   the first character:
     "]]>"           XML_TOK_CDATA_SECT_CLOSE
     CR, LF, CR LF   XML_TOK_DATA_NEWLINE
     bad/truncated   XML_TOK_INVALID or XML_TOK_PARTIAL_CHAR (INVALID_CASES)
     anything else   XML_TOK_DATA_CHARS covering a run of characters
   XML_TOK_PARTIAL is returned when more input is needed to classify the
   first character.  For every result other than XML_TOK_NONE,
   XML_TOK_PARTIAL and XML_TOK_PARTIAL_CHAR, *nextTokPtr is set to the
   position where scanning stopped. */
static int PTRCALL
PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
                        const char *end, const char **nextTokPtr)
{
  if (ptr == end)
    return XML_TOK_NONE;
  if (MINBPC(enc) > 1) {
    /* Round the input down to a whole number of MINBPC(enc)-byte units so
       that the `ptr == end' tests below are reliable. */
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_RSQB:
    ptr += MINBPC(enc);
    if (ptr == end)
      return XML_TOK_PARTIAL;
    if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
      break;                  /* single ']' is ordinary data */
    ptr += MINBPC(enc);
    if (ptr == end)
      return XML_TOK_PARTIAL;
    if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
      /* "]]" not followed by '>': only the first ']' is data; back up so
         the second ']' is looked at again by the loop below */
      ptr -= MINBPC(enc);
      break;
    }
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_CDATA_SECT_CLOSE;
  case BT_CR:
    ptr += MINBPC(enc);
    /* a CR at the very end might be the first half of a CR LF pair */
    if (ptr == end)
      return XML_TOK_PARTIAL;
    if (BYTE_TYPE(enc, ptr) == BT_LF)
      ptr += MINBPC(enc);
    *nextTokPtr = ptr;
    return XML_TOK_DATA_NEWLINE;
  case BT_LF:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DATA_NEWLINE;
  INVALID_CASES(ptr, nextTokPtr)
  default:
    ptr += MINBPC(enc);
    break;
  }
  /* Extend the data run until a character that needs its own handling.
     Nothing is reported as an error here: an incomplete or invalid
     multi-byte character simply ends the run, leaving *nextTokPtr at it. */
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
#define LEAD_CASE(n) \
    case BT_LEAD ## n: \
      if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
        *nextTokPtr = ptr; \
        return XML_TOK_DATA_CHARS; \
      } \
      ptr += n; \
      break;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
    case BT_NONXML:
    case BT_MALFORM:
    case BT_TRAIL:
    case BT_CR:
    case BT_LF:
    case BT_RSQB:
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}
379 | |
380 | /* ptr points to character following "</" */ |
381 | |
382 | static int PTRCALL |
383 | PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, |
384 | const char *end, const char **nextTokPtr) |
385 | { |
386 | if (ptr == end) |
387 | return XML_TOK_PARTIAL; |
388 | switch (BYTE_TYPE(enc, ptr)) { |
389 | CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) |
390 | default: |
391 | *nextTokPtr = ptr; |
392 | return XML_TOK_INVALID; |
393 | } |
394 | while (ptr != end) { |
395 | switch (BYTE_TYPE(enc, ptr)) { |
396 | CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) |
397 | case BT_S: case BT_CR: case BT_LF: |
398 | for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) { |
399 | switch (BYTE_TYPE(enc, ptr)) { |
400 | case BT_S: case BT_CR: case BT_LF: |
401 | break; |
402 | case BT_GT: |
403 | *nextTokPtr = ptr + MINBPC(enc); |
404 | return XML_TOK_END_TAG; |
405 | default: |
406 | *nextTokPtr = ptr; |
407 | return XML_TOK_INVALID; |
408 | } |
409 | } |
410 | return XML_TOK_PARTIAL; |
411 | #ifdef XML_NS |
412 | case BT_COLON: |
413 | /* no need to check qname syntax here, |
414 | since end-tag must match exactly */ |
415 | ptr += MINBPC(enc); |
416 | break; |
417 | #endif |
418 | case BT_GT: |
419 | *nextTokPtr = ptr + MINBPC(enc); |
420 | return XML_TOK_END_TAG; |
421 | default: |
422 | *nextTokPtr = ptr; |
423 | return XML_TOK_INVALID; |
424 | } |
425 | } |
426 | return XML_TOK_PARTIAL; |
427 | } |
428 | |
429 | /* ptr points to character following "&#X" */ |
430 | |
431 | static int PTRCALL |
432 | PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, |
433 | const char *end, const char **nextTokPtr) |
434 | { |
435 | if (ptr != end) { |
436 | switch (BYTE_TYPE(enc, ptr)) { |
437 | case BT_DIGIT: |
438 | case BT_HEX: |
439 | break; |
440 | default: |
441 | *nextTokPtr = ptr; |
442 | return XML_TOK_INVALID; |
443 | } |
444 | for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) { |
445 | switch (BYTE_TYPE(enc, ptr)) { |
446 | case BT_DIGIT: |
447 | case BT_HEX: |
448 | break; |
449 | case BT_SEMI: |
450 | *nextTokPtr = ptr + MINBPC(enc); |
451 | return XML_TOK_CHAR_REF; |
452 | default: |
453 | *nextTokPtr = ptr; |
454 | return XML_TOK_INVALID; |
455 | } |
456 | } |
457 | } |
458 | return XML_TOK_PARTIAL; |
459 | } |
460 | |
461 | /* ptr points to character following "&#" */ |
462 | |
463 | static int PTRCALL |
464 | PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, |
465 | const char *end, const char **nextTokPtr) |
466 | { |
467 | if (ptr != end) { |
468 | if (CHAR_MATCHES(enc, ptr, ASCII_x)) |
469 | return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); |
470 | switch (BYTE_TYPE(enc, ptr)) { |
471 | case BT_DIGIT: |
472 | break; |
473 | default: |
474 | *nextTokPtr = ptr; |
475 | return XML_TOK_INVALID; |
476 | } |
477 | for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) { |
478 | switch (BYTE_TYPE(enc, ptr)) { |
479 | case BT_DIGIT: |
480 | break; |
481 | case BT_SEMI: |
482 | *nextTokPtr = ptr + MINBPC(enc); |
483 | return XML_TOK_CHAR_REF; |
484 | default: |
485 | *nextTokPtr = ptr; |
486 | return XML_TOK_INVALID; |
487 | } |
488 | } |
489 | } |
490 | return XML_TOK_PARTIAL; |
491 | } |
492 | |
493 | /* ptr points to character following "&" */ |
494 | |
495 | static int PTRCALL |
496 | PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end, |
497 | const char **nextTokPtr) |
498 | { |
499 | if (ptr == end) |
500 | return XML_TOK_PARTIAL; |
501 | switch (BYTE_TYPE(enc, ptr)) { |
502 | CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) |
503 | case BT_NUM: |
504 | return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); |
505 | default: |
506 | *nextTokPtr = ptr; |
507 | return XML_TOK_INVALID; |
508 | } |
509 | while (ptr != end) { |
510 | switch (BYTE_TYPE(enc, ptr)) { |
511 | CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) |
512 | case BT_SEMI: |
513 | *nextTokPtr = ptr + MINBPC(enc); |
514 | return XML_TOK_ENTITY_REF; |
515 | default: |
516 | *nextTokPtr = ptr; |
517 | return XML_TOK_INVALID; |
518 | } |
519 | } |
520 | return XML_TOK_PARTIAL; |
521 | } |
522 | |
/* Scans the attributes of a start-tag through to the end of the tag.

   On entry ptr points to the character following the first character of
   an attribute name.

   Returns XML_TOK_START_TAG_WITH_ATTS (tag closed by '>') or
   XML_TOK_EMPTY_ELEMENT_WITH_ATTS (tag closed by "/>") with *nextTokPtr
   just past the '>'; XML_TOK_INVALID with *nextTokPtr at the offending
   character; XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when the input runs
   out.  A non-positive token from scanRef inside an attribute value is
   passed straight back to the caller. */

static int PTRCALL
PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
                 const char **nextTokPtr)
{
#ifdef XML_NS
  /* set once the current attribute name has contained a ':' */
  int hadColon = 0;
#endif
  /* Each pass of this loop consumes one more character of the current
     attribute name, or -- at white space / '=' -- the rest of that
     attribute. */
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
#ifdef XML_NS
    case BT_COLON:
      /* at most one ':' per name, and it must be followed by a
         name-start character */
      if (hadColon) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      hadColon = 1;
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      break;
#endif
    case BT_S: case BT_CR: case BT_LF:
      /* white space after the name: only more white space or '=' may
         follow */
      for (;;) {
        int t;

        ptr += MINBPC(enc);
        if (ptr == end)
          return XML_TOK_PARTIAL;
        t = BYTE_TYPE(enc, ptr);
        if (t == BT_EQUALS)
          break;
        switch (t) {
        case BT_S:
        case BT_LF:
        case BT_CR:
          break;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
      }
    /* fall through */
    case BT_EQUALS:
      {
        /* byte type of the opening quote (BT_QUOT or BT_APOS); the value
           ends at the next character of the same type */
        int open;
#ifdef XML_NS
        hadColon = 0;
#endif
        /* skip white space between '=' and the opening quote */
        for (;;) {
          ptr += MINBPC(enc);
          if (ptr == end)
            return XML_TOK_PARTIAL;
          open = BYTE_TYPE(enc, ptr);
          if (open == BT_QUOT || open == BT_APOS)
            break;
          switch (open) {
          case BT_S:
          case BT_LF:
          case BT_CR:
            break;
          default:
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
        }
        ptr += MINBPC(enc);
        /* in attribute value */
        for (;;) {
          int t;
          if (ptr == end)
            return XML_TOK_PARTIAL;
          t = BYTE_TYPE(enc, ptr);
          if (t == open)
            break;
          switch (t) {
          INVALID_CASES(ptr, nextTokPtr)
          case BT_AMP:
            {
              /* scanRef stores its "next token" position directly in ptr,
                 so after a successful reference ptr is already past it */
              int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
              if (tok <= 0) {
                if (tok == XML_TOK_INVALID)
                  *nextTokPtr = ptr;
                return tok;
              }
              break;
            }
          case BT_LT:
            /* '<' is not allowed inside an attribute value */
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          default:
            ptr += MINBPC(enc);
            break;
          }
        }
        /* step over the closing quote; it must be followed by white space,
           '/' or '>' */
        ptr += MINBPC(enc);
        if (ptr == end)
          return XML_TOK_PARTIAL;
        switch (BYTE_TYPE(enc, ptr)) {
        case BT_S:
        case BT_CR:
        case BT_LF:
          break;
        case BT_SOL:
          goto sol;
        case BT_GT:
          goto gt;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
        /* ptr points to the white space character that follows the
           closing quote */
        for (;;) {
          ptr += MINBPC(enc);
          if (ptr == end)
            return XML_TOK_PARTIAL;
          switch (BYTE_TYPE(enc, ptr)) {
          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
          case BT_S: case BT_CR: case BT_LF:
            continue;
          case BT_GT:
          gt:
            *nextTokPtr = ptr + MINBPC(enc);
            return XML_TOK_START_TAG_WITH_ATTS;
          case BT_SOL:
          sol:
            ptr += MINBPC(enc);
            if (ptr == end)
              return XML_TOK_PARTIAL;
            if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
              *nextTokPtr = ptr;
              return XML_TOK_INVALID;
            }
            *nextTokPtr = ptr + MINBPC(enc);
            return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
          default:
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
          /* reached only through the `break' in CHECK_NMSTRT_CASES: the
             first character of the next attribute name has been consumed;
             leave this loop ... */
          break;
        }
        /* ... and this case, so the outer loop scans the rest of it */
        break;
      }
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
681 | |
/* Scans markup that begins with '<' in content.

   On entry ptr points to the character following "<".  Depending on that
   character the work is delegated to scanComment ("<!-"),
   scanCdataSection ("<!["), scanPi ("<?") or scanEndTag ("</"); a
   name-start character begins a start-tag, which is scanned here and, once
   an attribute name starts, by scanAtts.

   For a start-tag without attributes the result is
   XML_TOK_START_TAG_NO_ATTS ('>') or XML_TOK_EMPTY_ELEMENT_NO_ATTS ("/>")
   with *nextTokPtr just past the '>'.  XML_TOK_INVALID is reported with
   *nextTokPtr at the offending character; XML_TOK_PARTIAL /
   XML_TOK_PARTIAL_CHAR when the input runs out. */

static int PTRCALL
PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
               const char **nextTokPtr)
{
#ifdef XML_NS
  /* set once the element name has contained a ':' */
  int hadColon;
#endif
  if (ptr == end)
    return XML_TOK_PARTIAL;
  switch (BYTE_TYPE(enc, ptr)) {
  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  case BT_EXCL:
    if ((ptr += MINBPC(enc)) == end)
      return XML_TOK_PARTIAL;
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_MINUS:
      return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    case BT_LSQB:
      return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
                                      end, nextTokPtr);
    }
    /* "<!" followed by anything else is rejected here */
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  case BT_QUEST:
    return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_SOL:
    return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
#ifdef XML_NS
  hadColon = 0;
#endif
  /* we have a start-tag */
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
#ifdef XML_NS
    case BT_COLON:
      /* at most one ':' per name, and it must be followed by a
         name-start character */
      if (hadColon) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      hadColon = 1;
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      break;
#endif
    case BT_S: case BT_CR: case BT_LF:
      {
        /* white space after the element name: skip it, then expect '>',
           "/>" or the start of an attribute name */
        ptr += MINBPC(enc);
        while (ptr != end) {
          switch (BYTE_TYPE(enc, ptr)) {
          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
          case BT_GT:
            goto gt;
          case BT_SOL:
            goto sol;
          case BT_S: case BT_CR: case BT_LF:
            ptr += MINBPC(enc);
            continue;
          default:
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
          /* reached only through the `break' in CHECK_NMSTRT_CASES: ptr is
             now just past the first character of an attribute name, which
             is where scanAtts expects to start */
          return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
        }
        return XML_TOK_PARTIAL;
      }
    case BT_GT:
    gt:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_START_TAG_NO_ATTS;
    case BT_SOL:
    sol:
      /* '/' must be immediately followed by '>' */
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
783 | |
/* Tokenizer for element content.

   Returns XML_TOK_NONE when ptr == end.  Otherwise the token is decided by
   the first character:
     '<'             whatever scanLt returns
     '&'             whatever scanRef returns
     CR, LF, CR LF   XML_TOK_DATA_NEWLINE (XML_TOK_TRAILING_CR when a CR is
                     the last character available)
     ']'             XML_TOK_TRAILING_RSQB when the input ends after "]" or
                     "]]"; XML_TOK_INVALID for "]]>" with *nextTokPtr at
                     the '>'; otherwise the ']' starts a data run
     bad/truncated   XML_TOK_INVALID or XML_TOK_PARTIAL_CHAR (INVALID_CASES)
     anything else   XML_TOK_DATA_CHARS covering a run of characters, with
                     *nextTokPtr at the character that ended the run */
static int PTRCALL
PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
                   const char **nextTokPtr)
{
  if (ptr == end)
    return XML_TOK_NONE;
  if (MINBPC(enc) > 1) {
    /* Round the input down to a whole number of MINBPC(enc)-byte units so
       that the `ptr == end' tests below are reliable. */
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_LT:
    return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_AMP:
    return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_CR:
    ptr += MINBPC(enc);
    /* a CR at the very end might be the first half of a CR LF pair */
    if (ptr == end)
      return XML_TOK_TRAILING_CR;
    if (BYTE_TYPE(enc, ptr) == BT_LF)
      ptr += MINBPC(enc);
    *nextTokPtr = ptr;
    return XML_TOK_DATA_NEWLINE;
  case BT_LF:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DATA_NEWLINE;
  case BT_RSQB:
    ptr += MINBPC(enc);
    if (ptr == end)
      return XML_TOK_TRAILING_RSQB;
    if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
      break;                  /* single ']' is ordinary data */
    ptr += MINBPC(enc);
    if (ptr == end)
      return XML_TOK_TRAILING_RSQB;
    if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
      /* "]]" not followed by '>': only the first ']' is consumed here;
         back up so the loop below looks at the second one again */
      ptr -= MINBPC(enc);
      break;
    }
    /* "]]>" in content is rejected */
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  INVALID_CASES(ptr, nextTokPtr)
  default:
    ptr += MINBPC(enc);
    break;
  }
  /* Extend the data run until a character that needs its own handling.
     An incomplete or invalid multi-byte character simply ends the run,
     leaving *nextTokPtr at it. */
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
#define LEAD_CASE(n) \
    case BT_LEAD ## n: \
      if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
        *nextTokPtr = ptr; \
        return XML_TOK_DATA_CHARS; \
      } \
      ptr += n; \
      break;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
    case BT_RSQB:
      /* Look ahead without consuming: a ']' stays in the run only when it
         is known not to start "]]>".  When the input ends after "]" or
         "]]" the run stops in front of the ']' (fall through below). */
      if (ptr + MINBPC(enc) != end) {
         if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
           ptr += MINBPC(enc);
           break;
         }
         if (ptr + 2*MINBPC(enc) != end) {
           if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
             ptr += MINBPC(enc);
             break;
           }
           /* "]]>" found: reported as invalid at the '>' */
           *nextTokPtr = ptr + 2*MINBPC(enc);
           return XML_TOK_INVALID;
         }
      }
      /* fall through */
    case BT_AMP:
    case BT_LT:
    case BT_NONXML:
    case BT_MALFORM:
    case BT_TRAIL:
    case BT_CR:
    case BT_LF:
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}
880 | |
881 | /* ptr points to character following "%" */ |
882 | |
883 | static int PTRCALL |
884 | PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end, |
885 | const char **nextTokPtr) |
886 | { |
887 | if (ptr == end) |
888 | return XML_TOK_PARTIAL; |
889 | switch (BYTE_TYPE(enc, ptr)) { |
890 | CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) |
891 | case BT_S: case BT_LF: case BT_CR: case BT_PERCNT: |
892 | *nextTokPtr = ptr; |
893 | return XML_TOK_PERCENT; |
894 | default: |
895 | *nextTokPtr = ptr; |
896 | return XML_TOK_INVALID; |
897 | } |
898 | while (ptr != end) { |
899 | switch (BYTE_TYPE(enc, ptr)) { |
900 | CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) |
901 | case BT_SEMI: |
902 | *nextTokPtr = ptr + MINBPC(enc); |
903 | return XML_TOK_PARAM_ENTITY_REF; |
904 | default: |
905 | *nextTokPtr = ptr; |
906 | return XML_TOK_INVALID; |
907 | } |
908 | } |
909 | return XML_TOK_PARTIAL; |
910 | } |
911 | |
912 | static int PTRCALL |
913 | PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end, |
914 | const char **nextTokPtr) |
915 | { |
916 | if (ptr == end) |
917 | return XML_TOK_PARTIAL; |
918 | switch (BYTE_TYPE(enc, ptr)) { |
919 | CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) |
920 | default: |
921 | *nextTokPtr = ptr; |
922 | return XML_TOK_INVALID; |
923 | } |
924 | while (ptr != end) { |
925 | switch (BYTE_TYPE(enc, ptr)) { |
926 | CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) |
927 | case BT_CR: case BT_LF: case BT_S: |
928 | case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR: |
929 | *nextTokPtr = ptr; |
930 | return XML_TOK_POUND_NAME; |
931 | default: |
932 | *nextTokPtr = ptr; |
933 | return XML_TOK_INVALID; |
934 | } |
935 | } |
936 | return -XML_TOK_POUND_NAME; |
937 | } |
938 | |
939 | static int PTRCALL |
940 | PREFIX(scanLit)(int open, const ENCODING *enc, |
941 | const char *ptr, const char *end, |
942 | const char **nextTokPtr) |
943 | { |
944 | while (ptr != end) { |
945 | int t = BYTE_TYPE(enc, ptr); |
946 | switch (t) { |
947 | INVALID_CASES(ptr, nextTokPtr) |
948 | case BT_QUOT: |
949 | case BT_APOS: |
950 | ptr += MINBPC(enc); |
951 | if (t != open) |
952 | break; |
953 | if (ptr == end) |
954 | return -XML_TOK_LITERAL; |
955 | *nextTokPtr = ptr; |
956 | switch (BYTE_TYPE(enc, ptr)) { |
957 | case BT_S: case BT_CR: case BT_LF: |
958 | case BT_GT: case BT_PERCNT: case BT_LSQB: |
959 | return XML_TOK_LITERAL; |
960 | default: |
961 | return XML_TOK_INVALID; |
962 | } |
963 | default: |
964 | ptr += MINBPC(enc); |
965 | break; |
966 | } |
967 | } |
968 | return XML_TOK_PARTIAL; |
969 | } |
970 | |
/* Tokenizer for the prolog (everything handled before element content).

   Returns XML_TOK_NONE when ptr == end.  Otherwise one token starting at
   ptr is recognised: literals (via scanLit), "<!" / "<?" markup (via
   scanDecl / scanPi), XML_TOK_INSTANCE_START when '<' is followed by a
   possible name-start byte, white space (XML_TOK_PROLOG_S), '%' (via
   scanPercent), '#' names (via scanPoundName), single-character
   punctuation, and names / name tokens with an optional '+', '*' or '?'
   suffix.  XML_TOK_INVALID is reported with *nextTokPtr at the offending
   character; XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when more input is
   needed.

   Several paths return a negated token code (-XML_TOK_PROLOG_S,
   -XML_TOK_CLOSE_BRACKET, -XML_TOK_CLOSE_PAREN, -tok) when the input ends
   at a point where the token could still continue.  Of these, only the
   -XML_TOK_PROLOG_S path sets *nextTokPtr.
   NOTE(review): presumably the caller accepts the negated token only if
   no further input will arrive -- confirm against the callers. */
static int PTRCALL
PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
                  const char **nextTokPtr)
{
  /* token kind of the name being scanned in the final loop */
  int tok;
  if (ptr == end)
    return XML_TOK_NONE;
  if (MINBPC(enc) > 1) {
    /* Round the input down to a whole number of MINBPC(enc)-byte units so
       that the `ptr == end' tests below are reliable. */
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_QUOT:
    return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_APOS:
    return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_LT:
    {
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      switch (BYTE_TYPE(enc, ptr)) {
      case BT_EXCL:
        return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
      case BT_QUEST:
        return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
      case BT_NMSTRT:
      case BT_HEX:
      case BT_NONASCII:
      case BT_LEAD2:
      case BT_LEAD3:
      case BT_LEAD4:
        /* looks like a start-tag: hand the '<' itself back to the caller;
           the name is not validated here */
        *nextTokPtr = ptr - MINBPC(enc);
        return XML_TOK_INSTANCE_START;
      }
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  case BT_CR:
    if (ptr + MINBPC(enc) == end) {
      *nextTokPtr = end;
      /* indicate that this might be part of a CR/LF pair */
      return -XML_TOK_PROLOG_S;
    }
    /* fall through */
  case BT_S: case BT_LF:
    /* consume a run of white space */
    for (;;) {
      ptr += MINBPC(enc);
      if (ptr == end)
        break;
      switch (BYTE_TYPE(enc, ptr)) {
      case BT_S: case BT_LF:
        break;
      case BT_CR:
        /* don't split CR/LF pair */
        if (ptr + MINBPC(enc) != end)
          break;
        /* fall through */
      default:
        *nextTokPtr = ptr;
        return XML_TOK_PROLOG_S;
      }
    }
    *nextTokPtr = ptr;
    return XML_TOK_PROLOG_S;
  case BT_PERCNT:
    return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_COMMA:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_COMMA;
  case BT_LSQB:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OPEN_BRACKET;
  case BT_RSQB:
    ptr += MINBPC(enc);
    if (ptr == end)
      return -XML_TOK_CLOSE_BRACKET;
    /* "]]>" closes a conditional section; any other ']' is a plain
       close bracket */
    if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
      if (ptr + MINBPC(enc) == end)
        return XML_TOK_PARTIAL;
      if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
        *nextTokPtr = ptr + 2*MINBPC(enc);
        return XML_TOK_COND_SECT_CLOSE;
      }
    }
    *nextTokPtr = ptr;
    return XML_TOK_CLOSE_BRACKET;
  case BT_LPAR:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OPEN_PAREN;
  case BT_RPAR:
    ptr += MINBPC(enc);
    if (ptr == end)
      return -XML_TOK_CLOSE_PAREN;
    /* ')' may carry a '*', '?' or '+' suffix, which becomes part of the
       token; the other accepted followers are left for the next call */
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_AST:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_ASTERISK;
    case BT_QUEST:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_QUESTION;
    case BT_PLUS:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_PLUS;
    case BT_CR: case BT_LF: case BT_S:
    case BT_GT: case BT_COMMA: case BT_VERBAR:
    case BT_RPAR:
      *nextTokPtr = ptr;
      return XML_TOK_CLOSE_PAREN;
    }
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  case BT_VERBAR:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OR;
  case BT_GT:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DECL_CLOSE;
  case BT_NUM:
    return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  /* Multi-byte first character: a name-start character begins a
     XML_TOK_NAME, any other name character begins a XML_TOK_NMTOKEN. */
#define LEAD_CASE(n) \
  case BT_LEAD ## n: \
    if (end - ptr < n) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
      ptr += n; \
      tok = XML_TOK_NAME; \
      break; \
    } \
    if (IS_NAME_CHAR(enc, ptr, n)) { \
      ptr += n; \
      tok = XML_TOK_NMTOKEN; \
      break; \
    } \
    *nextTokPtr = ptr; \
    return XML_TOK_INVALID;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
  case BT_NMSTRT:
  case BT_HEX:
    tok = XML_TOK_NAME;
    ptr += MINBPC(enc);
    break;
  case BT_DIGIT:
  case BT_NAME:
  case BT_MINUS:
#ifdef XML_NS
  case BT_COLON:
#endif
    tok = XML_TOK_NMTOKEN;
    ptr += MINBPC(enc);
    break;
  case BT_NONASCII:
    if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
      ptr += MINBPC(enc);
      tok = XML_TOK_NAME;
      break;
    }
    if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
      ptr += MINBPC(enc);
      tok = XML_TOK_NMTOKEN;
      break;
    }
    /* fall through */
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  /* Only the name / name-token cases break out of the switch above; scan
     the remaining name characters up to a delimiter or suffix. */
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    case BT_GT: case BT_RPAR: case BT_COMMA:
    case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
    case BT_S: case BT_CR: case BT_LF:
      *nextTokPtr = ptr;
      return tok;
#ifdef XML_NS
    case BT_COLON:
      /* The first ':' in a name turns it into a prefixed name when a name
         character follows; any other ':' downgrades the token to a name
         token. */
      ptr += MINBPC(enc);
      switch (tok) {
      case XML_TOK_NAME:
        if (ptr == end)
          return XML_TOK_PARTIAL;
        tok = XML_TOK_PREFIXED_NAME;
        switch (BYTE_TYPE(enc, ptr)) {
        CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
        default:
          /* not consumed here; the outer loop looks at it again */
          tok = XML_TOK_NMTOKEN;
          break;
        }
        break;
      case XML_TOK_PREFIXED_NAME:
        tok = XML_TOK_NMTOKEN;
        break;
      }
      break;
#endif
    /* '+', '*' and '?' suffixes are rejected after a name token */
    case BT_PLUS:
      if (tok == XML_TOK_NMTOKEN)  {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_PLUS;
    case BT_AST:
      if (tok == XML_TOK_NMTOKEN)  {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_ASTERISK;
    case BT_QUEST:
      if (tok == XML_TOK_NMTOKEN)  {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_QUESTION;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  /* input ended while still inside the name */
  return -tok;
}
1201 | |
1202 | static int PTRCALL |
1203 | PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, |
1204 | const char *end, const char **nextTokPtr) |
1205 | { |
1206 | const char *start; |
1207 | if (ptr == end) |
1208 | return XML_TOK_NONE; |
1209 | start = ptr; |
1210 | while (ptr != end) { |
1211 | switch (BYTE_TYPE(enc, ptr)) { |
1212 | #define LEAD_CASE(n) \ |
1213 | case BT_LEAD ## n: ptr += n; break; |
1214 | LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) |
1215 | #undef LEAD_CASE |
1216 | case BT_AMP: |
1217 | if (ptr == start) |
1218 | return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); |
1219 | *nextTokPtr = ptr; |
1220 | return XML_TOK_DATA_CHARS; |
1221 | case BT_LT: |
1222 | /* this is for inside entity references */ |
1223 | *nextTokPtr = ptr; |
1224 | return XML_TOK_INVALID; |
1225 | case BT_LF: |
1226 | if (ptr == start) { |
1227 | *nextTokPtr = ptr + MINBPC(enc); |
1228 | return XML_TOK_DATA_NEWLINE; |
1229 | } |
1230 | *nextTokPtr = ptr; |
1231 | return XML_TOK_DATA_CHARS; |
1232 | case BT_CR: |
1233 | if (ptr == start) { |
1234 | ptr += MINBPC(enc); |
1235 | if (ptr == end) |
1236 | return XML_TOK_TRAILING_CR; |
1237 | if (BYTE_TYPE(enc, ptr) == BT_LF) |
1238 | ptr += MINBPC(enc); |
1239 | *nextTokPtr = ptr; |
1240 | return XML_TOK_DATA_NEWLINE; |
1241 | } |
1242 | *nextTokPtr = ptr; |
1243 | return XML_TOK_DATA_CHARS; |
1244 | case BT_S: |
1245 | if (ptr == start) { |
1246 | *nextTokPtr = ptr + MINBPC(enc); |
1247 | return XML_TOK_ATTRIBUTE_VALUE_S; |
1248 | } |
1249 | *nextTokPtr = ptr; |
1250 | return XML_TOK_DATA_CHARS; |
1251 | default: |
1252 | ptr += MINBPC(enc); |
1253 | break; |
1254 | } |
1255 | } |
1256 | *nextTokPtr = ptr; |
1257 | return XML_TOK_DATA_CHARS; |
1258 | } |
1259 | |
1260 | static int PTRCALL |
1261 | PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, |
1262 | const char *end, const char **nextTokPtr) |
1263 | { |
1264 | const char *start; |
1265 | if (ptr == end) |
1266 | return XML_TOK_NONE; |
1267 | start = ptr; |
1268 | while (ptr != end) { |
1269 | switch (BYTE_TYPE(enc, ptr)) { |
1270 | #define LEAD_CASE(n) \ |
1271 | case BT_LEAD ## n: ptr += n; break; |
1272 | LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) |
1273 | #undef LEAD_CASE |
1274 | case BT_AMP: |
1275 | if (ptr == start) |
1276 | return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); |
1277 | *nextTokPtr = ptr; |
1278 | return XML_TOK_DATA_CHARS; |
1279 | case BT_PERCNT: |
1280 | if (ptr == start) { |
1281 | int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc), |
1282 | end, nextTokPtr); |
1283 | return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok; |
1284 | } |
1285 | *nextTokPtr = ptr; |
1286 | return XML_TOK_DATA_CHARS; |
1287 | case BT_LF: |
1288 | if (ptr == start) { |
1289 | *nextTokPtr = ptr + MINBPC(enc); |
1290 | return XML_TOK_DATA_NEWLINE; |
1291 | } |
1292 | *nextTokPtr = ptr; |
1293 | return XML_TOK_DATA_CHARS; |
1294 | case BT_CR: |
1295 | if (ptr == start) { |
1296 | ptr += MINBPC(enc); |
1297 | if (ptr == end) |
1298 | return XML_TOK_TRAILING_CR; |
1299 | if (BYTE_TYPE(enc, ptr) == BT_LF) |
1300 | ptr += MINBPC(enc); |
1301 | *nextTokPtr = ptr; |
1302 | return XML_TOK_DATA_NEWLINE; |
1303 | } |
1304 | *nextTokPtr = ptr; |
1305 | return XML_TOK_DATA_CHARS; |
1306 | default: |
1307 | ptr += MINBPC(enc); |
1308 | break; |
1309 | } |
1310 | } |
1311 | *nextTokPtr = ptr; |
1312 | return XML_TOK_DATA_CHARS; |
1313 | } |
1314 | |
1315 | #ifdef XML_DTD |
1316 | |
1317 | static int PTRCALL |
1318 | PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, |
1319 | const char *end, const char **nextTokPtr) |
1320 | { |
1321 | int level = 0; |
1322 | if (MINBPC(enc) > 1) { |
1323 | size_t n = end - ptr; |
1324 | if (n & (MINBPC(enc) - 1)) { |
1325 | n &= ~(MINBPC(enc) - 1); |
1326 | end = ptr + n; |
1327 | } |
1328 | } |
1329 | while (ptr != end) { |
1330 | switch (BYTE_TYPE(enc, ptr)) { |
1331 | INVALID_CASES(ptr, nextTokPtr) |
1332 | case BT_LT: |
1333 | if ((ptr += MINBPC(enc)) == end) |
1334 | return XML_TOK_PARTIAL; |
1335 | if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) { |
1336 | if ((ptr += MINBPC(enc)) == end) |
1337 | return XML_TOK_PARTIAL; |
1338 | if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) { |
1339 | ++level; |
1340 | ptr += MINBPC(enc); |
1341 | } |
1342 | } |
1343 | break; |
1344 | case BT_RSQB: |
1345 | if ((ptr += MINBPC(enc)) == end) |
1346 | return XML_TOK_PARTIAL; |
1347 | if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) { |
1348 | if ((ptr += MINBPC(enc)) == end) |
1349 | return XML_TOK_PARTIAL; |
1350 | if (CHAR_MATCHES(enc, ptr, ASCII_GT)) { |
1351 | ptr += MINBPC(enc); |
1352 | if (level == 0) { |
1353 | *nextTokPtr = ptr; |
1354 | return XML_TOK_IGNORE_SECT; |
1355 | } |
1356 | --level; |
1357 | } |
1358 | } |
1359 | break; |
1360 | default: |
1361 | ptr += MINBPC(enc); |
1362 | break; |
1363 | } |
1364 | } |
1365 | return XML_TOK_PARTIAL; |
1366 | } |
1367 | |
1368 | #endif /* XML_DTD */ |
1369 | |
1370 | static int PTRCALL |
1371 | PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end, |
1372 | const char **badPtr) |
1373 | { |
1374 | ptr += MINBPC(enc); |
1375 | end -= MINBPC(enc); |
1376 | for (; ptr != end; ptr += MINBPC(enc)) { |
1377 | switch (BYTE_TYPE(enc, ptr)) { |
1378 | case BT_DIGIT: |
1379 | case BT_HEX: |
1380 | case BT_MINUS: |
1381 | case BT_APOS: |
1382 | case BT_LPAR: |
1383 | case BT_RPAR: |
1384 | case BT_PLUS: |
1385 | case BT_COMMA: |
1386 | case BT_SOL: |
1387 | case BT_EQUALS: |
1388 | case BT_QUEST: |
1389 | case BT_CR: |
1390 | case BT_LF: |
1391 | case BT_SEMI: |
1392 | case BT_EXCL: |
1393 | case BT_AST: |
1394 | case BT_PERCNT: |
1395 | case BT_NUM: |
1396 | #ifdef XML_NS |
1397 | case BT_COLON: |
1398 | #endif |
1399 | break; |
1400 | case BT_S: |
1401 | if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) { |
1402 | *badPtr = ptr; |
1403 | return 0; |
1404 | } |
1405 | break; |
1406 | case BT_NAME: |
1407 | case BT_NMSTRT: |
1408 | if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f)) |
1409 | break; |
1410 | default: |
1411 | switch (BYTE_TO_ASCII(enc, ptr)) { |
1412 | case 0x24: /* $ */ |
1413 | case 0x40: /* @ */ |
1414 | break; |
1415 | default: |
1416 | *badPtr = ptr; |
1417 | return 0; |
1418 | } |
1419 | break; |
1420 | } |
1421 | } |
1422 | return 1; |
1423 | } |
1424 | |
1425 | /* This must only be called for a well-formed start-tag or empty |
1426 | element tag. Returns the number of attributes. Pointers to the |
1427 | first attsMax attributes are stored in atts. |
1428 | */ |
1429 | |
1430 | static int PTRCALL |
1431 | PREFIX(getAtts)(const ENCODING *enc, const char *ptr, |
1432 | int attsMax, ATTRIBUTE *atts) |
1433 | { |
1434 | enum { other, inName, inValue } state = inName; |
1435 | int nAtts = 0; |
1436 | int open = 0; /* defined when state == inValue; |
1437 | initialization just to shut up compilers */ |
1438 | |
1439 | for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) { |
1440 | switch (BYTE_TYPE(enc, ptr)) { |
1441 | #define START_NAME \ |
1442 | if (state == other) { \ |
1443 | if (nAtts < attsMax) { \ |
1444 | atts[nAtts].name = ptr; \ |
1445 | atts[nAtts].normalized = 1; \ |
1446 | } \ |
1447 | state = inName; \ |
1448 | } |
1449 | #define LEAD_CASE(n) \ |
1450 | case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break; |
1451 | LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) |
1452 | #undef LEAD_CASE |
1453 | case BT_NONASCII: |
1454 | case BT_NMSTRT: |
1455 | case BT_HEX: |
1456 | START_NAME |
1457 | break; |
1458 | #undef START_NAME |
1459 | case BT_QUOT: |
1460 | if (state != inValue) { |
1461 | if (nAtts < attsMax) |
1462 | atts[nAtts].valuePtr = ptr + MINBPC(enc); |
1463 | state = inValue; |
1464 | open = BT_QUOT; |
1465 | } |
1466 | else if (open == BT_QUOT) { |
1467 | state = other; |
1468 | if (nAtts < attsMax) |
1469 | atts[nAtts].valueEnd = ptr; |
1470 | nAtts++; |
1471 | } |
1472 | break; |
1473 | case BT_APOS: |
1474 | if (state != inValue) { |
1475 | if (nAtts < attsMax) |
1476 | atts[nAtts].valuePtr = ptr + MINBPC(enc); |
1477 | state = inValue; |
1478 | open = BT_APOS; |
1479 | } |
1480 | else if (open == BT_APOS) { |
1481 | state = other; |
1482 | if (nAtts < attsMax) |
1483 | atts[nAtts].valueEnd = ptr; |
1484 | nAtts++; |
1485 | } |
1486 | break; |
1487 | case BT_AMP: |
1488 | if (nAtts < attsMax) |
1489 | atts[nAtts].normalized = 0; |
1490 | break; |
1491 | case BT_S: |
1492 | if (state == inName) |
1493 | state = other; |
1494 | else if (state == inValue |
1495 | && nAtts < attsMax |
1496 | && atts[nAtts].normalized |
1497 | && (ptr == atts[nAtts].valuePtr |
1498 | || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE |
1499 | || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE |
1500 | || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open)) |
1501 | atts[nAtts].normalized = 0; |
1502 | break; |
1503 | case BT_CR: case BT_LF: |
1504 | /* This case ensures that the first attribute name is counted |
1505 | Apart from that we could just change state on the quote. */ |
1506 | if (state == inName) |
1507 | state = other; |
1508 | else if (state == inValue && nAtts < attsMax) |
1509 | atts[nAtts].normalized = 0; |
1510 | break; |
1511 | case BT_GT: |
1512 | case BT_SOL: |
1513 | if (state != inValue) |
1514 | return nAtts; |
1515 | break; |
1516 | default: |
1517 | break; |
1518 | } |
1519 | } |
1520 | /* not reached */ |
1521 | } |
1522 | |
1523 | static int PTRFASTCALL |
1524 | PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) |
1525 | { |
1526 | int result = 0; |
1527 | /* skip &# */ |
1528 | ptr += 2*MINBPC(enc); |
1529 | if (CHAR_MATCHES(enc, ptr, ASCII_x)) { |
1530 | for (ptr += MINBPC(enc); |
1531 | !CHAR_MATCHES(enc, ptr, ASCII_SEMI); |
1532 | ptr += MINBPC(enc)) { |
1533 | int c = BYTE_TO_ASCII(enc, ptr); |
1534 | switch (c) { |
1535 | case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4: |
1536 | case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9: |
1537 | result <<= 4; |
1538 | result |= (c - ASCII_0); |
1539 | break; |
1540 | case ASCII_A: case ASCII_B: case ASCII_C: |
1541 | case ASCII_D: case ASCII_E: case ASCII_F: |
1542 | result <<= 4; |
1543 | result += 10 + (c - ASCII_A); |
1544 | break; |
1545 | case ASCII_a: case ASCII_b: case ASCII_c: |
1546 | case ASCII_d: case ASCII_e: case ASCII_f: |
1547 | result <<= 4; |
1548 | result += 10 + (c - ASCII_a); |
1549 | break; |
1550 | } |
1551 | if (result >= 0x110000) |
1552 | return -1; |
1553 | } |
1554 | } |
1555 | else { |
1556 | for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) { |
1557 | int c = BYTE_TO_ASCII(enc, ptr); |
1558 | result *= 10; |
1559 | result += (c - ASCII_0); |
1560 | if (result >= 0x110000) |
1561 | return -1; |
1562 | } |
1563 | } |
1564 | return checkCharRefNumber(result); |
1565 | } |
1566 | |
1567 | static int PTRCALL |
1568 | PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr, |
1569 | const char *end) |
1570 | { |
1571 | switch ((end - ptr)/MINBPC(enc)) { |
1572 | case 2: |
1573 | if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) { |
1574 | switch (BYTE_TO_ASCII(enc, ptr)) { |
1575 | case ASCII_l: |
1576 | return ASCII_LT; |
1577 | case ASCII_g: |
1578 | return ASCII_GT; |
1579 | } |
1580 | } |
1581 | break; |
1582 | case 3: |
1583 | if (CHAR_MATCHES(enc, ptr, ASCII_a)) { |
1584 | ptr += MINBPC(enc); |
1585 | if (CHAR_MATCHES(enc, ptr, ASCII_m)) { |
1586 | ptr += MINBPC(enc); |
1587 | if (CHAR_MATCHES(enc, ptr, ASCII_p)) |
1588 | return ASCII_AMP; |
1589 | } |
1590 | } |
1591 | break; |
1592 | case 4: |
1593 | switch (BYTE_TO_ASCII(enc, ptr)) { |
1594 | case ASCII_q: |
1595 | ptr += MINBPC(enc); |
1596 | if (CHAR_MATCHES(enc, ptr, ASCII_u)) { |
1597 | ptr += MINBPC(enc); |
1598 | if (CHAR_MATCHES(enc, ptr, ASCII_o)) { |
1599 | ptr += MINBPC(enc); |
1600 | if (CHAR_MATCHES(enc, ptr, ASCII_t)) |
1601 | return ASCII_QUOT; |
1602 | } |
1603 | } |
1604 | break; |
1605 | case ASCII_a: |
1606 | ptr += MINBPC(enc); |
1607 | if (CHAR_MATCHES(enc, ptr, ASCII_p)) { |
1608 | ptr += MINBPC(enc); |
1609 | if (CHAR_MATCHES(enc, ptr, ASCII_o)) { |
1610 | ptr += MINBPC(enc); |
1611 | if (CHAR_MATCHES(enc, ptr, ASCII_s)) |
1612 | return ASCII_APOS; |
1613 | } |
1614 | } |
1615 | break; |
1616 | } |
1617 | } |
1618 | return 0; |
1619 | } |
1620 | |
1621 | static int PTRCALL |
1622 | PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2) |
1623 | { |
1624 | for (;;) { |
1625 | switch (BYTE_TYPE(enc, ptr1)) { |
1626 | #define LEAD_CASE(n) \ |
1627 | case BT_LEAD ## n: \ |
1628 | if (*ptr1++ != *ptr2++) \ |
1629 | return 0; |
1630 | LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2) |
1631 | #undef LEAD_CASE |
1632 | /* fall through */ |
1633 | if (*ptr1++ != *ptr2++) |
1634 | return 0; |
1635 | break; |
1636 | case BT_NONASCII: |
1637 | case BT_NMSTRT: |
1638 | #ifdef XML_NS |
1639 | case BT_COLON: |
1640 | #endif |
1641 | case BT_HEX: |
1642 | case BT_DIGIT: |
1643 | case BT_NAME: |
1644 | case BT_MINUS: |
1645 | if (*ptr2++ != *ptr1++) |
1646 | return 0; |
1647 | if (MINBPC(enc) > 1) { |
1648 | if (*ptr2++ != *ptr1++) |
1649 | return 0; |
1650 | if (MINBPC(enc) > 2) { |
1651 | if (*ptr2++ != *ptr1++) |
1652 | return 0; |
1653 | if (MINBPC(enc) > 3) { |
1654 | if (*ptr2++ != *ptr1++) |
1655 | return 0; |
1656 | } |
1657 | } |
1658 | } |
1659 | break; |
1660 | default: |
1661 | if (MINBPC(enc) == 1 && *ptr1 == *ptr2) |
1662 | return 1; |
1663 | switch (BYTE_TYPE(enc, ptr2)) { |
1664 | case BT_LEAD2: |
1665 | case BT_LEAD3: |
1666 | case BT_LEAD4: |
1667 | case BT_NONASCII: |
1668 | case BT_NMSTRT: |
1669 | #ifdef XML_NS |
1670 | case BT_COLON: |
1671 | #endif |
1672 | case BT_HEX: |
1673 | case BT_DIGIT: |
1674 | case BT_NAME: |
1675 | case BT_MINUS: |
1676 | return 0; |
1677 | default: |
1678 | return 1; |
1679 | } |
1680 | } |
1681 | } |
1682 | /* not reached */ |
1683 | } |
1684 | |
1685 | static int PTRCALL |
1686 | PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1, |
1687 | const char *end1, const char *ptr2) |
1688 | { |
1689 | for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) { |
1690 | if (ptr1 == end1) |
1691 | return 0; |
1692 | if (!CHAR_MATCHES(enc, ptr1, *ptr2)) |
1693 | return 0; |
1694 | } |
1695 | return ptr1 == end1; |
1696 | } |
1697 | |
1698 | static int PTRFASTCALL |
1699 | PREFIX(nameLength)(const ENCODING *enc, const char *ptr) |
1700 | { |
1701 | const char *start = ptr; |
1702 | for (;;) { |
1703 | switch (BYTE_TYPE(enc, ptr)) { |
1704 | #define LEAD_CASE(n) \ |
1705 | case BT_LEAD ## n: ptr += n; break; |
1706 | LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) |
1707 | #undef LEAD_CASE |
1708 | case BT_NONASCII: |
1709 | case BT_NMSTRT: |
1710 | #ifdef XML_NS |
1711 | case BT_COLON: |
1712 | #endif |
1713 | case BT_HEX: |
1714 | case BT_DIGIT: |
1715 | case BT_NAME: |
1716 | case BT_MINUS: |
1717 | ptr += MINBPC(enc); |
1718 | break; |
1719 | default: |
1720 | return (int)(ptr - start); |
1721 | } |
1722 | } |
1723 | } |
1724 | |
1725 | static const char * PTRFASTCALL |
1726 | PREFIX(skipS)(const ENCODING *enc, const char *ptr) |
1727 | { |
1728 | for (;;) { |
1729 | switch (BYTE_TYPE(enc, ptr)) { |
1730 | case BT_LF: |
1731 | case BT_CR: |
1732 | case BT_S: |
1733 | ptr += MINBPC(enc); |
1734 | break; |
1735 | default: |
1736 | return ptr; |
1737 | } |
1738 | } |
1739 | } |
1740 | |
1741 | static void PTRCALL |
1742 | PREFIX(updatePosition)(const ENCODING *enc, |
1743 | const char *ptr, |
1744 | const char *end, |
1745 | POSITION *pos) |
1746 | { |
1747 | while (ptr < end) { |
1748 | switch (BYTE_TYPE(enc, ptr)) { |
1749 | #define LEAD_CASE(n) \ |
1750 | case BT_LEAD ## n: \ |
1751 | ptr += n; \ |
1752 | break; |
1753 | LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) |
1754 | #undef LEAD_CASE |
1755 | case BT_LF: |
1756 | pos->columnNumber = (XML_Size)-1; |
1757 | pos->lineNumber++; |
1758 | ptr += MINBPC(enc); |
1759 | break; |
1760 | case BT_CR: |
1761 | pos->lineNumber++; |
1762 | ptr += MINBPC(enc); |
1763 | if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF) |
1764 | ptr += MINBPC(enc); |
1765 | pos->columnNumber = (XML_Size)-1; |
1766 | break; |
1767 | default: |
1768 | ptr += MINBPC(enc); |
1769 | break; |
1770 | } |
1771 | pos->columnNumber++; |
1772 | } |
1773 | } |
1774 | |
1775 | #undef DO_LEAD_CASE |
1776 | #undef MULTIBYTE_CASES |
1777 | #undef INVALID_CASES |
1778 | #undef CHECK_NAME_CASE |
1779 | #undef CHECK_NAME_CASES |
1780 | #undef CHECK_NMSTRT_CASE |
1781 | #undef CHECK_NMSTRT_CASES |
1782 | |
1783 | #endif /* XML_TOK_IMPL_C */ |
1784 | |