1 | /* |
2 | www.sourceforge.net/projects/tinyxml |
3 | Original code by Lee Thomason (www.grinninglizard.com) |
4 | |
5 | This software is provided 'as-is', without any express or implied |
6 | warranty. In no event will the authors be held liable for any |
7 | damages arising from the use of this software. |
8 | |
9 | Permission is granted to anyone to use this software for any |
10 | purpose, including commercial applications, and to alter it and |
11 | redistribute it freely, subject to the following restrictions: |
12 | |
13 | 1. The origin of this software must not be misrepresented; you must |
14 | not claim that you wrote the original software. If you use this |
15 | software in a product, an acknowledgment in the product documentation |
16 | would be appreciated but is not required. |
17 | |
18 | 2. Altered source versions must be plainly marked as such, and |
19 | must not be misrepresented as being the original software. |
20 | |
21 | 3. This notice may not be removed or altered from any source |
22 | distribution. |
23 | */ |
24 | |
25 | #include <ctype.h> |
26 | #include <stddef.h> |
27 | |
28 | #include "tinyxml.h" |
29 | |
// Optional parser tracing. Uncomment the define on the next line to make
// TIXML_LOG available: it maps to OutputDebugString when both DEBUG and
// _MSC_VER are defined, and to printf otherwise. When DEBUG_PARSER is not
// defined, TIXML_LOG is not defined at all, so every use of it must itself
// be guarded by DEBUG_PARSER.
//#define DEBUG_PARSER
#if defined( DEBUG_PARSER )
#    if defined( DEBUG ) && defined( _MSC_VER )
#        include <windows.h>
#        define TIXML_LOG OutputDebugString
#    else
#        define TIXML_LOG printf
#    endif
#endif
39 | |
40 | // Note tha "PutString" hardcodes the same list. This |
41 | // is less flexible than it appears. Changing the entries |
42 | // or order will break putstring. |
43 | TiXmlBase::Entity TiXmlBase::entity[ TiXmlBase::NUM_ENTITY ] = |
44 | { |
45 | { "&" , 5, '&' }, |
46 | { "<" , 4, '<' }, |
47 | { ">" , 4, '>' }, |
48 | { """ , 6, '\"' }, |
49 | { "'" , 6, '\'' } |
50 | }; |
51 | |
52 | // Bunch of unicode info at: |
53 | // http://www.unicode.org/faq/utf_bom.html |
54 | // Including the basic of this table, which determines the #bytes in the |
55 | // sequence from the lead byte. 1 placed for invalid sequences -- |
56 | // although the result will be junk, pass it through as much as possible. |
57 | // Beware of the non-characters in UTF-8: |
58 | // ef bb bf (Microsoft "lead bytes") |
59 | // ef bf be |
60 | // ef bf bf |
61 | |
// The three bytes EF BB BF (the Microsoft "lead bytes" that may open a
// UTF-8 stream). They are compile-time constants so they can be used as
// case labels.
static const unsigned char TIXML_UTF_LEAD_0 = 0xEFU;
static const unsigned char TIXML_UTF_LEAD_1 = 0xBBU;
static const unsigned char TIXML_UTF_LEAD_2 = 0xBFU;
65 | |
// Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
// the number of bytes that sequence occupies. Bytes that cannot start a
// valid sequence map to 1, so a scanner that uses the table always moves
// forward by at least one byte.
const int TiXmlBase::utf8ByteTable[256] =
{
    // Column index is the low nibble of the byte, the row comment gives the high nibble.
    //  0   1   2   3   4   5   6   7   8   9   a   b   c   d   e   f
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0x00
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0x10
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0x20
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0x30
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0x40
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0x50
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0x60
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0x70 End of ASCII range
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0x80 0x80 to 0xc1 invalid
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0x90
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0xa0
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0xb0
        1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  // 0xc0 0xc2 to 0xdf 2 byte
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  // 0xd0
        3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  // 0xe0 0xe0 to 0xef 3 byte
        4,  4,  4,  4,  4,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1   // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
};
86 | |
87 | |
88 | void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length ) |
89 | { |
90 | const unsigned long BYTE_MASK = 0xBF; |
91 | const unsigned long BYTE_MARK = 0x80; |
92 | const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; |
93 | |
94 | if (input < 0x80) |
95 | *length = 1; |
96 | else if ( input < 0x800 ) |
97 | *length = 2; |
98 | else if ( input < 0x10000 ) |
99 | *length = 3; |
100 | else if ( input < 0x200000 ) |
101 | *length = 4; |
102 | else |
103 | { *length = 0; return; } // This code won't covert this correctly anyway. |
104 | |
105 | output += *length; |
106 | |
107 | // Scary scary fall throughs. |
108 | switch (*length) |
109 | { |
110 | case 4: |
111 | --output; |
112 | *output = (char)((input | BYTE_MARK) & BYTE_MASK); |
113 | input >>= 6; |
114 | case 3: |
115 | --output; |
116 | *output = (char)((input | BYTE_MARK) & BYTE_MASK); |
117 | input >>= 6; |
118 | case 2: |
119 | --output; |
120 | *output = (char)((input | BYTE_MARK) & BYTE_MASK); |
121 | input >>= 6; |
122 | case 1: |
123 | --output; |
124 | *output = (char)(input | FIRST_BYTE_MARK[*length]); |
125 | } |
126 | } |
127 | |
128 | |
129 | /*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /*encoding*/ ) |
130 | { |
131 | // This will only work for low-ascii, everything else is assumed to be a valid |
132 | // letter. I'm not sure this is the best approach, but it is quite tricky trying |
133 | // to figure out alhabetical vs. not across encoding. So take a very |
134 | // conservative approach. |
135 | |
136 | // if ( encoding == TIXML_ENCODING_UTF8 ) |
137 | // { |
138 | if ( anyByte < 127 ) |
139 | return isalpha( anyByte ); |
140 | else |
141 | return 1; // What else to do? The unicode set is huge...get the english ones right. |
142 | // } |
143 | // else |
144 | // { |
145 | // return isalpha( anyByte ); |
146 | // } |
147 | } |
148 | |
149 | |
150 | /*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /*encoding*/ ) |
151 | { |
152 | // This will only work for low-ascii, everything else is assumed to be a valid |
153 | // letter. I'm not sure this is the best approach, but it is quite tricky trying |
154 | // to figure out alhabetical vs. not across encoding. So take a very |
155 | // conservative approach. |
156 | |
157 | // if ( encoding == TIXML_ENCODING_UTF8 ) |
158 | // { |
159 | if ( anyByte < 127 ) |
160 | return isalnum( anyByte ); |
161 | else |
162 | return 1; // What else to do? The unicode set is huge...get the english ones right. |
163 | // } |
164 | // else |
165 | // { |
166 | // return isalnum( anyByte ); |
167 | // } |
168 | } |
169 | |
170 | |
171 | class TiXmlParsingData |
172 | { |
173 | friend class TiXmlDocument; |
174 | public: |
175 | void Stamp( const char* now, TiXmlEncoding encoding ); |
176 | |
177 | const TiXmlCursor& Cursor() const { return cursor; } |
178 | |
179 | private: |
180 | // Only used by the document! |
181 | TiXmlParsingData( const char* start, int _tabsize, int row, int col ) |
182 | { |
183 | assert( start ); |
184 | stamp = start; |
185 | tabsize = _tabsize; |
186 | cursor.row = row; |
187 | cursor.col = col; |
188 | } |
189 | |
190 | TiXmlCursor cursor; |
191 | const char* stamp; |
192 | int tabsize; |
193 | }; |
194 | |
195 | |
196 | void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding ) |
197 | { |
198 | assert( now ); |
199 | |
200 | // Do nothing if the tabsize is 0. |
201 | if ( tabsize < 1 ) |
202 | { |
203 | return; |
204 | } |
205 | |
206 | // Get the current row, column. |
207 | int row = cursor.row; |
208 | int col = cursor.col; |
209 | const char* p = stamp; |
210 | assert( p ); |
211 | |
212 | while ( p < now ) |
213 | { |
214 | // Treat p as unsigned, so we have a happy compiler. |
215 | const unsigned char* pU = (const unsigned char*)p; |
216 | |
217 | // Code contributed by Fletcher Dunn: (modified by lee) |
218 | switch (*pU) { |
219 | case 0: |
220 | // We *should* never get here, but in case we do, don't |
221 | // advance past the terminating null character, ever |
222 | return; |
223 | |
224 | case '\r': |
225 | // bump down to the next line |
226 | ++row; |
227 | col = 0; |
228 | // Eat the character |
229 | ++p; |
230 | |
231 | // Check for \r\n sequence, and treat this as a single character |
232 | if (*p == '\n') { |
233 | ++p; |
234 | } |
235 | break; |
236 | |
237 | case '\n': |
238 | // bump down to the next line |
239 | ++row; |
240 | col = 0; |
241 | |
242 | // Eat the character |
243 | ++p; |
244 | |
245 | // Check for \n\r sequence, and treat this as a single |
246 | // character. (Yes, this bizarre thing does occur still |
247 | // on some arcane platforms...) |
248 | if (*p == '\r') { |
249 | ++p; |
250 | } |
251 | break; |
252 | |
253 | case '\t': |
254 | // Eat the character |
255 | ++p; |
256 | |
257 | // Skip to next tab stop |
258 | col = (col / tabsize + 1) * tabsize; |
259 | break; |
260 | |
261 | case TIXML_UTF_LEAD_0: |
262 | if ( encoding == TIXML_ENCODING_UTF8 ) |
263 | { |
264 | if ( *(p+1) && *(p+2) ) |
265 | { |
266 | // In these cases, don't advance the column. These are |
267 | // 0-width spaces. |
268 | if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 ) |
269 | p += 3; |
270 | else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU ) |
271 | p += 3; |
272 | else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU ) |
273 | p += 3; |
274 | else |
275 | { p +=3; ++col; } // A normal character. |
276 | } |
277 | } |
278 | else |
279 | { |
280 | ++p; |
281 | ++col; |
282 | } |
283 | break; |
284 | |
285 | default: |
286 | if ( encoding == TIXML_ENCODING_UTF8 ) |
287 | { |
288 | // Eat the 1 to 4 byte utf8 character. |
289 | int step = TiXmlBase::utf8ByteTable[*((const unsigned char*)p)]; |
290 | if ( step == 0 ) |
291 | step = 1; // Error case from bad encoding, but handle gracefully. |
292 | p += step; |
293 | |
294 | // Just advance one column, of course. |
295 | ++col; |
296 | } |
297 | else |
298 | { |
299 | ++p; |
300 | ++col; |
301 | } |
302 | break; |
303 | } |
304 | } |
305 | cursor.row = row; |
306 | cursor.col = col; |
307 | assert( cursor.row >= -1 ); |
308 | assert( cursor.col >= -1 ); |
309 | stamp = p; |
310 | assert( stamp ); |
311 | } |
312 | |
313 | |
314 | const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding ) |
315 | { |
316 | if ( !p || !*p ) |
317 | { |
318 | return 0; |
319 | } |
320 | if ( encoding == TIXML_ENCODING_UTF8 ) |
321 | { |
322 | while ( *p ) |
323 | { |
324 | const unsigned char* pU = (const unsigned char*)p; |
325 | |
326 | // Skip the stupid Microsoft UTF-8 Byte order marks |
327 | if ( *(pU+0)==TIXML_UTF_LEAD_0 |
328 | && *(pU+1)==TIXML_UTF_LEAD_1 |
329 | && *(pU+2)==TIXML_UTF_LEAD_2 ) |
330 | { |
331 | p += 3; |
332 | continue; |
333 | } |
334 | else if(*(pU+0)==TIXML_UTF_LEAD_0 |
335 | && *(pU+1)==0xbfU |
336 | && *(pU+2)==0xbeU ) |
337 | { |
338 | p += 3; |
339 | continue; |
340 | } |
341 | else if(*(pU+0)==TIXML_UTF_LEAD_0 |
342 | && *(pU+1)==0xbfU |
343 | && *(pU+2)==0xbfU ) |
344 | { |
345 | p += 3; |
346 | continue; |
347 | } |
348 | |
349 | if ( IsWhiteSpace( *p ) ) // Still using old rules for white space. |
350 | ++p; |
351 | else |
352 | break; |
353 | } |
354 | } |
355 | else |
356 | { |
357 | while ( *p && IsWhiteSpace( *p ) ) |
358 | ++p; |
359 | } |
360 | |
361 | return p; |
362 | } |
363 | |
364 | #ifdef TIXML_USE_STL |
365 | /*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream * in, TIXML_STRING * tag ) |
366 | { |
367 | for( ;; ) |
368 | { |
369 | if ( !in->good() ) return false; |
370 | |
371 | int c = in->peek(); |
372 | // At this scope, we can't get to a document. So fail silently. |
373 | if ( !IsWhiteSpace( c ) || c <= 0 ) |
374 | return true; |
375 | |
376 | *tag += (char) in->get(); |
377 | } |
378 | } |
379 | |
380 | /*static*/ bool TiXmlBase::StreamTo( std::istream * in, int character, TIXML_STRING * tag ) |
381 | { |
382 | //assert( character > 0 && character < 128 ); // else it won't work in utf-8 |
383 | while ( in->good() ) |
384 | { |
385 | int c = in->peek(); |
386 | if ( c == character ) |
387 | return true; |
388 | if ( c <= 0 ) // Silent failure: can't get document at this scope |
389 | return false; |
390 | |
391 | in->get(); |
392 | *tag += (char) c; |
393 | } |
394 | return false; |
395 | } |
396 | #endif |
397 | |
398 | // One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The |
399 | // "assign" optimization removes over 10% of the execution time. |
400 | // |
401 | const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding ) |
402 | { |
403 | // Oddly, not supported on some comilers, |
404 | //name->clear(); |
405 | // So use this: |
406 | *name = "" ; |
407 | assert( p ); |
408 | |
409 | // Names start with letters or underscores. |
410 | // Of course, in unicode, tinyxml has no idea what a letter *is*. The |
411 | // algorithm is generous. |
412 | // |
413 | // After that, they can be letters, underscores, numbers, |
414 | // hyphens, or colons. (Colons are valid ony for namespaces, |
415 | // but tinyxml can't tell namespaces from names.) |
416 | if ( p && *p |
417 | && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) ) |
418 | { |
419 | const char* start = p; |
420 | while( p && *p |
421 | && ( IsAlphaNum( (unsigned char ) *p, encoding ) |
422 | || *p == '_' |
423 | || *p == '-' |
424 | || *p == '.' |
425 | || *p == ':' ) ) |
426 | { |
427 | //(*name) += *p; // expensive |
428 | ++p; |
429 | } |
430 | if ( p-start > 0 ) { |
431 | name->assign( start, p-start ); |
432 | } |
433 | return p; |
434 | } |
435 | return 0; |
436 | } |
437 | |
438 | const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding ) |
439 | { |
440 | // Presume an entity, and pull it out. |
441 | TIXML_STRING ent; |
442 | int i; |
443 | *length = 0; |
444 | |
445 | if ( *(p+1) && *(p+1) == '#' && *(p+2) ) |
446 | { |
447 | unsigned long ucs = 0; |
448 | ptrdiff_t delta = 0; |
449 | unsigned mult = 1; |
450 | |
451 | if ( *(p+2) == 'x' ) |
452 | { |
453 | // Hexadecimal. |
454 | if ( !*(p+3) ) return 0; |
455 | |
456 | const char* q = p+3; |
457 | q = strchr( q, ';' ); |
458 | |
459 | if ( !q || !*q ) return 0; |
460 | |
461 | delta = q-p; |
462 | --q; |
463 | |
464 | while ( *q != 'x' ) |
465 | { |
466 | if ( *q >= '0' && *q <= '9' ) |
467 | ucs += mult * (*q - '0'); |
468 | else if ( *q >= 'a' && *q <= 'f' ) |
469 | ucs += mult * (*q - 'a' + 10); |
470 | else if ( *q >= 'A' && *q <= 'F' ) |
471 | ucs += mult * (*q - 'A' + 10 ); |
472 | else |
473 | return 0; |
474 | mult *= 16; |
475 | --q; |
476 | } |
477 | } |
478 | else |
479 | { |
480 | // Decimal. |
481 | if ( !*(p+2) ) return 0; |
482 | |
483 | const char* q = p+2; |
484 | q = strchr( q, ';' ); |
485 | |
486 | if ( !q || !*q ) return 0; |
487 | |
488 | delta = q-p; |
489 | --q; |
490 | |
491 | while ( *q != '#' ) |
492 | { |
493 | if ( *q >= '0' && *q <= '9' ) |
494 | ucs += mult * (*q - '0'); |
495 | else |
496 | return 0; |
497 | mult *= 10; |
498 | --q; |
499 | } |
500 | } |
501 | if ( encoding == TIXML_ENCODING_UTF8 ) |
502 | { |
503 | // convert the UCS to UTF-8 |
504 | ConvertUTF32ToUTF8( ucs, value, length ); |
505 | } |
506 | else |
507 | { |
508 | *value = (char)ucs; |
509 | *length = 1; |
510 | } |
511 | return p + delta + 1; |
512 | } |
513 | |
514 | // Now try to match it. |
515 | for( i=0; i<NUM_ENTITY; ++i ) |
516 | { |
517 | if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 ) |
518 | { |
519 | assert( strlen( entity[i].str ) == entity[i].strLength ); |
520 | *value = entity[i].chr; |
521 | *length = 1; |
522 | return ( p + entity[i].strLength ); |
523 | } |
524 | } |
525 | |
526 | // So it wasn't an entity, its unrecognized, or something like that. |
527 | *value = *p; // Don't put back the last one, since we return it! |
528 | //*length = 1; // Leave unrecognized entities - this doesn't really work. |
529 | // Just writes strange XML. |
530 | return p+1; |
531 | } |
532 | |
533 | |
534 | bool TiXmlBase::StringEqual( const char* p, |
535 | const char* tag, |
536 | bool ignoreCase, |
537 | TiXmlEncoding encoding ) |
538 | { |
539 | assert( p ); |
540 | assert( tag ); |
541 | if ( !p || !*p ) |
542 | { |
543 | assert( 0 ); |
544 | return false; |
545 | } |
546 | |
547 | const char* q = p; |
548 | |
549 | if ( ignoreCase ) |
550 | { |
551 | while ( *q && *tag && ToLower( *q, encoding ) == ToLower( *tag, encoding ) ) |
552 | { |
553 | ++q; |
554 | ++tag; |
555 | } |
556 | |
557 | if ( *tag == 0 ) |
558 | return true; |
559 | } |
560 | else |
561 | { |
562 | while ( *q && *tag && *q == *tag ) |
563 | { |
564 | ++q; |
565 | ++tag; |
566 | } |
567 | |
568 | if ( *tag == 0 ) // Have we found the end of the tag, and everything equal? |
569 | return true; |
570 | } |
571 | return false; |
572 | } |
573 | |
574 | const char* TiXmlBase::ReadText( const char* p, |
575 | TIXML_STRING * text, |
576 | bool trimWhiteSpace, |
577 | const char* endTag, |
578 | bool caseInsensitive, |
579 | TiXmlEncoding encoding ) |
580 | { |
581 | *text = "" ; |
582 | if ( !trimWhiteSpace // certain tags always keep whitespace |
583 | || !condenseWhiteSpace ) // if true, whitespace is always kept |
584 | { |
585 | // Keep all the white space. |
586 | while ( p && *p |
587 | && !StringEqual( p, endTag, caseInsensitive, encoding ) |
588 | ) |
589 | { |
590 | int len; |
591 | char cArr[4] = { 0, 0, 0, 0 }; |
592 | p = GetChar( p, cArr, &len, encoding ); |
593 | text->append( cArr, len ); |
594 | } |
595 | } |
596 | else |
597 | { |
598 | bool whitespace = false; |
599 | |
600 | // Remove leading white space: |
601 | p = SkipWhiteSpace( p, encoding ); |
602 | while ( p && *p |
603 | && !StringEqual( p, endTag, caseInsensitive, encoding ) ) |
604 | { |
605 | if ( *p == '\r' || *p == '\n' ) |
606 | { |
607 | whitespace = true; |
608 | ++p; |
609 | } |
610 | else if ( IsWhiteSpace( *p ) ) |
611 | { |
612 | whitespace = true; |
613 | ++p; |
614 | } |
615 | else |
616 | { |
617 | // If we've found whitespace, add it before the |
618 | // new character. Any whitespace just becomes a space. |
619 | if ( whitespace ) |
620 | { |
621 | (*text) += ' '; |
622 | whitespace = false; |
623 | } |
624 | int len; |
625 | char cArr[4] = { 0, 0, 0, 0 }; |
626 | p = GetChar( p, cArr, &len, encoding ); |
627 | if ( len == 1 ) |
628 | (*text) += cArr[0]; // more efficient |
629 | else |
630 | text->append( cArr, len ); |
631 | } |
632 | } |
633 | } |
634 | if ( p && *p ) |
635 | p += strlen( endTag ); |
636 | return ( p && *p ) ? p : 0; |
637 | } |
638 | |
639 | #ifdef TIXML_USE_STL |
640 | |
// Streams a document from 'in', appending the raw text read to 'tag'.
//
// The basic issue with a document is that we don't know what we're
// streaming. Read something presumed to be a tag (and hope), then
// identify it, and call the appropriate stream method on the tag.
//
// This "pre-streaming" will never read the closing ">" so the
// sub-tag can orient itself.
//
// Returns as soon as a node identified as an element has been streamed.
// Errors are reported through SetError(): TIXML_ERROR_PARSING_EMPTY when no
// '<' is found, TIXML_ERROR_EMBEDDED_NULL when a character <= 0 is read, and
// TIXML_ERROR when a tag cannot be identified or the stream ends before an
// element was streamed.
void TiXmlDocument::StreamIn( std::istream * in, TIXML_STRING * tag )
{
    // Copy everything in front of the first '<' into the tag buffer.
    if ( !StreamTo( in, '<', tag ) )
    {
        SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
        return;
    }

    while ( in->good() )
    {
        // Where the text of the node about to be read starts inside 'tag'.
        int tagIndex = (int) tag->length();
        while ( in->good() && in->peek() != '>' )
        {
            int c = in->get();
            if ( c <= 0 )
            {
                SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
                break;
            }
            (*tag) += (char) c;
        }

        if ( in->good() )
        {
            // We now have something we presume to be a node of
            // some sort. Identify it, and call the node to
            // continue streaming.
            TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );

            if ( node )
            {
                // The node is used only to drive the streaming; it is
                // deleted again right after.
                node->StreamIn( in, tag );
                bool isElement = node->ToElement() != 0;
                delete node;
                node = 0;

                // If this is the root element, we're done. Parsing will be
                // done by the >> operator.
                if ( isElement )
                {
                    return;
                }
            }
            else
            {
                SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
                return;
            }
        }
    }
    // We should have returned sooner.
    SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
}
701 | |
702 | #endif |
703 | |
704 | const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding ) |
705 | { |
706 | ClearError(); |
707 | |
708 | // Parse away, at the document level. Since a document |
709 | // contains nothing but other tags, most of what happens |
710 | // here is skipping white space. |
711 | if ( !p || !*p ) |
712 | { |
713 | SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN ); |
714 | return 0; |
715 | } |
716 | |
717 | // Note that, for a document, this needs to come |
718 | // before the while space skip, so that parsing |
719 | // starts from the pointer we are given. |
720 | location.Clear(); |
721 | if ( prevData ) |
722 | { |
723 | location.row = prevData->cursor.row; |
724 | location.col = prevData->cursor.col; |
725 | } |
726 | else |
727 | { |
728 | location.row = 0; |
729 | location.col = 0; |
730 | } |
731 | TiXmlParsingData data( p, TabSize(), location.row, location.col ); |
732 | location = data.Cursor(); |
733 | |
734 | if ( encoding == TIXML_ENCODING_UNKNOWN ) |
735 | { |
736 | // Check for the Microsoft UTF-8 lead bytes. |
737 | const unsigned char* pU = (const unsigned char*)p; |
738 | if ( *(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0 |
739 | && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1 |
740 | && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 ) |
741 | { |
742 | encoding = TIXML_ENCODING_UTF8; |
743 | useMicrosoftBOM = true; |
744 | } |
745 | } |
746 | |
747 | p = SkipWhiteSpace( p, encoding ); |
748 | if ( !p ) |
749 | { |
750 | SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN ); |
751 | return 0; |
752 | } |
753 | |
754 | while ( p && *p ) |
755 | { |
756 | TiXmlNode* node = Identify( p, encoding ); |
757 | if ( node ) |
758 | { |
759 | p = node->Parse( p, &data, encoding ); |
760 | LinkEndChild( node ); |
761 | } |
762 | else |
763 | { |
764 | break; |
765 | } |
766 | |
767 | // Did we get encoding info? |
768 | if ( encoding == TIXML_ENCODING_UNKNOWN |
769 | && node->ToDeclaration() ) |
770 | { |
771 | TiXmlDeclaration* dec = node->ToDeclaration(); |
772 | const char* enc = dec->Encoding(); |
773 | assert( enc ); |
774 | |
775 | if ( *enc == 0 ) |
776 | encoding = TIXML_ENCODING_UTF8; |
777 | else if ( StringEqual( enc, "UTF-8" , true, TIXML_ENCODING_UNKNOWN ) ) |
778 | encoding = TIXML_ENCODING_UTF8; |
779 | else if ( StringEqual( enc, "UTF8" , true, TIXML_ENCODING_UNKNOWN ) ) |
780 | encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice |
781 | else |
782 | encoding = TIXML_ENCODING_LEGACY; |
783 | } |
784 | |
785 | p = SkipWhiteSpace( p, encoding ); |
786 | } |
787 | |
788 | // Was this empty? |
789 | if ( !firstChild ) { |
790 | SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding ); |
791 | return 0; |
792 | } |
793 | |
794 | // All is well. |
795 | return p; |
796 | } |
797 | |
798 | void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding ) |
799 | { |
800 | // The first error in a chain is more accurate - don't set again! |
801 | if ( error ) |
802 | return; |
803 | |
804 | assert( err > 0 && err < TIXML_ERROR_STRING_COUNT ); |
805 | error = true; |
806 | errorId = err; |
807 | errorDesc = errorString[ errorId ]; |
808 | |
809 | errorLocation.Clear(); |
810 | if ( pError && data ) |
811 | { |
812 | data->Stamp( pError, encoding ); |
813 | errorLocation = data->Cursor(); |
814 | } |
815 | } |
816 | |
817 | |
818 | TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding ) |
819 | { |
820 | TiXmlNode* returnNode = 0; |
821 | |
822 | p = SkipWhiteSpace( p, encoding ); |
823 | if( !p || !*p || *p != '<' ) |
824 | { |
825 | return 0; |
826 | } |
827 | |
828 | p = SkipWhiteSpace( p, encoding ); |
829 | |
830 | if ( !p || !*p ) |
831 | { |
832 | return 0; |
833 | } |
834 | |
835 | // What is this thing? |
836 | // - Elements start with a letter or underscore, but xml is reserved. |
837 | // - Comments: <!-- |
838 | // - Decleration: <?xml |
839 | // - Everthing else is unknown to tinyxml. |
840 | // |
841 | |
842 | const char* = { "<?xml" }; |
843 | const char* = { "<!--" }; |
844 | const char* = { "<!" }; |
845 | const char* = { "<![CDATA[" }; |
846 | |
847 | if ( StringEqual( p, xmlHeader, true, encoding ) ) |
848 | { |
849 | #ifdef DEBUG_PARSER |
850 | TIXML_LOG( "XML parsing Declaration\n" ); |
851 | #endif |
852 | returnNode = new TiXmlDeclaration(); |
853 | } |
854 | else if ( StringEqual( p, commentHeader, false, encoding ) ) |
855 | { |
856 | #ifdef DEBUG_PARSER |
857 | TIXML_LOG( "XML parsing Comment\n" ); |
858 | #endif |
859 | returnNode = new TiXmlComment(); |
860 | } |
861 | else if ( StringEqual( p, cdataHeader, false, encoding ) ) |
862 | { |
863 | #ifdef DEBUG_PARSER |
864 | TIXML_LOG( "XML parsing CDATA\n" ); |
865 | #endif |
866 | TiXmlText* text = new TiXmlText( "" ); |
867 | text->SetCDATA( true ); |
868 | returnNode = text; |
869 | } |
870 | else if ( StringEqual( p, dtdHeader, false, encoding ) ) |
871 | { |
872 | #ifdef DEBUG_PARSER |
873 | TIXML_LOG( "XML parsing Unknown(1)\n" ); |
874 | #endif |
875 | returnNode = new TiXmlUnknown(); |
876 | } |
877 | else if ( IsAlpha( *(p+1), encoding ) |
878 | || *(p+1) == '_' ) |
879 | { |
880 | #ifdef DEBUG_PARSER |
881 | TIXML_LOG( "XML parsing Element\n" ); |
882 | #endif |
883 | returnNode = new TiXmlElement( "" ); |
884 | } |
885 | else |
886 | { |
887 | #ifdef DEBUG_PARSER |
888 | TIXML_LOG( "XML parsing Unknown(2)\n" ); |
889 | #endif |
890 | returnNode = new TiXmlUnknown(); |
891 | } |
892 | |
893 | if ( returnNode ) |
894 | { |
895 | // Set the parent, so it can report errors |
896 | returnNode->parent = this; |
897 | } |
898 | return returnNode; |
899 | } |
900 | |
901 | #ifdef TIXML_USE_STL |
902 | |
// Streams one complete element from 'in', appending the raw text to 'tag'.
//
// We're called with some amount of pre-parsing. That is, some of "this"
// element is in "tag". Go ahead and stream to the closing ">".
//
// For a "/>" tag the function returns right after the start tag. Otherwise
// it keeps reading text, child nodes and finally the closing tag. Reading a
// character <= 0 reports TIXML_ERROR_EMBEDDED_NULL on the owning document
// (when there is one) and returns; a stream that stops being good() makes
// the function return without reporting anything.
void TiXmlElement::StreamIn (std::istream * in, TIXML_STRING * tag)
{
    // Read the remainder of the start tag, up to and including its '>'.
    while( in->good() )
    {
        int c = in->get();
        if ( c <= 0 )
        {
            TiXmlDocument* document = GetDocument();
            if ( document )
                document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
            return;
        }
        (*tag) += (char) c ;

        if ( c == '>' )
            break;
    }

    // With fewer than 3 characters the at() calls below would not be safe.
    if ( tag->length() < 3 ) return;

    // Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
    // If not, identify and stream.

    if (    tag->at( tag->length() - 1 ) == '>'
         && tag->at( tag->length() - 2 ) == '/' )
    {
        // All good!
        return;
    }
    else if ( tag->at( tag->length() - 1 ) == '>' )
    {
        // There is more. Could be:
        //      text
        //      cdata text (which looks like another node)
        //      closing tag
        //      another node.
        for ( ;; )
        {
            StreamWhiteSpace( in, tag );

            // Do we have text?
            if ( in->good() && in->peek() != '<' )
            {
                // Yep, text. The temporary node is used only to drive the
                // streaming into 'tag'.
                TiXmlText text( "" );
                text.StreamIn( in, tag );

                // What follows text is a closing tag or another node.
                // Go around again and figure it out.
                continue;
            }

            // We now have either a closing tag...or another node.
            // We should be at a "<", regardless.
            if ( !in->good() ) return;
            assert( in->peek() == '<' );
            // Where the text of the upcoming tag starts inside 'tag'.
            int tagIndex = (int) tag->length();

            bool closingTag = false;
            bool firstCharFound = false;

            // Read up to, but not including, the '>' of the upcoming tag.
            for( ;; )
            {
                if ( !in->good() )
                    return;

                int c = in->peek();
                if ( c <= 0 )
                {
                    TiXmlDocument* document = GetDocument();
                    if ( document )
                        document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
                    return;
                }

                if ( c == '>' )
                    break;

                *tag += (char) c;
                in->get();

                // Early out if we find the CDATA id.
                if ( c == '[' && tag->size() >= 9 )
                {
                    size_t len = tag->size();
                    const char* start = tag->c_str() + len - 9;
                    if ( strcmp( start, "<![CDATA[" ) == 0 ) {
                        assert( !closingTag );
                        break;
                    }
                }

                // The first character that is neither '<' nor white space
                // decides whether this is a closing tag.
                if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
                {
                    firstCharFound = true;
                    if ( c == '/' )
                        closingTag = true;
                }
            }
            // If it was a closing tag, then read in the closing '>' to clean up the input stream.
            // If it was not, the streaming will be done by the tag.
            if ( closingTag )
            {
                if ( !in->good() )
                    return;

                int c = in->get();
                if ( c <= 0 )
                {
                    TiXmlDocument* document = GetDocument();
                    if ( document )
                        document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
                    return;
                }
                assert( c == '>' );
                *tag += (char) c;

                // We are done, once we've found our closing tag.
                return;
            }
            else
            {
                // If not a closing tag, id it, and stream.
                const char* tagloc = tag->c_str() + tagIndex;
                TiXmlNode* node = Identify( tagloc, TIXML_DEFAULT_ENCODING );
                if ( !node )
                    return;
                // The node is used only to drive the streaming; it is
                // deleted again right after.
                node->StreamIn( in, tag );
                delete node;
                node = 0;

                // No return: go around from the beginning: text, closing tag, or node.
            }
        }
    }
}
1041 | #endif |
1042 | |
1043 | const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding ) |
1044 | { |
1045 | p = SkipWhiteSpace( p, encoding ); |
1046 | TiXmlDocument* document = GetDocument(); |
1047 | |
1048 | if ( !p || !*p ) |
1049 | { |
1050 | if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding ); |
1051 | return 0; |
1052 | } |
1053 | |
1054 | if ( data ) |
1055 | { |
1056 | data->Stamp( p, encoding ); |
1057 | location = data->Cursor(); |
1058 | } |
1059 | |
1060 | if ( *p != '<' ) |
1061 | { |
1062 | if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding ); |
1063 | return 0; |
1064 | } |
1065 | |
1066 | p = SkipWhiteSpace( p+1, encoding ); |
1067 | |
1068 | // Read the name. |
1069 | const char* pErr = p; |
1070 | |
1071 | p = ReadName( p, &value, encoding ); |
1072 | if ( !p || !*p ) |
1073 | { |
1074 | if ( document ) document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding ); |
1075 | return 0; |
1076 | } |
1077 | |
1078 | TIXML_STRING endTag ("</" ); |
1079 | endTag += value; |
1080 | |
1081 | // Check for and read attributes. Also look for an empty |
1082 | // tag or an end tag. |
1083 | while ( p && *p ) |
1084 | { |
1085 | pErr = p; |
1086 | p = SkipWhiteSpace( p, encoding ); |
1087 | if ( !p || !*p ) |
1088 | { |
1089 | if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding ); |
1090 | return 0; |
1091 | } |
1092 | if ( *p == '/' ) |
1093 | { |
1094 | ++p; |
1095 | // Empty tag. |
1096 | if ( *p != '>' ) |
1097 | { |
1098 | if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding ); |
1099 | return 0; |
1100 | } |
1101 | return (p+1); |
1102 | } |
1103 | else if ( *p == '>' ) |
1104 | { |
1105 | // Done with attributes (if there were any.) |
1106 | // Read the value -- which can include other |
1107 | // elements -- read the end tag, and return. |
1108 | ++p; |
1109 | p = ReadValue( p, data, encoding ); // Note this is an Element method, and will set the error if one happens. |
1110 | if ( !p || !*p ) { |
1111 | // We were looking for the end tag, but found nothing. |
1112 | // Fix for [ 1663758 ] Failure to report error on bad XML |
1113 | if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding ); |
1114 | return 0; |
1115 | } |
1116 | |
1117 | // We should find the end tag now |
1118 | // note that: |
1119 | // </foo > and |
1120 | // </foo> |
1121 | // are both valid end tags. |
1122 | if ( StringEqual( p, endTag.c_str(), false, encoding ) ) |
1123 | { |
1124 | p += endTag.length(); |
1125 | p = SkipWhiteSpace( p, encoding ); |
1126 | if ( p && *p && *p == '>' ) { |
1127 | ++p; |
1128 | return p; |
1129 | } |
1130 | if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding ); |
1131 | return 0; |
1132 | } |
1133 | else |
1134 | { |
1135 | if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding ); |
1136 | return 0; |
1137 | } |
1138 | } |
1139 | else |
1140 | { |
1141 | // Try to read an attribute: |
1142 | TiXmlAttribute* attrib = new TiXmlAttribute(); |
1143 | if ( !attrib ) |
1144 | { |
1145 | return 0; |
1146 | } |
1147 | |
1148 | attrib->SetDocument( document ); |
1149 | pErr = p; |
1150 | p = attrib->Parse( p, data, encoding ); |
1151 | |
1152 | if ( !p || !*p ) |
1153 | { |
1154 | if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding ); |
1155 | delete attrib; |
1156 | return 0; |
1157 | } |
1158 | |
1159 | // Handle the strange case of double attributes: |
1160 | #ifdef TIXML_USE_STL |
1161 | TiXmlAttribute* node = attributeSet.Find( attrib->NameTStr() ); |
1162 | #else |
1163 | TiXmlAttribute* node = attributeSet.Find( attrib->Name() ); |
1164 | #endif |
1165 | if ( node ) |
1166 | { |
1167 | if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding ); |
1168 | delete attrib; |
1169 | return 0; |
1170 | } |
1171 | |
1172 | attributeSet.Add( attrib ); |
1173 | } |
1174 | } |
1175 | return p; |
1176 | } |
1177 | |
1178 | |
1179 | const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding ) |
1180 | { |
1181 | TiXmlDocument* document = GetDocument(); |
1182 | |
1183 | // Read in text and elements in any order. |
1184 | const char* pWithWhiteSpace = p; |
1185 | p = SkipWhiteSpace( p, encoding ); |
1186 | |
1187 | while ( p && *p ) |
1188 | { |
1189 | if ( *p != '<' ) |
1190 | { |
1191 | // Take what we have, make a text element. |
1192 | TiXmlText* textNode = new TiXmlText( "" ); |
1193 | |
1194 | if ( !textNode ) |
1195 | { |
1196 | return 0; |
1197 | } |
1198 | |
1199 | if ( TiXmlBase::IsWhiteSpaceCondensed() ) |
1200 | { |
1201 | p = textNode->Parse( p, data, encoding ); |
1202 | } |
1203 | else |
1204 | { |
1205 | // Special case: we want to keep the white space |
1206 | // so that leading spaces aren't removed. |
1207 | p = textNode->Parse( pWithWhiteSpace, data, encoding ); |
1208 | } |
1209 | |
1210 | if ( !textNode->Blank() ) |
1211 | LinkEndChild( textNode ); |
1212 | else |
1213 | delete textNode; |
1214 | } |
1215 | else |
1216 | { |
1217 | // We hit a '<' |
1218 | // Have we hit a new element or an end tag? This could also be |
1219 | // a TiXmlText in the "CDATA" style. |
1220 | if ( StringEqual( p, "</" , false, encoding ) ) |
1221 | { |
1222 | return p; |
1223 | } |
1224 | else |
1225 | { |
1226 | TiXmlNode* node = Identify( p, encoding ); |
1227 | if ( node ) |
1228 | { |
1229 | p = node->Parse( p, data, encoding ); |
1230 | LinkEndChild( node ); |
1231 | } |
1232 | else |
1233 | { |
1234 | return 0; |
1235 | } |
1236 | } |
1237 | } |
1238 | pWithWhiteSpace = p; |
1239 | p = SkipWhiteSpace( p, encoding ); |
1240 | } |
1241 | |
1242 | if ( !p ) |
1243 | { |
1244 | if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding ); |
1245 | } |
1246 | return p; |
1247 | } |
1248 | |
1249 | |
1250 | #ifdef TIXML_USE_STL |
1251 | void TiXmlUnknown::StreamIn( std::istream * in, TIXML_STRING * tag ) |
1252 | { |
1253 | while ( in->good() ) |
1254 | { |
1255 | int c = in->get(); |
1256 | if ( c <= 0 ) |
1257 | { |
1258 | TiXmlDocument* document = GetDocument(); |
1259 | if ( document ) |
1260 | document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); |
1261 | return; |
1262 | } |
1263 | (*tag) += (char) c; |
1264 | |
1265 | if ( c == '>' ) |
1266 | { |
1267 | // All is well. |
1268 | return; |
1269 | } |
1270 | } |
1271 | } |
1272 | #endif |
1273 | |
1274 | |
1275 | const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding ) |
1276 | { |
1277 | TiXmlDocument* document = GetDocument(); |
1278 | p = SkipWhiteSpace( p, encoding ); |
1279 | |
1280 | if ( data ) |
1281 | { |
1282 | data->Stamp( p, encoding ); |
1283 | location = data->Cursor(); |
1284 | } |
1285 | if ( !p || !*p || *p != '<' ) |
1286 | { |
1287 | if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding ); |
1288 | return 0; |
1289 | } |
1290 | ++p; |
1291 | value = "" ; |
1292 | |
1293 | while ( p && *p && *p != '>' ) |
1294 | { |
1295 | value += *p; |
1296 | ++p; |
1297 | } |
1298 | |
1299 | if ( !p ) |
1300 | { |
1301 | if ( document ) |
1302 | document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding ); |
1303 | } |
1304 | if ( p && *p == '>' ) |
1305 | return p+1; |
1306 | return p; |
1307 | } |
1308 | |
1309 | #ifdef TIXML_USE_STL |
1310 | void TiXmlComment::( std::istream * in, TIXML_STRING * tag ) |
1311 | { |
1312 | while ( in->good() ) |
1313 | { |
1314 | int c = in->get(); |
1315 | if ( c <= 0 ) |
1316 | { |
1317 | TiXmlDocument* document = GetDocument(); |
1318 | if ( document ) |
1319 | document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); |
1320 | return; |
1321 | } |
1322 | |
1323 | (*tag) += (char) c; |
1324 | |
1325 | if ( c == '>' |
1326 | && tag->at( tag->length() - 2 ) == '-' |
1327 | && tag->at( tag->length() - 3 ) == '-' ) |
1328 | { |
1329 | // All is well. |
1330 | return; |
1331 | } |
1332 | } |
1333 | } |
1334 | #endif |
1335 | |
1336 | |
1337 | const char* TiXmlComment::( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding ) |
1338 | { |
1339 | TiXmlDocument* document = GetDocument(); |
1340 | value = "" ; |
1341 | |
1342 | p = SkipWhiteSpace( p, encoding ); |
1343 | |
1344 | if ( data ) |
1345 | { |
1346 | data->Stamp( p, encoding ); |
1347 | location = data->Cursor(); |
1348 | } |
1349 | const char* startTag = "<!--" ; |
1350 | const char* endTag = "-->" ; |
1351 | |
1352 | if ( !StringEqual( p, startTag, false, encoding ) ) |
1353 | { |
1354 | if ( document ) |
1355 | document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding ); |
1356 | return 0; |
1357 | } |
1358 | p += strlen( startTag ); |
1359 | |
1360 | // [ 1475201 ] TinyXML parses entities in comments |
1361 | // Oops - ReadText doesn't work, because we don't want to parse the entities. |
1362 | // p = ReadText( p, &value, false, endTag, false, encoding ); |
1363 | // |
1364 | // from the XML spec: |
1365 | /* |
1366 | [Definition: Comments may appear anywhere in a document outside other markup; in addition, |
1367 | they may appear within the document type declaration at places allowed by the grammar. |
1368 | They are not part of the document's character data; an XML processor MAY, but need not, |
1369 | make it possible for an application to retrieve the text of comments. For compatibility, |
1370 | the string "--" (double-hyphen) MUST NOT occur within comments.] Parameter entity |
1371 | references MUST NOT be recognized within comments. |
1372 | |
1373 | An example of a comment: |
1374 | |
1375 | <!-- declarations for <head> & <body> --> |
1376 | */ |
1377 | |
1378 | value = "" ; |
1379 | // Keep all the white space. |
1380 | while ( p && *p && !StringEqual( p, endTag, false, encoding ) ) |
1381 | { |
1382 | value.append( p, 1 ); |
1383 | ++p; |
1384 | } |
1385 | if ( p && *p ) |
1386 | p += strlen( endTag ); |
1387 | |
1388 | return p; |
1389 | } |
1390 | |
1391 | |
1392 | const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding ) |
1393 | { |
1394 | p = SkipWhiteSpace( p, encoding ); |
1395 | if ( !p || !*p ) return 0; |
1396 | |
1397 | if ( data ) |
1398 | { |
1399 | data->Stamp( p, encoding ); |
1400 | location = data->Cursor(); |
1401 | } |
1402 | // Read the name, the '=' and the value. |
1403 | const char* pErr = p; |
1404 | p = ReadName( p, &name, encoding ); |
1405 | if ( !p || !*p ) |
1406 | { |
1407 | if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding ); |
1408 | return 0; |
1409 | } |
1410 | p = SkipWhiteSpace( p, encoding ); |
1411 | if ( !p || !*p || *p != '=' ) |
1412 | { |
1413 | if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding ); |
1414 | return 0; |
1415 | } |
1416 | |
1417 | ++p; // skip '=' |
1418 | p = SkipWhiteSpace( p, encoding ); |
1419 | if ( !p || !*p ) |
1420 | { |
1421 | if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding ); |
1422 | return 0; |
1423 | } |
1424 | |
1425 | const char* end; |
1426 | const char SINGLE_QUOTE = '\''; |
1427 | const char DOUBLE_QUOTE = '\"'; |
1428 | |
1429 | if ( *p == SINGLE_QUOTE ) |
1430 | { |
1431 | ++p; |
1432 | end = "\'" ; // single quote in string |
1433 | p = ReadText( p, &value, false, end, false, encoding ); |
1434 | } |
1435 | else if ( *p == DOUBLE_QUOTE ) |
1436 | { |
1437 | ++p; |
1438 | end = "\"" ; // double quote in string |
1439 | p = ReadText( p, &value, false, end, false, encoding ); |
1440 | } |
1441 | else |
1442 | { |
1443 | // All attribute values should be in single or double quotes. |
1444 | // But this is such a common error that the parser will try |
1445 | // its best, even without them. |
1446 | value = "" ; |
1447 | while ( p && *p // existence |
1448 | && !IsWhiteSpace( *p ) // whitespace |
1449 | && *p != '/' && *p != '>' ) // tag end |
1450 | { |
1451 | if ( *p == SINGLE_QUOTE || *p == DOUBLE_QUOTE ) { |
1452 | // [ 1451649 ] Attribute values with trailing quotes not handled correctly |
1453 | // We did not have an opening quote but seem to have a |
1454 | // closing one. Give up and throw an error. |
1455 | if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding ); |
1456 | return 0; |
1457 | } |
1458 | value += *p; |
1459 | ++p; |
1460 | } |
1461 | } |
1462 | return p; |
1463 | } |
1464 | |
1465 | #ifdef TIXML_USE_STL |
1466 | void TiXmlText::StreamIn( std::istream * in, TIXML_STRING * tag ) |
1467 | { |
1468 | while ( in->good() ) |
1469 | { |
1470 | int c = in->peek(); |
1471 | if ( !cdata && (c == '<' ) ) |
1472 | { |
1473 | return; |
1474 | } |
1475 | if ( c <= 0 ) |
1476 | { |
1477 | TiXmlDocument* document = GetDocument(); |
1478 | if ( document ) |
1479 | document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); |
1480 | return; |
1481 | } |
1482 | |
1483 | (*tag) += (char) c; |
1484 | in->get(); // "commits" the peek made above |
1485 | |
1486 | if ( cdata && c == '>' && tag->size() >= 3 ) { |
1487 | size_t len = tag->size(); |
1488 | if ( (*tag)[len-2] == ']' && (*tag)[len-3] == ']' ) { |
1489 | // terminator of cdata. |
1490 | return; |
1491 | } |
1492 | } |
1493 | } |
1494 | } |
1495 | #endif |
1496 | |
1497 | const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding ) |
1498 | { |
1499 | value = "" ; |
1500 | TiXmlDocument* document = GetDocument(); |
1501 | |
1502 | if ( data ) |
1503 | { |
1504 | data->Stamp( p, encoding ); |
1505 | location = data->Cursor(); |
1506 | } |
1507 | |
1508 | const char* const startTag = "<![CDATA[" ; |
1509 | const char* const endTag = "]]>" ; |
1510 | |
1511 | if ( cdata || StringEqual( p, startTag, false, encoding ) ) |
1512 | { |
1513 | cdata = true; |
1514 | |
1515 | if ( !StringEqual( p, startTag, false, encoding ) ) |
1516 | { |
1517 | if ( document ) |
1518 | document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding ); |
1519 | return 0; |
1520 | } |
1521 | p += strlen( startTag ); |
1522 | |
1523 | // Keep all the white space, ignore the encoding, etc. |
1524 | while ( p && *p |
1525 | && !StringEqual( p, endTag, false, encoding ) |
1526 | ) |
1527 | { |
1528 | value += *p; |
1529 | ++p; |
1530 | } |
1531 | |
1532 | TIXML_STRING dummy; |
1533 | p = ReadText( p, &dummy, false, endTag, false, encoding ); |
1534 | return p; |
1535 | } |
1536 | else |
1537 | { |
1538 | bool ignoreWhite = true; |
1539 | |
1540 | const char* end = "<" ; |
1541 | p = ReadText( p, &value, ignoreWhite, end, false, encoding ); |
1542 | if ( p && *p ) |
1543 | return p-1; // don't truncate the '<' |
1544 | return 0; |
1545 | } |
1546 | } |
1547 | |
1548 | #ifdef TIXML_USE_STL |
1549 | void TiXmlDeclaration::StreamIn( std::istream * in, TIXML_STRING * tag ) |
1550 | { |
1551 | while ( in->good() ) |
1552 | { |
1553 | int c = in->get(); |
1554 | if ( c <= 0 ) |
1555 | { |
1556 | TiXmlDocument* document = GetDocument(); |
1557 | if ( document ) |
1558 | document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); |
1559 | return; |
1560 | } |
1561 | (*tag) += (char) c; |
1562 | |
1563 | if ( c == '>' ) |
1564 | { |
1565 | // All is well. |
1566 | return; |
1567 | } |
1568 | } |
1569 | } |
1570 | #endif |
1571 | |
1572 | const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding ) |
1573 | { |
1574 | p = SkipWhiteSpace( p, _encoding ); |
1575 | // Find the beginning, find the end, and look for |
1576 | // the stuff in-between. |
1577 | TiXmlDocument* document = GetDocument(); |
1578 | if ( !p || !*p || !StringEqual( p, "<?xml" , true, _encoding ) ) |
1579 | { |
1580 | if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding ); |
1581 | return 0; |
1582 | } |
1583 | if ( data ) |
1584 | { |
1585 | data->Stamp( p, _encoding ); |
1586 | location = data->Cursor(); |
1587 | } |
1588 | p += 5; |
1589 | |
1590 | version = "" ; |
1591 | encoding = "" ; |
1592 | standalone = "" ; |
1593 | |
1594 | while ( p && *p ) |
1595 | { |
1596 | if ( *p == '>' ) |
1597 | { |
1598 | ++p; |
1599 | return p; |
1600 | } |
1601 | |
1602 | p = SkipWhiteSpace( p, _encoding ); |
1603 | if ( StringEqual( p, "version" , true, _encoding ) ) |
1604 | { |
1605 | TiXmlAttribute attrib; |
1606 | p = attrib.Parse( p, data, _encoding ); |
1607 | version = attrib.Value(); |
1608 | } |
1609 | else if ( StringEqual( p, "encoding" , true, _encoding ) ) |
1610 | { |
1611 | TiXmlAttribute attrib; |
1612 | p = attrib.Parse( p, data, _encoding ); |
1613 | encoding = attrib.Value(); |
1614 | } |
1615 | else if ( StringEqual( p, "standalone" , true, _encoding ) ) |
1616 | { |
1617 | TiXmlAttribute attrib; |
1618 | p = attrib.Parse( p, data, _encoding ); |
1619 | standalone = attrib.Value(); |
1620 | } |
1621 | else |
1622 | { |
1623 | // Read over whatever it is. |
1624 | while( p && *p && *p != '>' && !IsWhiteSpace( *p ) ) |
1625 | ++p; |
1626 | } |
1627 | } |
1628 | return 0; |
1629 | } |
1630 | |
1631 | bool TiXmlText::Blank() const |
1632 | { |
1633 | for ( unsigned i=0; i<value.length(); i++ ) |
1634 | if ( !IsWhiteSpace( value[i] ) ) |
1635 | return false; |
1636 | return true; |
1637 | } |
1638 | |