123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635 |
- /*
- www.sourceforge.net/projects/tinyxml
- Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)
- This software is provided 'as-is', without any express or implied
- warranty. In no event will the authors be held liable for any
- damages arising from the use of this software.
- Permission is granted to anyone to use this software for any
- purpose, including commercial applications, and to alter it and
- redistribute it freely, subject to the following restrictions:
- 1. The origin of this software must not be misrepresented; you must
- not claim that you wrote the original software. If you use this
- software in a product, an acknowledgment in the product documentation
- would be appreciated but is not required.
- 2. Altered source versions must be plainly marked as such, and
- must not be misrepresented as being the original software.
- 3. This notice may not be removed or altered from any source
- distribution.
- */
- #include <ctype.h>
- #include <stddef.h>
- #include "tinyxml.h"
- //#define DEBUG_PARSER
- #if defined( DEBUG_PARSER )
- # if defined( DEBUG ) && defined( _MSC_VER )
- # include <windows.h>
- # define TIXML_LOG OutputDebugString
- # else
- # define TIXML_LOG printf
- # endif
- #endif
- // Note tha "PutString" hardcodes the same list. This
- // is less flexible than it appears. Changing the entries
- // or order will break putstring.
- TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] =
- {
- { "&", 5, '&' },
- { "<", 4, '<' },
- { ">", 4, '>' },
- { """, 6, '\"' },
- { "'", 6, '\'' }
- };
- // Bunch of unicode info at:
- // http://www.unicode.org/faq/utf_bom.html
- // Including the basic of this table, which determines the #bytes in the
- // sequence from the lead byte. 1 placed for invalid sequences --
- // although the result will be junk, pass it through as much as possible.
- // Beware of the non-characters in UTF-8:
- // ef bb bf (Microsoft "lead bytes")
- // ef bf be
- // ef bf bf
- const unsigned char TIXML_UTF_LEAD_0 = 0xefU;
- const unsigned char TIXML_UTF_LEAD_1 = 0xbbU;
- const unsigned char TIXML_UTF_LEAD_2 = 0xbfU;
- const int TiXmlBase::utf8ByteTable[256] =
- {
- // 0 1 2 3 4 5 6 7 8 9 a b c d e f
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70 End of ASCII range
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80 0x80 to 0xc1 invalid
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xa0
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xb0
- 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xc0 0xc2 to 0xdf 2 byte
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xd0
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xe0 0xe0 to 0xef 3 byte
- 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
- };
- void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
- {
- const unsigned long BYTE_MASK = 0xBF;
- const unsigned long BYTE_MARK = 0x80;
- const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
- if (input < 0x80)
- *length = 1;
- else if ( input < 0x800 )
- *length = 2;
- else if ( input < 0x10000 )
- *length = 3;
- else if ( input < 0x200000 )
- *length = 4;
- else
- { *length = 0; return; } // This code won't covert this correctly anyway.
- output += *length;
- // Scary scary fall throughs.
- switch (*length)
- {
- case 4:
- --output;
- *output = (char)((input | BYTE_MARK) & BYTE_MASK);
- input >>= 6;
- case 3:
- --output;
- *output = (char)((input | BYTE_MARK) & BYTE_MASK);
- input >>= 6;
- case 2:
- --output;
- *output = (char)((input | BYTE_MARK) & BYTE_MASK);
- input >>= 6;
- case 1:
- --output;
- *output = (char)(input | FIRST_BYTE_MARK[*length]);
- }
- }
- /*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
- {
- // This will only work for low-ascii, everything else is assumed to be a valid
- // letter. I'm not sure this is the best approach, but it is quite tricky trying
- // to figure out alhabetical vs. not across encoding. So take a very
- // conservative approach.
- // if ( encoding == TIXML_ENCODING_UTF8 )
- // {
- if ( anyByte < 127 )
- return isalpha( anyByte );
- else
- return 1; // What else to do? The unicode set is huge...get the english ones right.
- // }
- // else
- // {
- // return isalpha( anyByte );
- // }
- }
- /*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
- {
- // This will only work for low-ascii, everything else is assumed to be a valid
- // letter. I'm not sure this is the best approach, but it is quite tricky trying
- // to figure out alhabetical vs. not across encoding. So take a very
- // conservative approach.
- // if ( encoding == TIXML_ENCODING_UTF8 )
- // {
- if ( anyByte < 127 )
- return isalnum( anyByte );
- else
- return 1; // What else to do? The unicode set is huge...get the english ones right.
- // }
- // else
- // {
- // return isalnum( anyByte );
- // }
- }
- class TiXmlParsingData
- {
- friend class TiXmlDocument;
- public:
- void Stamp( const char* now, TiXmlEncoding encoding );
- const TiXmlCursor& Cursor() { return cursor; }
- private:
- // Only used by the document!
- TiXmlParsingData( const char* start, int _tabsize, int row, int col )
- {
- assert( start );
- stamp = start;
- tabsize = _tabsize;
- cursor.row = row;
- cursor.col = col;
- }
- TiXmlCursor cursor;
- const char* stamp;
- int tabsize;
- };
- void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
- {
- assert( now );
- // Do nothing if the tabsize is 0.
- if ( tabsize < 1 )
- {
- return;
- }
- // Get the current row, column.
- int row = cursor.row;
- int col = cursor.col;
- const char* p = stamp;
- assert( p );
- while ( p < now )
- {
- // Treat p as unsigned, so we have a happy compiler.
- const unsigned char* pU = (const unsigned char*)p;
- // Code contributed by Fletcher Dunn: (modified by lee)
- switch (*pU) {
- case 0:
- // We *should* never get here, but in case we do, don't
- // advance past the terminating null character, ever
- return;
- case '\r':
- // bump down to the next line
- ++row;
- col = 0;
- // Eat the character
- ++p;
- // Check for \r\n sequence, and treat this as a single character
- if (*p == '\n') {
- ++p;
- }
- break;
- case '\n':
- // bump down to the next line
- ++row;
- col = 0;
- // Eat the character
- ++p;
- // Check for \n\r sequence, and treat this as a single
- // character. (Yes, this bizarre thing does occur still
- // on some arcane platforms...)
- if (*p == '\r') {
- ++p;
- }
- break;
- case '\t':
- // Eat the character
- ++p;
- // Skip to next tab stop
- col = (col / tabsize + 1) * tabsize;
- break;
- case TIXML_UTF_LEAD_0:
- if ( encoding == TIXML_ENCODING_UTF8 )
- {
- if ( *(p+1) && *(p+2) )
- {
- // In these cases, don't advance the column. These are
- // 0-width spaces.
- if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 )
- p += 3;
- else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU )
- p += 3;
- else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU )
- p += 3;
- else
- { p +=3; ++col; } // A normal character.
- }
- }
- else
- {
- ++p;
- ++col;
- }
- break;
- default:
- if ( encoding == TIXML_ENCODING_UTF8 )
- {
- // Eat the 1 to 4 byte utf8 character.
- int step = TiXmlBase::utf8ByteTable[*((const unsigned char*)p)];
- if ( step == 0 )
- step = 1; // Error case from bad encoding, but handle gracefully.
- p += step;
- // Just advance one column, of course.
- ++col;
- }
- else
- {
- ++p;
- ++col;
- }
- break;
- }
- }
- cursor.row = row;
- cursor.col = col;
- assert( cursor.row >= -1 );
- assert( cursor.col >= -1 );
- stamp = p;
- assert( stamp );
- }
- const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
- {
- if ( !p || !*p )
- {
- return 0;
- }
- if ( encoding == TIXML_ENCODING_UTF8 )
- {
- while ( *p )
- {
- const unsigned char* pU = (const unsigned char*)p;
-
- // Skip the stupid Microsoft UTF-8 Byte order marks
- if ( *(pU+0)==TIXML_UTF_LEAD_0
- && *(pU+1)==TIXML_UTF_LEAD_1
- && *(pU+2)==TIXML_UTF_LEAD_2 )
- {
- p += 3;
- continue;
- }
- else if(*(pU+0)==TIXML_UTF_LEAD_0
- && *(pU+1)==0xbfU
- && *(pU+2)==0xbeU )
- {
- p += 3;
- continue;
- }
- else if(*(pU+0)==TIXML_UTF_LEAD_0
- && *(pU+1)==0xbfU
- && *(pU+2)==0xbfU )
- {
- p += 3;
- continue;
- }
- if ( IsWhiteSpace( *p ) ) // Still using old rules for white space.
- ++p;
- else
- break;
- }
- }
- else
- {
- while ( *p && IsWhiteSpace( *p ) )
- ++p;
- }
- return p;
- }
- #ifdef TIXML_USE_STL
- /*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream * in, TIXML_STRING * tag )
- {
- for( ;; )
- {
- if ( !in->good() ) return false;
- int c = in->peek();
- // At this scope, we can't get to a document. So fail silently.
- if ( !IsWhiteSpace( c ) || c <= 0 )
- return true;
- *tag += (char) in->get();
- }
- }
- /*static*/ bool TiXmlBase::StreamTo( std::istream * in, int character, TIXML_STRING * tag )
- {
- //assert( character > 0 && character < 128 ); // else it won't work in utf-8
- while ( in->good() )
- {
- int c = in->peek();
- if ( c == character )
- return true;
- if ( c <= 0 ) // Silent failure: can't get document at this scope
- return false;
- in->get();
- *tag += (char) c;
- }
- return false;
- }
- #endif
- // One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The
- // "assign" optimization removes over 10% of the execution time.
- //
- const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
- {
- // Oddly, not supported on some comilers,
- //name->clear();
- // So use this:
- *name = "";
- assert( p );
- // Names start with letters or underscores.
- // Of course, in unicode, tinyxml has no idea what a letter *is*. The
- // algorithm is generous.
- //
- // After that, they can be letters, underscores, numbers,
- // hyphens, or colons. (Colons are valid ony for namespaces,
- // but tinyxml can't tell namespaces from names.)
- if ( p && *p
- && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) )
- {
- const char* start = p;
- while( p && *p
- && ( IsAlphaNum( (unsigned char ) *p, encoding )
- || *p == '_'
- || *p == '-'
- || *p == '.'
- || *p == ':' ) )
- {
- //(*name) += *p; // expensive
- ++p;
- }
- if ( p-start > 0 ) {
- name->assign( start, p-start );
- }
- return p;
- }
- return 0;
- }
- const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding )
- {
- // Presume an entity, and pull it out.
- TIXML_STRING ent;
- int i;
- *length = 0;
- if ( *(p+1) && *(p+1) == '#' && *(p+2) )
- {
- unsigned long ucs = 0;
- ptrdiff_t delta = 0;
- unsigned mult = 1;
- if ( *(p+2) == 'x' )
- {
- // Hexadecimal.
- if ( !*(p+3) ) return 0;
- const char* q = p+3;
- q = strchr( q, ';' );
- if ( !q || !*q ) return 0;
- delta = q-p;
- --q;
- while ( *q != 'x' )
- {
- if ( *q >= '0' && *q <= '9' )
- ucs += mult * (*q - '0');
- else if ( *q >= 'a' && *q <= 'f' )
- ucs += mult * (*q - 'a' + 10);
- else if ( *q >= 'A' && *q <= 'F' )
- ucs += mult * (*q - 'A' + 10 );
- else
- return 0;
- mult *= 16;
- --q;
- }
- }
- else
- {
- // Decimal.
- if ( !*(p+2) ) return 0;
- const char* q = p+2;
- q = strchr( q, ';' );
- if ( !q || !*q ) return 0;
- delta = q-p;
- --q;
- while ( *q != '#' )
- {
- if ( *q >= '0' && *q <= '9' )
- ucs += mult * (*q - '0');
- else
- return 0;
- mult *= 10;
- --q;
- }
- }
- if ( encoding == TIXML_ENCODING_UTF8 )
- {
- // convert the UCS to UTF-8
- ConvertUTF32ToUTF8( ucs, value, length );
- }
- else
- {
- *value = (char)ucs;
- *length = 1;
- }
- return p + delta + 1;
- }
- // Now try to match it.
- for( i=0; i<NUM_ENTITY; ++i )
- {
- if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
- {
- assert( strlen( entity[i].str ) == entity[i].strLength );
- *value = entity[i].chr;
- *length = 1;
- return ( p + entity[i].strLength );
- }
- }
- // So it wasn't an entity, its unrecognized, or something like that.
- *value = *p; // Don't put back the last one, since we return it!
- //*length = 1; // Leave unrecognized entities - this doesn't really work.
- // Just writes strange XML.
- return p+1;
- }
- bool TiXmlBase::StringEqual( const char* p,
- const char* tag,
- bool ignoreCase,
- TiXmlEncoding encoding )
- {
- assert( p );
- assert( tag );
- if ( !p || !*p )
- {
- assert( 0 );
- return false;
- }
- const char* q = p;
- if ( ignoreCase )
- {
- while ( *q && *tag && ToLower( *q, encoding ) == ToLower( *tag, encoding ) )
- {
- ++q;
- ++tag;
- }
- if ( *tag == 0 )
- return true;
- }
- else
- {
- while ( *q && *tag && *q == *tag )
- {
- ++q;
- ++tag;
- }
- if ( *tag == 0 ) // Have we found the end of the tag, and everything equal?
- return true;
- }
- return false;
- }
- const char* TiXmlBase::ReadText( const char* p,
- TIXML_STRING * text,
- bool trimWhiteSpace,
- const char* endTag,
- bool caseInsensitive,
- TiXmlEncoding encoding )
- {
- *text = "";
- if ( !trimWhiteSpace // certain tags always keep whitespace
- || !condenseWhiteSpace ) // if true, whitespace is always kept
- {
- // Keep all the white space.
- while ( p && *p
- && !StringEqual( p, endTag, caseInsensitive, encoding )
- )
- {
- int len;
- char cArr[4] = { 0, 0, 0, 0 };
- p = GetChar( p, cArr, &len, encoding );
- text->append( cArr, len );
- }
- }
- else
- {
- bool whitespace = false;
- // Remove leading white space:
- p = SkipWhiteSpace( p, encoding );
- while ( p && *p
- && !StringEqual( p, endTag, caseInsensitive, encoding ) )
- {
- if ( *p == '\r' || *p == '\n' )
- {
- whitespace = true;
- ++p;
- }
- else if ( IsWhiteSpace( *p ) )
- {
- whitespace = true;
- ++p;
- }
- else
- {
- // If we've found whitespace, add it before the
- // new character. Any whitespace just becomes a space.
- if ( whitespace )
- {
- (*text) += ' ';
- whitespace = false;
- }
- int len;
- char cArr[4] = { 0, 0, 0, 0 };
- p = GetChar( p, cArr, &len, encoding );
- if ( len == 1 )
- (*text) += cArr[0]; // more efficient
- else
- text->append( cArr, len );
- }
- }
- }
- if ( p && *p )
- p += strlen( endTag );
- return p;
- }
- #ifdef TIXML_USE_STL
- void TiXmlDocument::StreamIn( std::istream * in, TIXML_STRING * tag )
- {
- // The basic issue with a document is that we don't know what we're
- // streaming. Read something presumed to be a tag (and hope), then
- // identify it, and call the appropriate stream method on the tag.
- //
- // This "pre-streaming" will never read the closing ">" so the
- // sub-tag can orient itself.
- if ( !StreamTo( in, '<', tag ) )
- {
- SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
- return;
- }
- while ( in->good() )
- {
- int tagIndex = (int) tag->length();
- while ( in->good() && in->peek() != '>' )
- {
- int c = in->get();
- if ( c <= 0 )
- {
- SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
- break;
- }
- (*tag) += (char) c;
- }
- if ( in->good() )
- {
- // We now have something we presume to be a node of
- // some sort. Identify it, and call the node to
- // continue streaming.
- TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );
- if ( node )
- {
- node->StreamIn( in, tag );
- bool isElement = node->ToElement() != 0;
- delete node;
- node = 0;
- // If this is the root element, we're done. Parsing will be
- // done by the >> operator.
- if ( isElement )
- {
- return;
- }
- }
- else
- {
- SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
- return;
- }
- }
- }
- // We should have returned sooner.
- SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
- }
- #endif
- const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
- {
- ClearError();
- // Parse away, at the document level. Since a document
- // contains nothing but other tags, most of what happens
- // here is skipping white space.
- if ( !p || !*p )
- {
- SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
- return 0;
- }
- // Note that, for a document, this needs to come
- // before the while space skip, so that parsing
- // starts from the pointer we are given.
- location.Clear();
- if ( prevData )
- {
- location.row = prevData->cursor.row;
- location.col = prevData->cursor.col;
- }
- else
- {
- location.row = 0;
- location.col = 0;
- }
- TiXmlParsingData data( p, TabSize(), location.row, location.col );
- location = data.Cursor();
- if ( encoding == TIXML_ENCODING_UNKNOWN )
- {
- // Check for the Microsoft UTF-8 lead bytes.
- const unsigned char* pU = (const unsigned char*)p;
- if ( *(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0
- && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1
- && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 )
- {
- encoding = TIXML_ENCODING_UTF8;
- useMicrosoftBOM = true;
- }
- }
- p = SkipWhiteSpace( p, encoding );
- if ( !p )
- {
- SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
- return 0;
- }
- while ( p && *p )
- {
- TiXmlNode* node = Identify( p, encoding );
- if ( node )
- {
- p = node->Parse( p, &data, encoding );
- LinkEndChild( node );
- }
- else
- {
- break;
- }
- // Did we get encoding info?
- if ( encoding == TIXML_ENCODING_UNKNOWN
- && node->ToDeclaration() )
- {
- TiXmlDeclaration* dec = node->ToDeclaration();
- const char* enc = dec->Encoding();
- assert( enc );
- if ( *enc == 0 )
- encoding = TIXML_ENCODING_UTF8;
- else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
- encoding = TIXML_ENCODING_UTF8;
- else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
- encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice
- else
- encoding = TIXML_ENCODING_LEGACY;
- }
- p = SkipWhiteSpace( p, encoding );
- }
- // Was this empty?
- if ( !firstChild ) {
- SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
- return 0;
- }
- // All is well.
- return p;
- }
- void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding )
- {
- // The first error in a chain is more accurate - don't set again!
- if ( error )
- return;
- assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );
- error = true;
- errorId = err;
- errorDesc = errorString[ errorId ];
- errorLocation.Clear();
- if ( pError && data )
- {
- data->Stamp( pError, encoding );
- errorLocation = data->Cursor();
- }
- }
- TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
- {
- TiXmlNode* returnNode = 0;
- p = SkipWhiteSpace( p, encoding );
- if( !p || !*p || *p != '<' )
- {
- return 0;
- }
- p = SkipWhiteSpace( p, encoding );
- if ( !p || !*p )
- {
- return 0;
- }
- // What is this thing?
- // - Elements start with a letter or underscore, but xml is reserved.
- // - Comments: <!--
- // - Decleration: <?xml
- // - Everthing else is unknown to tinyxml.
- //
- const char* xmlHeader = { "<?xml" };
- const char* commentHeader = { "<!--" };
- const char* dtdHeader = { "<!" };
- const char* cdataHeader = { "<![CDATA[" };
- if ( StringEqual( p, xmlHeader, true, encoding ) )
- {
- #ifdef DEBUG_PARSER
- TIXML_LOG( "XML parsing Declaration\n" );
- #endif
- returnNode = new TiXmlDeclaration();
- }
- else if ( StringEqual( p, commentHeader, false, encoding ) )
- {
- #ifdef DEBUG_PARSER
- TIXML_LOG( "XML parsing Comment\n" );
- #endif
- returnNode = new TiXmlComment();
- }
- else if ( StringEqual( p, cdataHeader, false, encoding ) )
- {
- #ifdef DEBUG_PARSER
- TIXML_LOG( "XML parsing CDATA\n" );
- #endif
- TiXmlText* text = new TiXmlText( "" );
- text->SetCDATA( true );
- returnNode = text;
- }
- else if ( StringEqual( p, dtdHeader, false, encoding ) )
- {
- #ifdef DEBUG_PARSER
- TIXML_LOG( "XML parsing Unknown(1)\n" );
- #endif
- returnNode = new TiXmlUnknown();
- }
- else if ( IsAlpha( *(p+1), encoding )
- || *(p+1) == '_' )
- {
- #ifdef DEBUG_PARSER
- TIXML_LOG( "XML parsing Element\n" );
- #endif
- returnNode = new TiXmlElement( "" );
- }
- else
- {
- #ifdef DEBUG_PARSER
- TIXML_LOG( "XML parsing Unknown(2)\n" );
- #endif
- returnNode = new TiXmlUnknown();
- }
- if ( returnNode )
- {
- // Set the parent, so it can report errors
- returnNode->parent = this;
- }
- return returnNode;
- }
- #ifdef TIXML_USE_STL
- void TiXmlElement::StreamIn (std::istream * in, TIXML_STRING * tag)
- {
- // We're called with some amount of pre-parsing. That is, some of "this"
- // element is in "tag". Go ahead and stream to the closing ">"
- while( in->good() )
- {
- int c = in->get();
- if ( c <= 0 )
- {
- TiXmlDocument* document = GetDocument();
- if ( document )
- document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
- return;
- }
- (*tag) += (char) c ;
-
- if ( c == '>' )
- break;
- }
- if ( tag->length() < 3 ) return;
- // Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
- // If not, identify and stream.
- if ( tag->at( tag->length() - 1 ) == '>'
- && tag->at( tag->length() - 2 ) == '/' )
- {
- // All good!
- return;
- }
- else if ( tag->at( tag->length() - 1 ) == '>' )
- {
- // There is more. Could be:
- // text
- // cdata text (which looks like another node)
- // closing tag
- // another node.
- for ( ;; )
- {
- StreamWhiteSpace( in, tag );
- // Do we have text?
- if ( in->good() && in->peek() != '<' )
- {
- // Yep, text.
- TiXmlText text( "" );
- text.StreamIn( in, tag );
- // What follows text is a closing tag or another node.
- // Go around again and figure it out.
- continue;
- }
- // We now have either a closing tag...or another node.
- // We should be at a "<", regardless.
- if ( !in->good() ) return;
- assert( in->peek() == '<' );
- int tagIndex = (int) tag->length();
- bool closingTag = false;
- bool firstCharFound = false;
- for( ;; )
- {
- if ( !in->good() )
- return;
- int c = in->peek();
- if ( c <= 0 )
- {
- TiXmlDocument* document = GetDocument();
- if ( document )
- document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
- return;
- }
-
- if ( c == '>' )
- break;
- *tag += (char) c;
- in->get();
- // Early out if we find the CDATA id.
- if ( c == '[' && tag->size() >= 9 )
- {
- size_t len = tag->size();
- const char* start = tag->c_str() + len - 9;
- if ( strcmp( start, "<![CDATA[" ) == 0 ) {
- assert( !closingTag );
- break;
- }
- }
- if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
- {
- firstCharFound = true;
- if ( c == '/' )
- closingTag = true;
- }
- }
- // If it was a closing tag, then read in the closing '>' to clean up the input stream.
- // If it was not, the streaming will be done by the tag.
- if ( closingTag )
- {
- if ( !in->good() )
- return;
- int c = in->get();
- if ( c <= 0 )
- {
- TiXmlDocument* document = GetDocument();
- if ( document )
- document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
- return;
- }
- assert( c == '>' );
- *tag += (char) c;
- // We are done, once we've found our closing tag.
- return;
- }
- else
- {
- // If not a closing tag, id it, and stream.
- const char* tagloc = tag->c_str() + tagIndex;
- TiXmlNode* node = Identify( tagloc, TIXML_DEFAULT_ENCODING );
- if ( !node )
- return;
- node->StreamIn( in, tag );
- delete node;
- node = 0;
- // No return: go around from the beginning: text, closing tag, or node.
- }
- }
- }
- }
- #endif
- const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
- {
- p = SkipWhiteSpace( p, encoding );
- TiXmlDocument* document = GetDocument();
- if ( !p || !*p )
- {
- if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding );
- return 0;
- }
- if ( data )
- {
- data->Stamp( p, encoding );
- location = data->Cursor();
- }
- if ( *p != '<' )
- {
- if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding );
- return 0;
- }
- p = SkipWhiteSpace( p+1, encoding );
- // Read the name.
- const char* pErr = p;
- p = ReadName( p, &value, encoding );
- if ( !p || !*p )
- {
- if ( document ) document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding );
- return 0;
- }
- TIXML_STRING endTag ("</");
- endTag += value;
- // Check for and read attributes. Also look for an empty
- // tag or an end tag.
- while ( p && *p )
- {
- pErr = p;
- p = SkipWhiteSpace( p, encoding );
- if ( !p || !*p )
- {
- if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
- return 0;
- }
- if ( *p == '/' )
- {
- ++p;
- // Empty tag.
- if ( *p != '>' )
- {
- if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding );
- return 0;
- }
- return (p+1);
- }
- else if ( *p == '>' )
- {
- // Done with attributes (if there were any.)
- // Read the value -- which can include other
- // elements -- read the end tag, and return.
- ++p;
- p = ReadValue( p, data, encoding ); // Note this is an Element method, and will set the error if one happens.
- if ( !p || !*p ) {
- // We were looking for the end tag, but found nothing.
- // Fix for [ 1663758 ] Failure to report error on bad XML
- if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
- return 0;
- }
- // We should find the end tag now
- // note that:
- // </foo > and
- // </foo>
- // are both valid end tags.
- if ( StringEqual( p, endTag.c_str(), false, encoding ) )
- {
- p += endTag.length();
- p = SkipWhiteSpace( p, encoding );
- if ( p && *p && *p == '>' ) {
- ++p;
- return p;
- }
- if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
- return 0;
- }
- else
- {
- if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
- return 0;
- }
- }
- else
- {
- // Try to read an attribute:
- TiXmlAttribute* attrib = new TiXmlAttribute();
- if ( !attrib )
- {
- return 0;
- }
- attrib->SetDocument( document );
- pErr = p;
- p = attrib->Parse( p, data, encoding );
- if ( !p || !*p )
- {
- if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
- delete attrib;
- return 0;
- }
- // Handle the strange case of double attributes:
- #ifdef TIXML_USE_STL
- TiXmlAttribute* node = attributeSet.Find( attrib->NameTStr() );
- #else
- TiXmlAttribute* node = attributeSet.Find( attrib->Name() );
- #endif
- if ( node )
- {
- if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
- delete attrib;
- return 0;
- }
- attributeSet.Add( attrib );
- }
- }
- return p;
- }
- const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
- {
- TiXmlDocument* document = GetDocument();
- // Read in text and elements in any order.
- const char* pWithWhiteSpace = p;
- p = SkipWhiteSpace( p, encoding );
- while ( p && *p )
- {
- if ( *p != '<' )
- {
- // Take what we have, make a text element.
- TiXmlText* textNode = new TiXmlText( "" );
- if ( !textNode )
- {
- return 0;
- }
- if ( TiXmlBase::IsWhiteSpaceCondensed() )
- {
- p = textNode->Parse( p, data, encoding );
- }
- else
- {
- // Special case: we want to keep the white space
- // so that leading spaces aren't removed.
- p = textNode->Parse( pWithWhiteSpace, data, encoding );
- }
- if ( !textNode->Blank() )
- LinkEndChild( textNode );
- else
- delete textNode;
- }
- else
- {
- // We hit a '<'
- // Have we hit a new element or an end tag? This could also be
- // a TiXmlText in the "CDATA" style.
- if ( StringEqual( p, "</", false, encoding ) )
- {
- return p;
- }
- else
- {
- TiXmlNode* node = Identify( p, encoding );
- if ( node )
- {
- p = node->Parse( p, data, encoding );
- LinkEndChild( node );
- }
- else
- {
- return 0;
- }
- }
- }
- pWithWhiteSpace = p;
- p = SkipWhiteSpace( p, encoding );
- }
- if ( !p )
- {
- if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding );
- }
- return p;
- }
- #ifdef TIXML_USE_STL
- void TiXmlUnknown::StreamIn( std::istream * in, TIXML_STRING * tag )
- {
- while ( in->good() )
- {
- int c = in->get();
- if ( c <= 0 )
- {
- TiXmlDocument* document = GetDocument();
- if ( document )
- document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
- return;
- }
- (*tag) += (char) c;
- if ( c == '>' )
- {
- // All is well.
- return;
- }
- }
- }
- #endif
- const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
- {
- TiXmlDocument* document = GetDocument();
- p = SkipWhiteSpace( p, encoding );
- if ( data )
- {
- data->Stamp( p, encoding );
- location = data->Cursor();
- }
- if ( !p || !*p || *p != '<' )
- {
- if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding );
- return 0;
- }
- ++p;
- value = "";
- while ( p && *p && *p != '>' )
- {
- value += *p;
- ++p;
- }
- if ( !p )
- {
- if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
- }
- if ( *p == '>' )
- return p+1;
- return p;
- }
- #ifdef TIXML_USE_STL
- void TiXmlComment::StreamIn( std::istream * in, TIXML_STRING * tag )
- {
- while ( in->good() )
- {
- int c = in->get();
- if ( c <= 0 )
- {
- TiXmlDocument* document = GetDocument();
- if ( document )
- document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
- return;
- }
- (*tag) += (char) c;
- if ( c == '>'
- && tag->at( tag->length() - 2 ) == '-'
- && tag->at( tag->length() - 3 ) == '-' )
- {
- // All is well.
- return;
- }
- }
- }
- #endif
- const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
- {
- TiXmlDocument* document = GetDocument();
- value = "";
- p = SkipWhiteSpace( p, encoding );
- if ( data )
- {
- data->Stamp( p, encoding );
- location = data->Cursor();
- }
- const char* startTag = "<!--";
- const char* endTag = "-->";
- if ( !StringEqual( p, startTag, false, encoding ) )
- {
- document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
- return 0;
- }
- p += strlen( startTag );
- // [ 1475201 ] TinyXML parses entities in comments
- // Oops - ReadText doesn't work, because we don't want to parse the entities.
- // p = ReadText( p, &value, false, endTag, false, encoding );
- //
- // from the XML spec:
- /*
- [Definition: Comments may appear anywhere in a document outside other markup; in addition,
- they may appear within the document type declaration at places allowed by the grammar.
- They are not part of the document's character data; an XML processor MAY, but need not,
- make it possible for an application to retrieve the text of comments. For compatibility,
- the string "--" (double-hyphen) MUST NOT occur within comments.] Parameter entity
- references MUST NOT be recognized within comments.
- An example of a comment:
- <!-- declarations for <head> & <body> -->
- */
- value = "";
- // Keep all the white space.
- while ( p && *p && !StringEqual( p, endTag, false, encoding ) )
- {
- value.append( p, 1 );
- ++p;
- }
- if ( p && *p )
- p += strlen( endTag );
- return p;
- }
- const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
- {
- p = SkipWhiteSpace( p, encoding );
- if ( !p || !*p ) return 0;
- if ( data )
- {
- data->Stamp( p, encoding );
- location = data->Cursor();
- }
- // Read the name, the '=' and the value.
- const char* pErr = p;
- p = ReadName( p, &name, encoding );
- if ( !p || !*p )
- {
- if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
- return 0;
- }
- p = SkipWhiteSpace( p, encoding );
- if ( !p || !*p || *p != '=' )
- {
- if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
- return 0;
- }
- ++p; // skip '='
- p = SkipWhiteSpace( p, encoding );
- if ( !p || !*p )
- {
- if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
- return 0;
- }
-
- const char* end;
- const char SINGLE_QUOTE = '\'';
- const char DOUBLE_QUOTE = '\"';
- if ( *p == SINGLE_QUOTE )
- {
- ++p;
- end = "\'"; // single quote in string
- p = ReadText( p, &value, false, end, false, encoding );
- }
- else if ( *p == DOUBLE_QUOTE )
- {
- ++p;
- end = "\""; // double quote in string
- p = ReadText( p, &value, false, end, false, encoding );
- }
- else
- {
- // All attribute values should be in single or double quotes.
- // But this is such a common error that the parser will try
- // its best, even without them.
- value = "";
- while ( p && *p // existence
- && !IsWhiteSpace( *p ) // whitespace
- && *p != '/' && *p != '>' ) // tag end
- {
- if ( *p == SINGLE_QUOTE || *p == DOUBLE_QUOTE ) {
- // [ 1451649 ] Attribute values with trailing quotes not handled correctly
- // We did not have an opening quote but seem to have a
- // closing one. Give up and throw an error.
- if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
- return 0;
- }
- value += *p;
- ++p;
- }
- }
- return p;
- }
- #ifdef TIXML_USE_STL
- void TiXmlText::StreamIn( std::istream * in, TIXML_STRING * tag )
- {
- while ( in->good() )
- {
- int c = in->peek();
- if ( !cdata && (c == '<' ) )
- {
- return;
- }
- if ( c <= 0 )
- {
- TiXmlDocument* document = GetDocument();
- if ( document )
- document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
- return;
- }
- (*tag) += (char) c;
- in->get(); // "commits" the peek made above
- if ( cdata && c == '>' && tag->size() >= 3 ) {
- size_t len = tag->size();
- if ( (*tag)[len-2] == ']' && (*tag)[len-3] == ']' ) {
- // terminator of cdata.
- return;
- }
- }
- }
- }
- #endif
- const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
- {
- value = "";
- TiXmlDocument* document = GetDocument();
- if ( data )
- {
- data->Stamp( p, encoding );
- location = data->Cursor();
- }
- const char* const startTag = "<![CDATA[";
- const char* const endTag = "]]>";
- if ( cdata || StringEqual( p, startTag, false, encoding ) )
- {
- cdata = true;
- if ( !StringEqual( p, startTag, false, encoding ) )
- {
- document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding );
- return 0;
- }
- p += strlen( startTag );
- // Keep all the white space, ignore the encoding, etc.
- while ( p && *p
- && !StringEqual( p, endTag, false, encoding )
- )
- {
- value += *p;
- ++p;
- }
- TIXML_STRING dummy;
- p = ReadText( p, &dummy, false, endTag, false, encoding );
- return p;
- }
- else
- {
- bool ignoreWhite = true;
- const char* end = "<";
- p = ReadText( p, &value, ignoreWhite, end, false, encoding );
- if ( p )
- return p-1; // don't truncate the '<'
- return 0;
- }
- }
- #ifdef TIXML_USE_STL
- void TiXmlDeclaration::StreamIn( std::istream * in, TIXML_STRING * tag )
- {
- while ( in->good() )
- {
- int c = in->get();
- if ( c <= 0 )
- {
- TiXmlDocument* document = GetDocument();
- if ( document )
- document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
- return;
- }
- (*tag) += (char) c;
- if ( c == '>' )
- {
- // All is well.
- return;
- }
- }
- }
- #endif
- const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding )
- {
- p = SkipWhiteSpace( p, _encoding );
- // Find the beginning, find the end, and look for
- // the stuff in-between.
- TiXmlDocument* document = GetDocument();
- if ( !p || !*p || !StringEqual( p, "<?xml", true, _encoding ) )
- {
- if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
- return 0;
- }
- if ( data )
- {
- data->Stamp( p, _encoding );
- location = data->Cursor();
- }
- p += 5;
- version = "";
- encoding = "";
- standalone = "";
- while ( p && *p )
- {
- if ( *p == '>' )
- {
- ++p;
- return p;
- }
- p = SkipWhiteSpace( p, _encoding );
- if ( StringEqual( p, "version", true, _encoding ) )
- {
- TiXmlAttribute attrib;
- p = attrib.Parse( p, data, _encoding );
- version = attrib.Value();
- }
- else if ( StringEqual( p, "encoding", true, _encoding ) )
- {
- TiXmlAttribute attrib;
- p = attrib.Parse( p, data, _encoding );
- encoding = attrib.Value();
- }
- else if ( StringEqual( p, "standalone", true, _encoding ) )
- {
- TiXmlAttribute attrib;
- p = attrib.Parse( p, data, _encoding );
- standalone = attrib.Value();
- }
- else
- {
- // Read over whatever it is.
- while( p && *p && *p != '>' && !IsWhiteSpace( *p ) )
- ++p;
- }
- }
- return 0;
- }
- bool TiXmlText::Blank() const
- {
- for ( unsigned i=0; i<value.length(); i++ )
- if ( !IsWhiteSpace( value[i] ) )
- return false;
- return true;
- }
|