// File: CXMLReaderImpl.h
  1. // Copyright (C) 2002-2012 Nikolaus Gebhardt
  2. // This file is part of the "Irrlicht Engine" and the "irrXML" project.
  3. // For conditions of distribution and use, see copyright notice in irrlicht.h and/or irrXML.h
  4. #ifndef __ICXML_READER_IMPL_H_INCLUDED__
  5. #define __ICXML_READER_IMPL_H_INCLUDED__
  6. #include "irrXML.h"
  7. #include "irrString.h"
  8. #include "irrArray.h"
  9. #include "fast_atof.h"
  10. #ifdef _DEBUG
  11. #define IRR_DEBUGPRINT(x) printf((x));
  12. #else // _DEBUG
  13. #define IRR_DEBUGPRINT(x)
  14. #endif // _DEBUG
  15. namespace irr
  16. {
  17. namespace io
  18. {
  19. //! implementation of the IrrXMLReader
  20. template<class char_type, class superclass>
  21. class CXMLReaderImpl : public IIrrXMLReader<char_type, superclass>
  22. {
  23. public:
  24. //! Constructor
  25. CXMLReaderImpl(IFileReadCallBack* callback, bool deleteCallBack = true)
  26. : IgnoreWhitespaceText(true), TextData(0), P(0), TextBegin(0), TextSize(0), CurrentNodeType(EXN_NONE),
  27. SourceFormat(ETF_ASCII), TargetFormat(ETF_ASCII), IsEmptyElement(false)
  28. {
  29. if (!callback)
  30. return;
  31. storeTargetFormat();
  32. // read whole xml file
  33. readFile(callback);
  34. // clean up
  35. if (deleteCallBack)
  36. delete callback;
  37. // create list with special characters
  38. createSpecialCharacterList();
  39. // set pointer to text begin
  40. P = TextBegin;
  41. }
  42. //! Destructor
  43. virtual ~CXMLReaderImpl()
  44. {
  45. delete [] TextData;
  46. }
  47. //! Reads forward to the next xml node.
  48. //! \return Returns false, if there was no further node.
  49. virtual bool read()
  50. {
  51. // if not end reached, parse the node
  52. if (P && ((unsigned int)(P - TextBegin) < TextSize - 1) && (*P != 0))
  53. {
  54. return parseCurrentNode();
  55. }
  56. _IRR_IMPLEMENT_MANAGED_MARSHALLING_BUGFIX;
  57. return false;
  58. }
  59. //! Returns the type of the current XML node.
  60. virtual EXML_NODE getNodeType() const
  61. {
  62. return CurrentNodeType;
  63. }
  64. //! Returns attribute count of the current XML node.
  65. virtual unsigned int getAttributeCount() const
  66. {
  67. return Attributes.size();
  68. }
  69. //! Returns name of an attribute.
  70. virtual const char_type* getAttributeName(int idx) const
  71. {
  72. if ((u32)idx >= Attributes.size())
  73. return 0;
  74. return Attributes[idx].Name.c_str();
  75. }
  76. //! Returns the value of an attribute.
  77. virtual const char_type* getAttributeValue(int idx) const
  78. {
  79. if ((unsigned int)idx >= Attributes.size())
  80. return 0;
  81. return Attributes[idx].Value.c_str();
  82. }
  83. //! Returns the value of an attribute.
  84. virtual const char_type* getAttributeValue(const char_type* name) const
  85. {
  86. const SAttribute* attr = getAttributeByName(name);
  87. if (!attr)
  88. return 0;
  89. return attr->Value.c_str();
  90. }
  91. //! Returns the value of an attribute
  92. virtual const char_type* getAttributeValueSafe(const char_type* name) const
  93. {
  94. const SAttribute* attr = getAttributeByName(name);
  95. if (!attr)
  96. return EmptyString.c_str();
  97. return attr->Value.c_str();
  98. }
  99. //! Returns the value of an attribute as integer.
  100. int getAttributeValueAsInt(const char_type* name) const
  101. {
  102. const SAttribute* attr = getAttributeByName(name);
  103. if (!attr)
  104. return 0;
  105. core::stringc c(attr->Value.c_str());
  106. return core::strtol10(c.c_str());
  107. }
  108. //! Returns the value of an attribute as integer.
  109. int getAttributeValueAsInt(int idx) const
  110. {
  111. const char_type* attrvalue = getAttributeValue(idx);
  112. if (!attrvalue)
  113. return 0;
  114. core::stringc c(attrvalue);
  115. return core::strtol10(c.c_str());
  116. }
  117. //! Returns the value of an attribute as float.
  118. float getAttributeValueAsFloat(const char_type* name) const
  119. {
  120. const SAttribute* attr = getAttributeByName(name);
  121. if (!attr)
  122. return 0;
  123. core::stringc c = attr->Value.c_str();
  124. return core::fast_atof(c.c_str());
  125. }
  126. //! Returns the value of an attribute as float.
  127. float getAttributeValueAsFloat(int idx) const
  128. {
  129. const char_type* attrvalue = getAttributeValue(idx);
  130. if (!attrvalue)
  131. return 0;
  132. core::stringc c = attrvalue;
  133. return core::fast_atof(c.c_str());
  134. }
  135. //! Returns the name of the current node.
  136. virtual const char_type* getNodeName() const
  137. {
  138. return NodeName.c_str();
  139. }
  140. //! Returns data of the current node.
  141. virtual const char_type* getNodeData() const
  142. {
  143. return NodeName.c_str();
  144. }
  145. //! Returns if an element is an empty element, like <foo />
  146. virtual bool isEmptyElement() const
  147. {
  148. return IsEmptyElement;
  149. }
  150. //! Returns format of the source xml file.
  151. virtual ETEXT_FORMAT getSourceFormat() const
  152. {
  153. return SourceFormat;
  154. }
  155. //! Returns format of the strings returned by the parser.
  156. virtual ETEXT_FORMAT getParserFormat() const
  157. {
  158. return TargetFormat;
  159. }
  160. private:
  161. // Reads the current xml node
  162. // return false if no further node is found
  163. bool parseCurrentNode()
  164. {
  165. char_type* start = P;
  166. // more forward until '<' found
  167. while(*P != L'<' && *P)
  168. ++P;
  169. // not a node, so return false
  170. if (!*P)
  171. return false;
  172. if (P - start > 0)
  173. {
  174. // we found some text, store it
  175. if (setText(start, P))
  176. return true;
  177. }
  178. ++P;
  179. // based on current token, parse and report next element
  180. switch(*P)
  181. {
  182. case L'/':
  183. parseClosingXMLElement();
  184. break;
  185. case L'?':
  186. ignoreDefinition();
  187. break;
  188. case L'!':
  189. if (!parseCDATA())
  190. parseComment();
  191. break;
  192. default:
  193. parseOpeningXMLElement();
  194. break;
  195. }
  196. return true;
  197. }
  198. //! sets the state that text was found. Returns true if set should be set
  199. bool setText(char_type* start, char_type* end)
  200. {
  201. // By default xml preserves all whitespace. But Irrlicht dropped some whitespace by default
  202. // in the past which did lead to OS dependent behavior. We just ignore all whitespace for now
  203. // as it's the closest to fixing behavior without breaking downward compatibility too much.
  204. if ( IgnoreWhitespaceText )
  205. {
  206. char_type* p = start;
  207. for(; p != end; ++p)
  208. if (!isWhiteSpace(*p))
  209. break;
  210. if (p == end)
  211. return false;
  212. }
  213. // set current text to the parsed text, and replace xml special characters
  214. core::string<char_type> s(start, (int)(end - start));
  215. NodeName = replaceSpecialCharacters(s);
  216. // current XML node type is text
  217. CurrentNodeType = EXN_TEXT;
  218. return true;
  219. }
  220. //! ignores an xml definition like <?xml something />
  221. void ignoreDefinition()
  222. {
  223. CurrentNodeType = EXN_UNKNOWN;
  224. // move until end marked with '>' reached
  225. while(*P != L'>')
  226. ++P;
  227. ++P;
  228. }
  229. //! parses a comment
  230. void parseComment()
  231. {
  232. CurrentNodeType = EXN_COMMENT;
  233. P += 1;
  234. char_type *pCommentBegin = P;
  235. int count = 1;
  236. // move until end of comment reached
  237. while(count)
  238. {
  239. if (*P == L'>')
  240. --count;
  241. else
  242. if (*P == L'<')
  243. ++count;
  244. ++P;
  245. }
  246. P -= 3;
  247. NodeName = core::string<char_type>(pCommentBegin+2, (int)(P - pCommentBegin-2));
  248. P += 3;
  249. }
  250. //! parses an opening xml element and reads attributes
  251. void parseOpeningXMLElement()
  252. {
  253. CurrentNodeType = EXN_ELEMENT;
  254. IsEmptyElement = false;
  255. Attributes.clear();
  256. // find name
  257. const char_type* startName = P;
  258. // find end of element
  259. while(*P != L'>' && !isWhiteSpace(*P))
  260. ++P;
  261. const char_type* endName = P;
  262. // find Attributes
  263. while(*P != L'>')
  264. {
  265. if (isWhiteSpace(*P))
  266. ++P;
  267. else
  268. {
  269. if (*P != L'/')
  270. {
  271. // we've got an attribute
  272. // read the attribute names
  273. const char_type* attributeNameBegin = P;
  274. while(!isWhiteSpace(*P) && *P != L'=')
  275. ++P;
  276. const char_type* attributeNameEnd = P;
  277. ++P;
  278. // read the attribute value
  279. // check for quotes and single quotes, thx to murphy
  280. while( (*P != L'\"') && (*P != L'\'') && *P)
  281. ++P;
  282. if (!*P) // malformatted xml file
  283. return;
  284. const char_type attributeQuoteChar = *P;
  285. ++P;
  286. const char_type* attributeValueBegin = P;
  287. while(*P != attributeQuoteChar && *P)
  288. ++P;
  289. if (!*P) // malformatted xml file
  290. return;
  291. const char_type* attributeValueEnd = P;
  292. ++P;
  293. SAttribute attr;
  294. attr.Name = core::string<char_type>(attributeNameBegin,
  295. (int)(attributeNameEnd - attributeNameBegin));
  296. core::string<char_type> s(attributeValueBegin,
  297. (int)(attributeValueEnd - attributeValueBegin));
  298. attr.Value = replaceSpecialCharacters(s);
  299. Attributes.push_back(attr);
  300. }
  301. else
  302. {
  303. // tag is closed directly
  304. ++P;
  305. IsEmptyElement = true;
  306. break;
  307. }
  308. }
  309. }
  310. // check if this tag is closing directly
  311. if (endName > startName && *(endName-1) == L'/')
  312. {
  313. // directly closing tag
  314. IsEmptyElement = true;
  315. endName--;
  316. }
  317. NodeName = core::string<char_type>(startName, (int)(endName - startName));
  318. ++P;
  319. }
  320. //! parses an closing xml tag
  321. void parseClosingXMLElement()
  322. {
  323. CurrentNodeType = EXN_ELEMENT_END;
  324. IsEmptyElement = false;
  325. Attributes.clear();
  326. ++P;
  327. const char_type* pBeginClose = P;
  328. while(*P != L'>')
  329. ++P;
  330. NodeName = core::string<char_type>(pBeginClose, (int)(P - pBeginClose));
  331. ++P;
  332. }
  333. //! parses a possible CDATA section, returns false if begin was not a CDATA section
  334. bool parseCDATA()
  335. {
  336. if (*(P+1) != L'[')
  337. return false;
  338. CurrentNodeType = EXN_CDATA;
  339. // skip '<![CDATA['
  340. int count=0;
  341. while( *P && count<8 )
  342. {
  343. ++P;
  344. ++count;
  345. }
  346. if (!*P)
  347. return true;
  348. char_type *cDataBegin = P;
  349. char_type *cDataEnd = 0;
  350. // find end of CDATA
  351. while(*P && !cDataEnd)
  352. {
  353. if (*P == L'>' &&
  354. (*(P-1) == L']') &&
  355. (*(P-2) == L']'))
  356. {
  357. cDataEnd = P - 2;
  358. }
  359. ++P;
  360. }
  361. if ( cDataEnd )
  362. NodeName = core::string<char_type>(cDataBegin, (int)(cDataEnd - cDataBegin));
  363. else
  364. NodeName = "";
  365. return true;
  366. }
  367. // structure for storing attribute-name pairs
  368. struct SAttribute
  369. {
  370. core::string<char_type> Name;
  371. core::string<char_type> Value;
  372. };
  373. // finds a current attribute by name, returns 0 if not found
  374. const SAttribute* getAttributeByName(const char_type* name) const
  375. {
  376. if (!name)
  377. return 0;
  378. core::string<char_type> n = name;
  379. for (int i=0; i<(int)Attributes.size(); ++i)
  380. if (Attributes[i].Name == n)
  381. return &Attributes[i];
  382. return 0;
  383. }
  384. // replaces xml special characters in a string and creates a new one
  385. core::string<char_type> replaceSpecialCharacters(
  386. core::string<char_type>& origstr)
  387. {
  388. int pos = origstr.findFirst(L'&');
  389. int oldPos = 0;
  390. if (pos == -1)
  391. return origstr;
  392. core::string<char_type> newstr;
  393. while(pos != -1 && pos < (int)origstr.size()-2)
  394. {
  395. // check if it is one of the special characters
  396. int specialChar = -1;
  397. for (int i=0; i<(int)SpecialCharacters.size(); ++i)
  398. {
  399. const char_type* p = &origstr.c_str()[pos]+1;
  400. if (equalsn(&SpecialCharacters[i][1], p, SpecialCharacters[i].size()-1))
  401. {
  402. specialChar = i;
  403. break;
  404. }
  405. }
  406. if (specialChar != -1)
  407. {
  408. newstr.append(origstr.subString(oldPos, pos - oldPos));
  409. newstr.append(SpecialCharacters[specialChar][0]);
  410. pos += SpecialCharacters[specialChar].size();
  411. }
  412. else
  413. {
  414. newstr.append(origstr.subString(oldPos, pos - oldPos + 1));
  415. pos += 1;
  416. }
  417. // find next &
  418. oldPos = pos;
  419. pos = origstr.findNext(L'&', pos);
  420. }
  421. if (oldPos < (int)origstr.size()-1)
  422. newstr.append(origstr.subString(oldPos, origstr.size()-oldPos));
  423. return newstr;
  424. }
  425. //! reads the xml file and converts it into the wanted character format.
  426. bool readFile(IFileReadCallBack* callback)
  427. {
  428. long size = callback->getSize();
  429. if (size<0)
  430. return false;
  431. size += 4; // We need four terminating 0's at the end.
  432. // For ASCII we need 1 0's, for UTF-16 2, for UTF-32 4.
  433. char* data8 = new char[size];
  434. if (!callback->read(data8, size-4))
  435. {
  436. delete [] data8;
  437. return false;
  438. }
  439. // add zeros at end
  440. memset(data8+size-4, 0, 4);
  441. char16* data16 = reinterpret_cast<char16*>(data8);
  442. char32* data32 = reinterpret_cast<char32*>(data8);
  443. // now we need to convert the data to the desired target format
  444. // based on the byte order mark.
  445. const unsigned char UTF8[] = {0xEF, 0xBB, 0xBF}; // 0xEFBBBF;
  446. const u16 UTF16_BE = 0xFFFE;
  447. const u16 UTF16_LE = 0xFEFF;
  448. const u32 UTF32_BE = 0xFFFE0000;
  449. const u32 UTF32_LE = 0x0000FEFF;
  450. // check source for all utf versions and convert to target data format
  451. if (size >= 4 && data32[0] == static_cast<char32>(UTF32_BE))
  452. {
  453. // UTF-32, big endian
  454. SourceFormat = ETF_UTF32_BE;
  455. convertTextData(data32+1, data8, (size/4)-1); // data32+1 because we need to skip the header
  456. }
  457. else
  458. if (size >= 4 && data32[0] == static_cast<char32>(UTF32_LE))
  459. {
  460. // UTF-32, little endian
  461. SourceFormat = ETF_UTF32_LE;
  462. convertTextData(data32+1, data8, (size/4)-1); // data32+1 because we need to skip the header
  463. }
  464. else
  465. if (size >= 2 && data16[0] == UTF16_BE)
  466. {
  467. // UTF-16, big endian
  468. SourceFormat = ETF_UTF16_BE;
  469. convertTextData(data16+1, data8, (size/2)-1); // data16+1 because we need to skip the header
  470. }
  471. else
  472. if (size >= 2 && data16[0] == UTF16_LE)
  473. {
  474. // UTF-16, little endian
  475. SourceFormat = ETF_UTF16_LE;
  476. convertTextData(data16+1, data8, (size/2)-1); // data16+1 because we need to skip the header
  477. }
  478. else
  479. if (size >= 3 && memcmp(data8,UTF8,3)==0)
  480. {
  481. // UTF-8
  482. SourceFormat = ETF_UTF8;
  483. convertTextData(data8+3, data8, size-3); // data8+3 because we need to skip the header
  484. }
  485. else
  486. {
  487. // ASCII
  488. SourceFormat = ETF_ASCII;
  489. convertTextData(data8, data8, size);
  490. }
  491. return true;
  492. }
  493. //! converts the text file into the desired format.
  494. /** \param source: begin of the text (without byte order mark)
  495. \param pointerToStore: pointer to text data block which can be
  496. stored or deleted based on the nesessary conversion.
  497. \param sizeWithoutHeader: Text size in characters without header
  498. */
  499. template<class src_char_type>
  500. void convertTextData(src_char_type* source, char* pointerToStore, int sizeWithoutHeader)
  501. {
  502. // convert little to big endian if necessary
  503. if (sizeof(src_char_type) > 1 &&
  504. isLittleEndian(TargetFormat) != isLittleEndian(SourceFormat))
  505. convertToLittleEndian(source);
  506. // check if conversion is necessary:
  507. if (sizeof(src_char_type) == sizeof(char_type))
  508. {
  509. // no need to convert
  510. TextBegin = (char_type*)source;
  511. TextData = (char_type*)pointerToStore;
  512. TextSize = sizeWithoutHeader;
  513. }
  514. else
  515. {
  516. // convert source into target data format.
  517. // TODO: implement a real conversion. This one just
  518. // copies bytes. This is a problem when there are
  519. // unicode symbols using more than one character.
  520. TextData = new char_type[sizeWithoutHeader];
  521. if ( sizeof(src_char_type) == 1 )
  522. {
  523. // we have to cast away negative numbers or results might add the sign instead of just doing a copy
  524. for (int i=0; i<sizeWithoutHeader; ++i)
  525. {
  526. TextData[i] = static_cast<char_type>(static_cast<unsigned char>(source[i]));
  527. }
  528. }
  529. else
  530. {
  531. for (int i=0; i<sizeWithoutHeader; ++i)
  532. TextData[i] = static_cast<char_type>(source[i]);
  533. }
  534. TextBegin = TextData;
  535. TextSize = sizeWithoutHeader;
  536. // delete original data because no longer needed
  537. delete [] pointerToStore;
  538. }
  539. }
  //! Swaps the byte order of every character of a zero terminated buffer.
  /** Characters of size 4 get all four bytes reversed; every other size is
  handled as 16 bit and gets its two bytes exchanged.
  \param t: Buffer to convert in place; conversion stops at the first zero
  character. */
  template<class src_char_type>
  void convertToLittleEndian(src_char_type* t)
  {
      const bool is32Bit = (sizeof(src_char_type) == 4);

      for (; *t; ++t)
      {
          if (is32Bit)
          {
              // 32 bit: reverse the four bytes
              *t = ((*t & 0xff000000) >> 24) |
                   ((*t & 0x00ff0000) >> 8)  |
                   ((*t & 0x0000ff00) << 8)  |
                   ((*t & 0x000000ff) << 24);
          }
          else
          {
              // 16 bit: exchange high and low byte
              *t = (*t >> 8) | (*t << 8);
          }
      }
  }
  566. //! returns if a format is little endian
  567. inline bool isLittleEndian(ETEXT_FORMAT f)
  568. {
  569. return f == ETF_ASCII ||
  570. f == ETF_UTF8 ||
  571. f == ETF_UTF16_LE ||
  572. f == ETF_UTF32_LE;
  573. }
  574. //! returns true if a character is whitespace
  575. inline bool isWhiteSpace(char_type c)
  576. {
  577. return (c==' ' || c=='\t' || c=='\n' || c=='\r');
  578. }
  579. //! generates a list with xml special characters
  580. void createSpecialCharacterList()
  581. {
  582. // list of strings containing special symbols,
  583. // the first character is the special character,
  584. // the following is the symbol string without trailing &.
  585. SpecialCharacters.push_back("&amp;");
  586. SpecialCharacters.push_back("<lt;");
  587. SpecialCharacters.push_back(">gt;");
  588. SpecialCharacters.push_back("\"quot;");
  589. SpecialCharacters.push_back("'apos;");
  590. }
  591. //! compares the first n characters of the strings
  592. bool equalsn(const char_type* str1, const char_type* str2, int len)
  593. {
  594. int i;
  595. for(i=0; str1[i] && str2[i] && i < len; ++i)
  596. if (str1[i] != str2[i])
  597. return false;
  598. // if one (or both) of the strings was smaller then they
  599. // are only equal if they have the same lenght
  600. return (i == len) || (str1[i] == 0 && str2[i] == 0);
  601. }
  602. //! stores the target text format
  603. void storeTargetFormat()
  604. {
  605. // get target format. We could have done this using template specialization,
  606. // but VisualStudio 6 don't like it and we want to support it.
  607. switch(sizeof(char_type))
  608. {
  609. case 1:
  610. TargetFormat = ETF_UTF8;
  611. break;
  612. case 2:
  613. TargetFormat = ETF_UTF16_LE;
  614. break;
  615. case 4:
  616. TargetFormat = ETF_UTF32_LE;
  617. break;
  618. default:
  619. TargetFormat = ETF_ASCII; // should never happen.
  620. }
  621. }
  622. // instance variables:
  623. bool IgnoreWhitespaceText; // do not return EXN_TEXT nodes for pure whitespace
  624. char_type* TextData; // data block of the text file
  625. char_type* P; // current point in text to parse
  626. char_type* TextBegin; // start of text to parse
  627. unsigned int TextSize; // size of text to parse in characters, not bytes
  628. EXML_NODE CurrentNodeType; // type of the currently parsed node
  629. ETEXT_FORMAT SourceFormat; // source format of the xml file
  630. ETEXT_FORMAT TargetFormat; // output format of this parser
  631. core::string<char_type> NodeName; // name of the node currently in - also used for text
  632. core::string<char_type> EmptyString; // empty string to be returned by getSafe() methods
  633. bool IsEmptyElement; // is the currently parsed node empty?
  634. core::array< core::string<char_type> > SpecialCharacters; // see createSpecialCharacterList()
  635. core::array<SAttribute> Attributes; // attributes of current element
  636. }; // end CXMLReaderImpl
  637. } // end namespace
  638. } // end namespace
  639. #endif