XmlParser.java 159 KB


  1. /* XmlParser.java --
  2. Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.
  3. This file is part of GNU Classpath.
  4. GNU Classpath is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2, or (at your option)
  7. any later version.
  8. GNU Classpath is distributed in the hope that it will be useful, but
  9. WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with GNU Classpath; see the file COPYING. If not, write to the
  14. Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  15. 02110-1301 USA.
  16. Linking this library statically or dynamically with other modules is
  17. making a combined work based on this library. Thus, the terms and
  18. conditions of the GNU General Public License cover the whole
  19. combination.
  20. As a special exception, the copyright holders of this library give you
  21. permission to link this library with independent modules to produce an
  22. executable, regardless of the license terms of these independent
  23. modules, and to copy and distribute the resulting executable under
  24. terms of your choice, provided that you also meet, for each linked
  25. independent module, the terms and conditions of the license of that
  26. module. An independent module is a module which is not derived from
  27. or based on this library. If you modify this library, you may extend
  28. this exception to your version of the library, but you are not
  29. obligated to do so. If you do not wish to do so, delete this
  30. exception statement from your version.
  31. Partly derived from code which carried the following notice:
  32. Copyright (c) 1997, 1998 by Microstar Software Ltd.
  33. AElfred is free for both commercial and non-commercial use and
  34. redistribution, provided that Microstar's copyright and disclaimer are
  35. retained intact. You are free to modify AElfred for your own use and
  36. to redistribute AElfred with your modifications, provided that the
  37. modifications are clearly documented.
  38. This program is distributed in the hope that it will be useful, but
  39. WITHOUT ANY WARRANTY; without even the implied warranty of
  40. merchantability or fitness for a particular purpose. Please use it AT
  41. YOUR OWN RISK.
  42. */
  43. package gnu.xml.aelfred2;
  44. import gnu.java.security.action.GetPropertyAction;
  45. import java.io.BufferedInputStream;
  46. import java.io.CharConversionException;
  47. import java.io.EOFException;
  48. import java.io.InputStream;
  49. import java.io.InputStreamReader;
  50. import java.io.IOException;
  51. import java.io.Reader;
  52. import java.io.UnsupportedEncodingException;
  53. import java.net.URL;
  54. import java.net.URLConnection;
  55. import java.security.AccessController;
  56. import java.util.Iterator;
  57. import java.util.HashMap;
  58. import java.util.LinkedList;
  59. import org.xml.sax.InputSource;
  60. import org.xml.sax.SAXException;
  61. /**
  62. * Parse XML documents and return parse events through call-backs.
  63. * Use the <code>SAXDriver</code> class as your entry point, as all
  64. * internal parser interfaces are subject to change.
  65. *
  66. * @author Written by David Megginson &lt;dmeggins@microstar.com&gt;
  67. * (version 1.2a with bugfixes)
  68. * @author Updated by David Brownell &lt;dbrownell@users.sourceforge.net&gt;
  69. * @see SAXDriver
  70. */
  71. final class XmlParser
  72. {
  73. // avoid slow per-character readCh()
  74. private final static boolean USE_CHEATS = true;
  75. ////////////////////////////////////////////////////////////////////////
  76. // Constants.
  77. ////////////////////////////////////////////////////////////////////////
  78. //
  79. // Constants for element content type.
  80. //
  81. /**
  82. * Constant: an element has not been declared.
  83. * @see #getElementContentType
  84. */
  85. public final static int CONTENT_UNDECLARED = 0;
  86. /**
  87. * Constant: the element has a content model of ANY.
  88. * @see #getElementContentType
  89. */
  90. public final static int CONTENT_ANY = 1;
  91. /**
  92. * Constant: the element has declared content of EMPTY.
  93. * @see #getElementContentType
  94. */
  95. public final static int CONTENT_EMPTY = 2;
  96. /**
  97. * Constant: the element has mixed content.
  98. * @see #getElementContentType
  99. */
  100. public final static int CONTENT_MIXED = 3;
  101. /**
  102. * Constant: the element has element content.
  103. * @see #getElementContentType
  104. */
  105. public final static int CONTENT_ELEMENTS = 4;
  106. //
  107. // Constants for the entity type.
  108. //
  109. /**
  110. * Constant: the entity has not been declared.
  111. * @see #getEntityType
  112. */
  113. public final static int ENTITY_UNDECLARED = 0;
  114. /**
  115. * Constant: the entity is internal.
  116. * @see #getEntityType
  117. */
  118. public final static int ENTITY_INTERNAL = 1;
  119. /**
  120. * Constant: the entity is external, non-parsable data.
  121. * @see #getEntityType
  122. */
  123. public final static int ENTITY_NDATA = 2;
  124. /**
  125. * Constant: the entity is external XML data.
  126. * @see #getEntityType
  127. */
  128. public final static int ENTITY_TEXT = 3;
  129. //
  130. // Attribute type constants are interned literal strings.
  131. //
  132. //
  133. // Constants for supported encodings. "external" is just a flag.
  134. //
  135. private final static int ENCODING_EXTERNAL = 0;
  136. private final static int ENCODING_UTF_8 = 1;
  137. private final static int ENCODING_ISO_8859_1 = 2;
  138. private final static int ENCODING_UCS_2_12 = 3;
  139. private final static int ENCODING_UCS_2_21 = 4;
  140. private final static int ENCODING_UCS_4_1234 = 5;
  141. private final static int ENCODING_UCS_4_4321 = 6;
  142. private final static int ENCODING_UCS_4_2143 = 7;
  143. private final static int ENCODING_UCS_4_3412 = 8;
  144. private final static int ENCODING_ASCII = 9;
  145. //
  146. // Constants for attribute default value.
  147. //
  148. /**
  149. * Constant: the attribute is not declared.
  150. * @see #getAttributeDefaultValueType
  151. */
  152. public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30;
  153. /**
  154. * Constant: the attribute has a literal default value specified.
  155. * @see #getAttributeDefaultValueType
  156. * @see #getAttributeDefaultValue
  157. */
  158. public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31;
  159. /**
  160. * Constant: the attribute was declared #IMPLIED.
  161. * @see #getAttributeDefaultValueType
  162. */
  163. public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32;
  164. /**
  165. * Constant: the attribute was declared #REQUIRED.
  166. * @see #getAttributeDefaultValueType
  167. */
  168. public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33;
  169. /**
  170. * Constant: the attribute was declared #FIXED.
  171. * @see #getAttributeDefaultValueType
  172. * @see #getAttributeDefaultValue
  173. */
  174. public final static int ATTRIBUTE_DEFAULT_FIXED = 34;
  175. //
  176. // Constants for input.
  177. //
  178. private final static int INPUT_NONE = 0;
  179. private final static int INPUT_INTERNAL = 1;
  180. private final static int INPUT_STREAM = 3;
  181. private final static int INPUT_READER = 5;
  182. //
  183. // Flags for reading literals.
  184. //
  185. // expand general entity refs (attribute values in dtd and content)
  186. private final static int LIT_ENTITY_REF = 2;
  187. // normalize this value (space chars) (attributes, public ids)
  188. private final static int LIT_NORMALIZE = 4;
  189. // literal is an attribute value
  190. private final static int LIT_ATTRIBUTE = 8;
  191. // don't expand parameter entities
  192. private final static int LIT_DISABLE_PE = 16;
  193. // don't expand [or parse] character refs
  194. private final static int LIT_DISABLE_CREF = 32;
  195. // don't parse general entity refs
  196. private final static int LIT_DISABLE_EREF = 64;
  197. // literal is a public ID value
  198. private final static int LIT_PUBID = 256;
  199. //
  200. // Flags affecting PE handling in DTDs (if expandPE is true).
  201. // PEs expand with space padding, except inside literals.
  202. //
  203. private final static int CONTEXT_NORMAL = 0;
  204. private final static int CONTEXT_LITERAL = 1;
  205. // Emit warnings for relative URIs with no base URI.
  206. static boolean uriWarnings;
  207. static
  208. {
  209. String key = "gnu.xml.aelfred2.XmlParser.uriWarnings";
  210. GetPropertyAction a = new GetPropertyAction(key);
  211. uriWarnings = "true".equals(AccessController.doPrivileged(a));
  212. }
  213. //
  214. // The current XML handler interface.
  215. //
  216. private SAXDriver handler;
  217. //
  218. // I/O information.
  219. //
  220. private Reader reader; // current reader
  221. private InputStream is; // current input stream
  222. private int line; // current line number
  223. private int column; // current column number
  224. private int sourceType; // type of input source
  225. private LinkedList inputStack; // stack of input sources
  226. private URLConnection externalEntity; // current external entity
  227. private int encoding; // current character encoding
  228. private int currentByteCount; // bytes read from current source
  229. private InputSource scratch; // temporary
  230. //
  231. // Buffers for decoded but unparsed character input.
  232. //
  233. private char[] readBuffer;
  234. private int readBufferPos;
  235. private int readBufferLength;
  236. private int readBufferOverflow; // overflow from last data chunk.
  237. //
  238. // Buffer for undecoded raw byte input.
  239. //
  240. private final static int READ_BUFFER_MAX = 16384;
  241. private byte[] rawReadBuffer;
  242. //
  243. // Buffer for attribute values, char refs, DTD stuff.
  244. //
  245. private static int DATA_BUFFER_INITIAL = 4096;
  246. private char[] dataBuffer;
  247. private int dataBufferPos;
  248. //
  249. // Buffer for parsed names.
  250. //
  251. private static int NAME_BUFFER_INITIAL = 1024;
  252. private char[] nameBuffer;
  253. private int nameBufferPos;
  254. //
  255. // Save any standalone flag
  256. //
  257. private boolean docIsStandalone;
  258. //
  259. // Hashtables for DTD information on elements, entities, and notations.
  260. // Populated until we start ignoring decls (because of skipping a PE)
  261. //
  262. private HashMap elementInfo;
  263. private HashMap entityInfo;
  264. private HashMap notationInfo;
  265. private boolean skippedPE;
  266. //
  267. // Element type currently in force.
  268. //
  269. private String currentElement;
  270. private int currentElementContent;
  271. //
  272. // Stack of entity names, to detect recursion.
  273. //
  274. private LinkedList entityStack;
  275. //
  276. // PE expansion is enabled in most chunks of the DTD, not all.
  277. // When it's enabled, literals are treated differently.
  278. //
  279. private boolean inLiteral;
  280. private boolean expandPE;
  281. private boolean peIsError;
  282. //
  283. // can't report entity expansion inside two constructs:
  284. // - attribute expansions (internal entities only)
  285. // - markup declarations (parameter entities only)
  286. //
  287. private boolean doReport;
  288. //
  289. // Symbol table, for caching interned names.
  290. //
  291. // These show up wherever XML names or nmtokens are used: naming elements,
  292. // attributes, PIs, notations, entities, and enumerated attribute values.
  293. //
  294. // NOTE: This hashtable doesn't grow. The default size is intended to be
  295. // rather large for most documents. Example: one snapshot of the DocBook
  296. // XML 4.1 DTD used only about 350 such names. As a rule, only pathological
  297. // documents (ones that don't reuse names) should ever see much collision.
  298. //
  299. // Be sure that SYMBOL_TABLE_LENGTH always stays prime, for best hashing.
  300. // "2039" keeps the hash table size at about two memory pages on typical
  301. // 32 bit hardware.
  302. //
  303. private final static int SYMBOL_TABLE_LENGTH = 2039;
  304. private Object[][] symbolTable;
  305. //
  306. // Hash table of attributes found in current start tag.
  307. //
  308. private String[] tagAttributes;
  309. private int tagAttributePos;
  310. //
  311. // Utility flag: have we noticed a CR while reading the last
  312. // data chunk? If so, we will have to go back and normalise
  313. // CR or CR/LF line ends.
  314. //
  315. private boolean sawCR;
  316. //
  317. // Utility flag: are we in CDATA? If so, whitespace isn't ignorable.
  318. //
  319. private boolean inCDATA;
  320. //
  321. // Xml version.
  322. //
  323. private static final int XML_10 = 0;
  324. private static final int XML_11 = 1;
  325. private int xmlVersion = XML_10;
  326. //////////////////////////////////////////////////////////////////////
  327. // Constructors.
  328. ////////////////////////////////////////////////////////////////////////
  /**
   * Construct a new parser with no associated handler.
   * A handler has to be installed with {@link #setHandler} before
   * {@link #doParse} is called, since doParse rejects a null handler.
   * @see #setHandler
   * @see #doParse
   */
  // package private
  XmlParser()
  {
  }
  /**
   * Set the handler that will receive parsing events.
   * The reference is simply stored; it is checked for null only when
   * {@link #doParse} starts.
   * @param handler The handler to receive callback events.
   * @see #doParse
   */
  // package private
  void setHandler(SAXDriver handler)
  {
    this.handler = handler;
  }
  348. /**
  349. * Parse an XML document from the character stream, byte stream, or URI
  350. * that you provide (in that order of preference). Any URI that you
  351. * supply will become the base URI for resolving relative URI, and may
  352. * be used to acquire a reader or byte stream.
  353. *
  354. * <p> Only one thread at a time may use this parser; since it is
  355. * private to this package, post-parse cleanup is done by the caller,
  356. * which MUST NOT REUSE the parser (just null it).
  357. *
  358. * @param systemId Absolute URI of the document; should never be null,
  359. * but may be so iff a reader <em>or</em> a stream is provided.
  360. * @param publicId The public identifier of the document, or null.
  361. * @param reader A character stream; must be null if stream isn't.
  362. * @param stream A byte input stream; must be null if reader isn't.
  363. * @param encoding The suggested encoding, or null if unknown.
  364. * @exception java.lang.Exception Basically SAXException or IOException
  365. */
  366. // package private
  367. void doParse(String systemId, String publicId, Reader reader,
  368. InputStream stream, String encoding)
  369. throws Exception
  370. {
  371. if (handler == null)
  372. {
  373. throw new IllegalStateException("no callback handler");
  374. }
  375. initializeVariables();
  376. // predeclare the built-in entities here (replacement texts)
  377. // we don't need to intern(), since we're guaranteed literals
  378. // are always (globally) interned.
  379. setInternalEntity("amp", "&#38;");
  380. setInternalEntity("lt", "&#60;");
  381. setInternalEntity("gt", "&#62;");
  382. setInternalEntity("apos", "&#39;");
  383. setInternalEntity("quot", "&#34;");
  384. try
  385. {
  386. // pushURL first to ensure locator is correct in startDocument
  387. // ... it might report an IO or encoding exception.
  388. handler.startDocument();
  389. pushURL(false, "[document]",
  390. // default baseURI: null
  391. new ExternalIdentifiers(publicId, systemId, null),
  392. reader, stream, encoding, false);
  393. parseDocument();
  394. }
  395. catch (EOFException e)
  396. {
  397. //empty input
  398. error("empty document, with no root element.");
  399. }
  400. finally
  401. {
  402. if (reader != null)
  403. {
  404. try
  405. {
  406. reader.close();
  407. }
  408. catch (IOException e)
  409. {
  410. /* ignore */
  411. }
  412. }
  413. if (stream != null)
  414. {
  415. try
  416. {
  417. stream.close();
  418. }
  419. catch (IOException e)
  420. {
  421. /* ignore */
  422. }
  423. }
  424. if (is != null)
  425. {
  426. try
  427. {
  428. is.close();
  429. }
  430. catch (IOException e)
  431. {
  432. /* ignore */
  433. }
  434. }
  435. scratch = null;
  436. }
  437. }
  438. //////////////////////////////////////////////////////////////////////
  439. // Error reporting.
  440. //////////////////////////////////////////////////////////////////////
  441. /**
  442. * Report an error.
  443. * @param message The error message.
  444. * @param textFound The text that caused the error (or null).
  445. * @see SAXDriver#error
  446. * @see #line
  447. */
  448. private void error(String message, String textFound, String textExpected)
  449. throws SAXException
  450. {
  451. if (textFound != null)
  452. {
  453. message = message + " (found \"" + textFound + "\")";
  454. }
  455. if (textExpected != null)
  456. {
  457. message = message + " (expected \"" + textExpected + "\")";
  458. }
  459. handler.fatal(message);
  460. // "can't happen"
  461. throw new SAXException(message);
  462. }
  463. /**
  464. * Report a serious error.
  465. * @param message The error message.
  466. * @param textFound The text that caused the error (or null).
  467. */
  468. private void error(String message, char textFound, String textExpected)
  469. throws SAXException
  470. {
  471. error(message, Character.toString(textFound), textExpected);
  472. }
  473. /**
  474. * Report typical case fatal errors.
  475. */
  476. private void error(String message)
  477. throws SAXException
  478. {
  479. handler.fatal(message);
  480. }
  481. //////////////////////////////////////////////////////////////////////
  482. // Major syntactic productions.
  483. //////////////////////////////////////////////////////////////////////
  484. /**
  485. * Parse an XML document.
  486. * <pre>
  487. * [1] document ::= prolog element Misc*
  488. * </pre>
  489. * <p>This is the top-level parsing function for a single XML
  490. * document. As a minimum, a well-formed document must have
  491. * a document element, and a valid document must have a prolog
  492. * (one with doctype) as well.
  493. */
  494. private void parseDocument()
  495. throws Exception
  496. {
  497. try
  498. { // added by MHK
  499. boolean sawDTD = parseProlog();
  500. require('<');
  501. parseElement(!sawDTD);
  502. }
  503. catch (EOFException ee)
  504. { // added by MHK
  505. error("premature end of file", "[EOF]", null);
  506. }
  507. try
  508. {
  509. parseMisc(); //skip all white, PIs, and comments
  510. char c = readCh(); //if this doesn't throw an exception...
  511. error("unexpected characters after document end", c, null);
  512. }
  513. catch (EOFException e)
  514. {
  515. return;
  516. }
  517. }
  518. static final char[] startDelimComment = { '<', '!', '-', '-' };
  519. static final char[] endDelimComment = { '-', '-' };
  520. /**
  521. * Skip a comment.
  522. * <pre>
  523. * [15] Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
  524. * </pre>
  525. * <p> (The <code>&lt;!--</code> has already been read.)
  526. */
  527. private void parseComment()
  528. throws Exception
  529. {
  530. char c;
  531. boolean saved = expandPE;
  532. expandPE = false;
  533. parseUntil(endDelimComment);
  534. require('>');
  535. expandPE = saved;
  536. handler.comment(dataBuffer, 0, dataBufferPos);
  537. dataBufferPos = 0;
  538. }
  539. static final char[] startDelimPI = { '<', '?' };
  540. static final char[] endDelimPI = { '?', '>' };
  541. /**
  542. * Parse a processing instruction and do a call-back.
  543. * <pre>
  544. * [16] PI ::= '&lt;?' PITarget
  545. * (S (Char* - (Char* '?&gt;' Char*)))?
  546. * '?&gt;'
  547. * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') )
  548. * </pre>
  549. * <p> (The <code>&lt;?</code> has already been read.)
  550. */
  551. private void parsePI()
  552. throws SAXException, IOException
  553. {
  554. String name;
  555. boolean saved = expandPE;
  556. expandPE = false;
  557. name = readNmtoken(true);
  558. //NE08
  559. if (name.indexOf(':') >= 0)
  560. {
  561. error("Illegal character(':') in processing instruction name ",
  562. name, null);
  563. }
  564. if ("xml".equalsIgnoreCase(name))
  565. {
  566. error("Illegal processing instruction target", name, null);
  567. }
  568. if (!tryRead(endDelimPI))
  569. {
  570. requireWhitespace();
  571. parseUntil(endDelimPI);
  572. }
  573. expandPE = saved;
  574. handler.processingInstruction(name, dataBufferToString());
  575. }
  576. static final char[] endDelimCDATA = { ']', ']', '>' };
  577. private boolean isDirtyCurrentElement;
  /**
   * Parse a CDATA section.
   * <pre>
   * [18] CDSect ::= CDStart CData CDEnd
   * [19] CDStart ::= '&lt;![CDATA['
   * [20] CData ::= (Char* - (Char* ']]&gt;' Char*))
   * [21] CDEnd ::= ']]&gt;'
   * </pre>
   * <p> (The '&lt;![CDATA[' has already been read.)
   *
   * <p> The section is scanned up to the <code>]]&gt;</code> delimiter
   * with parseUntil, and dataBufferFlush is then called.
   * NOTE(review): presumably the flush delivers the collected character
   * data to the handler -- confirm against dataBufferFlush.
   */
  private void parseCDSect()
    throws Exception
  {
    parseUntil(endDelimCDATA);
    dataBufferFlush();
  }
  594. /**
  595. * Parse the prolog of an XML document.
  596. * <pre>
  597. * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
  598. * </pre>
  599. * <p>We do not look for the XML declaration here, because it was
  600. * handled by pushURL ().
  601. * @see pushURL
  602. * @return true if a DTD was read.
  603. */
  604. private boolean parseProlog()
  605. throws Exception
  606. {
  607. parseMisc();
  608. if (tryRead("<!DOCTYPE"))
  609. {
  610. parseDoctypedecl();
  611. parseMisc();
  612. return true;
  613. }
  614. return false;
  615. }
  616. private void checkLegalVersion(String version)
  617. throws SAXException
  618. {
  619. int len = version.length();
  620. for (int i = 0; i < len; i++)
  621. {
  622. char c = version.charAt(i);
  623. if ('0' <= c && c <= '9')
  624. {
  625. continue;
  626. }
  627. if (c == '_' || c == '.' || c == ':' || c == '-')
  628. {
  629. continue;
  630. }
  631. if ('a' <= c && c <= 'z')
  632. {
  633. continue;
  634. }
  635. if ('A' <= c && c <= 'Z')
  636. {
  637. continue;
  638. }
  639. error ("illegal character in version", version, "1.0");
  640. }
  641. }
  642. /**
  643. * Parse the XML declaration.
  644. * <pre>
  645. * [23] XMLDecl ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'
  646. * [24] VersionInfo ::= S 'version' Eq
  647. * ("'" VersionNum "'" | '"' VersionNum '"' )
  648. * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')*
  649. * [32] SDDecl ::= S 'standalone' Eq
  650. * ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' )
  651. * [80] EncodingDecl ::= S 'encoding' Eq
  652. * ( "'" EncName "'" | "'" EncName "'" )
  653. * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
  654. * </pre>
  655. * <p> (The <code>&lt;?xml</code> and whitespace have already been read.)
  656. * @return the encoding in the declaration, uppercased; or null
  657. * @see #parseTextDecl
  658. * @see #setupDecoding
  659. */
  660. private String parseXMLDecl(boolean ignoreEncoding)
  661. throws SAXException, IOException
  662. {
  663. String version;
  664. String encodingName = null;
  665. String standalone = null;
  666. int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
  667. String inputEncoding = null;
  668. switch (this.encoding)
  669. {
  670. case ENCODING_EXTERNAL:
  671. case ENCODING_UTF_8:
  672. inputEncoding = "UTF-8";
  673. break;
  674. case ENCODING_ISO_8859_1:
  675. inputEncoding = "ISO-8859-1";
  676. break;
  677. case ENCODING_UCS_2_12:
  678. inputEncoding = "UTF-16BE";
  679. break;
  680. case ENCODING_UCS_2_21:
  681. inputEncoding = "UTF-16LE";
  682. break;
  683. }
  684. // Read the version.
  685. require("version");
  686. parseEq();
  687. checkLegalVersion(version = readLiteral(flags));
  688. if (!version.equals("1.0"))
  689. {
  690. if (version.equals("1.1"))
  691. {
  692. handler.warn("expected XML version 1.0, not: " + version);
  693. xmlVersion = XML_11;
  694. }
  695. else
  696. {
  697. error("illegal XML version", version, "1.0 or 1.1");
  698. }
  699. }
  700. else
  701. {
  702. xmlVersion = XML_10;
  703. }
  704. // Try reading an encoding declaration.
  705. boolean white = tryWhitespace();
  706. if (tryRead("encoding"))
  707. {
  708. if (!white)
  709. {
  710. error("whitespace required before 'encoding='");
  711. }
  712. parseEq();
  713. encodingName = readLiteral(flags);
  714. if (!ignoreEncoding)
  715. {
  716. setupDecoding(encodingName);
  717. }
  718. }
  719. // Try reading a standalone declaration
  720. if (encodingName != null)
  721. {
  722. white = tryWhitespace();
  723. }
  724. if (tryRead("standalone"))
  725. {
  726. if (!white)
  727. {
  728. error("whitespace required before 'standalone='");
  729. }
  730. parseEq();
  731. standalone = readLiteral(flags);
  732. if ("yes".equals(standalone))
  733. {
  734. docIsStandalone = true;
  735. }
  736. else if (!"no".equals(standalone))
  737. {
  738. error("standalone flag must be 'yes' or 'no'");
  739. }
  740. }
  741. skipWhitespace();
  742. require("?>");
  743. if (inputEncoding == null)
  744. {
  745. inputEncoding = encodingName;
  746. }
  747. return encodingName;
  748. }
  749. /**
  750. * Parse a text declaration.
  751. * <pre>
  752. * [79] TextDecl ::= '&lt;?xml' VersionInfo? EncodingDecl S? '?&gt;'
  753. * [80] EncodingDecl ::= S 'encoding' Eq
  754. * ( '"' EncName '"' | "'" EncName "'" )
  755. * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
  756. * </pre>
  757. * <p> (The <code>&lt;?xml</code>' and whitespace have already been read.)
  758. * @return the encoding in the declaration, uppercased; or null
  759. * @see #parseXMLDecl
  760. * @see #setupDecoding
  761. */
  762. private String parseTextDecl(boolean ignoreEncoding)
  763. throws SAXException, IOException
  764. {
  765. String encodingName = null;
  766. int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
  767. // Read an optional version.
  768. if (tryRead ("version"))
  769. {
  770. String version;
  771. parseEq();
  772. checkLegalVersion(version = readLiteral(flags));
  773. if (version.equals("1.1"))
  774. {
  775. if (xmlVersion == XML_10)
  776. {
  777. error("external subset has later version number.", "1.0",
  778. version);
  779. }
  780. handler.warn("expected XML version 1.0, not: " + version);
  781. xmlVersion = XML_11;
  782. }
  783. else if (!version.equals("1.0"))
  784. {
  785. error("illegal XML version", version, "1.0 or 1.1");
  786. }
  787. requireWhitespace();
  788. }
  789. // Read the encoding.
  790. require("encoding");
  791. parseEq();
  792. encodingName = readLiteral(flags);
  793. if (!ignoreEncoding)
  794. {
  795. setupDecoding(encodingName);
  796. }
  797. skipWhitespace();
  798. require("?>");
  799. return encodingName;
  800. }
  801. /**
  802. * Sets up internal state so that we can decode an entity using the
  803. * specified encoding. This is used when we start to read an entity
  804. * and we have been given knowledge of its encoding before we start to
  805. * read any data (e.g. from a SAX input source or from a MIME type).
  806. *
  807. * <p> It is also used after autodetection, at which point only very
  808. * limited adjustments to the encoding may be used (switching between
  809. * related builtin decoders).
  810. *
  811. * @param encodingName The name of the encoding specified by the user.
  812. * @exception IOException if the encoding isn't supported either
  813. * internally to this parser, or by the hosting JVM.
  814. * @see #parseXMLDecl
  815. * @see #parseTextDecl
  816. */
  817. private void setupDecoding(String encodingName)
  818. throws SAXException, IOException
  819. {
  820. encodingName = encodingName.toUpperCase();
  821. // ENCODING_EXTERNAL indicates an encoding that wasn't
  822. // autodetected ... we can use builtin decoders, or
  823. // ones from the JVM (InputStreamReader).
  824. // Otherwise we can only tweak what was autodetected, and
  825. // only for single byte (ASCII derived) builtin encodings.
  826. // ASCII-derived encodings
  827. if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL)
  828. {
  829. if (encodingName.equals("ISO-8859-1")
  830. || encodingName.equals("8859_1")
  831. || encodingName.equals("ISO8859_1"))
  832. {
  833. encoding = ENCODING_ISO_8859_1;
  834. return;
  835. }
  836. else if (encodingName.equals("US-ASCII")
  837. || encodingName.equals("ASCII"))
  838. {
  839. encoding = ENCODING_ASCII;
  840. return;
  841. }
  842. else if (encodingName.equals("UTF-8")
  843. || encodingName.equals("UTF8"))
  844. {
  845. encoding = ENCODING_UTF_8;
  846. return;
  847. }
  848. else if (encoding != ENCODING_EXTERNAL)
  849. {
  850. // used to start with a new reader ...
  851. throw new UnsupportedEncodingException(encodingName);
  852. }
  853. // else fallthrough ...
  854. // it's ASCII-ish and something other than a builtin
  855. }
  856. // Unicode and such
  857. if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21)
  858. {
  859. if (!(encodingName.equals("ISO-10646-UCS-2")
  860. || encodingName.equals("UTF-16")
  861. || encodingName.equals("UTF-16BE")
  862. || encodingName.equals("UTF-16LE")))
  863. {
  864. error("unsupported Unicode encoding", encodingName, "UTF-16");
  865. }
  866. return;
  867. }
  868. // four byte encodings
  869. if (encoding == ENCODING_UCS_4_1234
  870. || encoding == ENCODING_UCS_4_4321
  871. || encoding == ENCODING_UCS_4_2143
  872. || encoding == ENCODING_UCS_4_3412)
  873. {
  874. // Strictly: "UCS-4" == "UTF-32BE"; also, "UTF-32LE" exists
  875. if (!encodingName.equals("ISO-10646-UCS-4"))
  876. {
  877. error("unsupported 32-bit encoding", encodingName,
  878. "ISO-10646-UCS-4");
  879. }
  880. return;
  881. }
  882. // assert encoding == ENCODING_EXTERNAL
  883. // if (encoding != ENCODING_EXTERNAL)
  884. // throw new RuntimeException ("encoding = " + encoding);
  885. if (encodingName.equals("UTF-16BE"))
  886. {
  887. encoding = ENCODING_UCS_2_12;
  888. return;
  889. }
  890. if (encodingName.equals("UTF-16LE"))
  891. {
  892. encoding = ENCODING_UCS_2_21;
  893. return;
  894. }
  895. // We couldn't use the builtin decoders at all. But we can try to
  896. // create a reader, since we haven't messed up buffering. Tweak
  897. // the encoding name if necessary.
  898. if (encodingName.equals("UTF-16")
  899. || encodingName.equals("ISO-10646-UCS-2"))
  900. {
  901. encodingName = "Unicode";
  902. }
  903. // Ignoring all the EBCDIC aliases here
  904. reader = new InputStreamReader(is, encodingName);
  905. sourceType = INPUT_READER;
  906. }
  907. /**
  908. * Parse miscellaneous markup outside the document element and DOCTYPE
  909. * declaration.
  910. * <pre>
  911. * [27] Misc ::= Comment | PI | S
  912. * </pre>
  913. */
  914. private void parseMisc()
  915. throws Exception
  916. {
  917. while (true)
  918. {
  919. skipWhitespace();
  920. if (tryRead(startDelimPI))
  921. {
  922. parsePI();
  923. }
  924. else if (tryRead(startDelimComment))
  925. {
  926. parseComment();
  927. }
  928. else
  929. {
  930. return;
  931. }
  932. }
  933. }
  934. /**
  935. * Parse a document type declaration.
  936. * <pre>
  937. * [28] doctypedecl ::= '&lt;!DOCTYPE' S Name (S ExternalID)? S?
  938. * ('[' (markupdecl | PEReference | S)* ']' S?)? '&gt;'
  939. * </pre>
  940. * <p> (The <code>&lt;!DOCTYPE</code> has already been read.)
  941. */
  942. private void parseDoctypedecl()
  943. throws Exception
  944. {
  945. String rootName;
  946. ExternalIdentifiers ids;
  947. // Read the document type name.
  948. requireWhitespace();
  949. rootName = readNmtoken(true);
  950. // Read the External subset's IDs
  951. skipWhitespace();
  952. ids = readExternalIds(false, true);
  953. // report (a) declaration of name, (b) lexical info (ids)
  954. handler.doctypeDecl(rootName, ids.publicId, ids.systemId);
  955. // Internal subset is parsed first, if present
  956. skipWhitespace();
  957. if (tryRead('['))
  958. {
  959. // loop until the subset ends
  960. while (true)
  961. {
  962. doReport = expandPE = true;
  963. skipWhitespace();
  964. doReport = expandPE = false;
  965. if (tryRead(']'))
  966. {
  967. break; // end of subset
  968. }
  969. else
  970. {
  971. // WFC, PEs in internal subset (only between decls)
  972. peIsError = expandPE = true;
  973. parseMarkupdecl();
  974. peIsError = expandPE = false;
  975. }
  976. }
  977. }
  978. skipWhitespace();
  979. require('>');
  980. // Read the external subset, if any
  981. InputSource subset;
  982. if (ids.systemId == null)
  983. {
  984. subset = handler.getExternalSubset(rootName,
  985. handler.getSystemId());
  986. }
  987. else
  988. {
  989. subset = null;
  990. }
  991. if (ids.systemId != null || subset != null)
  992. {
  993. pushString(null, ">");
  994. // NOTE: [dtd] is so we say what SAX2 expects,
  995. // though it's misleading (subset, not entire dtd)
  996. if (ids.systemId != null)
  997. {
  998. pushURL(true, "[dtd]", ids, null, null, null, true);
  999. }
  1000. else
  1001. {
  1002. handler.warn("modifying document by adding external subset");
  1003. pushURL(true, "[dtd]",
  1004. new ExternalIdentifiers(subset.getPublicId(),
  1005. subset.getSystemId(),
  1006. null),
  1007. subset.getCharacterStream(),
  1008. subset.getByteStream(),
  1009. subset.getEncoding(),
  1010. false);
  1011. }
  1012. // Loop until we end up back at '>'
  1013. while (true)
  1014. {
  1015. doReport = expandPE = true;
  1016. skipWhitespace();
  1017. doReport = expandPE = false;
  1018. if (tryRead('>'))
  1019. {
  1020. break;
  1021. }
  1022. else
  1023. {
  1024. expandPE = true;
  1025. parseMarkupdecl();
  1026. expandPE = false;
  1027. }
  1028. }
  1029. // the ">" string isn't popped yet
  1030. if (inputStack.size() != 1)
  1031. {
  1032. error("external subset has unmatched '>'");
  1033. }
  1034. }
  1035. // done dtd
  1036. handler.endDoctype();
  1037. expandPE = false;
  1038. doReport = true;
  1039. }
  1040. /**
  1041. * Parse a markup declaration in the internal or external DTD subset.
  1042. * <pre>
  1043. * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl
  1044. * | NotationDecl | PI | Comment
  1045. * [30] extSubsetDecl ::= (markupdecl | conditionalSect
  1046. * | PEReference | S) *
  1047. * </pre>
  1048. * <p> Reading toplevel PE references is handled as a lexical issue
  1049. * by the caller, as is whitespace.
  1050. */
  1051. private void parseMarkupdecl()
  1052. throws Exception
  1053. {
  1054. char[] saved = null;
  1055. boolean savedPE = expandPE;
  1056. // prevent "<%foo;" and ensures saved entity is right
  1057. require('<');
  1058. unread('<');
  1059. expandPE = false;
  1060. if (tryRead("<!ELEMENT"))
  1061. {
  1062. saved = readBuffer;
  1063. expandPE = savedPE;
  1064. parseElementDecl();
  1065. }
  1066. else if (tryRead("<!ATTLIST"))
  1067. {
  1068. saved = readBuffer;
  1069. expandPE = savedPE;
  1070. parseAttlistDecl();
  1071. }
  1072. else if (tryRead("<!ENTITY"))
  1073. {
  1074. saved = readBuffer;
  1075. expandPE = savedPE;
  1076. parseEntityDecl();
  1077. }
  1078. else if (tryRead("<!NOTATION"))
  1079. {
  1080. saved = readBuffer;
  1081. expandPE = savedPE;
  1082. parseNotationDecl();
  1083. }
  1084. else if (tryRead(startDelimPI))
  1085. {
  1086. saved = readBuffer;
  1087. expandPE = savedPE;
  1088. parsePI();
  1089. }
  1090. else if (tryRead(startDelimComment))
  1091. {
  1092. saved = readBuffer;
  1093. expandPE = savedPE;
  1094. parseComment();
  1095. }
  1096. else if (tryRead("<!["))
  1097. {
  1098. saved = readBuffer;
  1099. expandPE = savedPE;
  1100. if (inputStack.size() > 0)
  1101. {
  1102. parseConditionalSect(saved);
  1103. }
  1104. else
  1105. {
  1106. error("conditional sections illegal in internal subset");
  1107. }
  1108. }
  1109. else
  1110. {
  1111. error("expected markup declaration");
  1112. }
  1113. // VC: Proper Decl/PE Nesting
  1114. if (readBuffer != saved)
  1115. {
  1116. handler.verror("Illegal Declaration/PE nesting");
  1117. }
  1118. }
  1119. /**
  1120. * Parse an element, with its tags.
  1121. * <pre>
  1122. * [39] element ::= EmptyElementTag | STag content ETag
  1123. * [40] STag ::= '&lt;' Name (S Attribute)* S? '&gt;'
  1124. * [44] EmptyElementTag ::= '&lt;' Name (S Attribute)* S? '/&gt;'
  1125. * </pre>
  1126. * <p> (The '&lt;' has already been read.)
  1127. * <p>NOTE: this method actually chains onto parseContent (), if necessary,
  1128. * and parseContent () will take care of calling parseETag ().
  1129. */
  1130. private void parseElement(boolean maybeGetSubset)
  1131. throws Exception
  1132. {
  1133. String gi;
  1134. char c;
  1135. int oldElementContent = currentElementContent;
  1136. String oldElement = currentElement;
  1137. ElementDecl element;
  1138. // This is the (global) counter for the
  1139. // array of specified attributes.
  1140. tagAttributePos = 0;
  1141. // Read the element type name.
  1142. gi = readNmtoken(true);
  1143. // If we saw no DTD, and this is the document root element,
  1144. // let the application modify the input stream by providing one.
  1145. if (maybeGetSubset)
  1146. {
  1147. InputSource subset = handler.getExternalSubset(gi,
  1148. handler.getSystemId());
  1149. if (subset != null)
  1150. {
  1151. String publicId = subset.getPublicId();
  1152. String systemId = subset.getSystemId();
  1153. handler.warn("modifying document by adding DTD");
  1154. handler.doctypeDecl(gi, publicId, systemId);
  1155. pushString(null, ">");
  1156. // NOTE: [dtd] is so we say what SAX2 expects,
  1157. // though it's misleading (subset, not entire dtd)
  1158. pushURL(true, "[dtd]",
  1159. new ExternalIdentifiers(publicId, systemId, null),
  1160. subset.getCharacterStream(),
  1161. subset.getByteStream(),
  1162. subset.getEncoding(),
  1163. false);
  1164. // Loop until we end up back at '>'
  1165. while (true)
  1166. {
  1167. doReport = expandPE = true;
  1168. skipWhitespace();
  1169. doReport = expandPE = false;
  1170. if (tryRead('>'))
  1171. {
  1172. break;
  1173. }
  1174. else
  1175. {
  1176. expandPE = true;
  1177. parseMarkupdecl();
  1178. expandPE = false;
  1179. }
  1180. }
  1181. // the ">" string isn't popped yet
  1182. if (inputStack.size() != 1)
  1183. {
  1184. error("external subset has unmatched '>'");
  1185. }
  1186. handler.endDoctype();
  1187. }
  1188. }
  1189. // Determine the current content type.
  1190. currentElement = gi;
  1191. element = (ElementDecl) elementInfo.get(gi);
  1192. currentElementContent = getContentType(element, CONTENT_ANY);
  1193. // Read the attributes, if any.
  1194. // After this loop, "c" is the closing delimiter.
  1195. boolean white = tryWhitespace();
  1196. c = readCh();
  1197. while (c != '/' && c != '>')
  1198. {
  1199. unread(c);
  1200. if (!white)
  1201. {
  1202. error("need whitespace between attributes");
  1203. }
  1204. parseAttribute(gi);
  1205. white = tryWhitespace();
  1206. c = readCh();
  1207. }
  1208. // Supply any defaulted attributes.
  1209. Iterator atts = declaredAttributes(element);
  1210. if (atts != null)
  1211. {
  1212. String aname;
  1213. loop:
  1214. while (atts.hasNext())
  1215. {
  1216. aname = (String) atts.next();
  1217. // See if it was specified.
  1218. for (int i = 0; i < tagAttributePos; i++)
  1219. {
  1220. if (tagAttributes[i] == aname)
  1221. {
  1222. continue loop;
  1223. }
  1224. }
  1225. // ... or has a default
  1226. String value = getAttributeDefaultValue(gi, aname);
  1227. if (value == null)
  1228. {
  1229. continue;
  1230. }
  1231. handler.attribute(aname, value, false);
  1232. }
  1233. }
  1234. // Figure out if this is a start tag
  1235. // or an empty element, and dispatch an
  1236. // event accordingly.
  1237. switch (c)
  1238. {
  1239. case '>':
  1240. handler.startElement(gi);
  1241. parseContent();
  1242. break;
  1243. case '/':
  1244. require('>');
  1245. handler.startElement(gi);
  1246. handler.endElement(gi);
  1247. break;
  1248. }
  1249. // Restore the previous state.
  1250. currentElement = oldElement;
  1251. currentElementContent = oldElementContent;
  1252. }
  1253. /**
  1254. * Parse an attribute assignment.
  1255. * <pre>
  1256. * [41] Attribute ::= Name Eq AttValue
  1257. * </pre>
  1258. * @param name The name of the attribute's element.
  1259. * @see SAXDriver#attribute
  1260. */
  1261. private void parseAttribute(String name)
  1262. throws Exception
  1263. {
  1264. String aname;
  1265. String type;
  1266. String value;
  1267. int flags = LIT_ATTRIBUTE | LIT_ENTITY_REF;
  1268. // Read the attribute name.
  1269. aname = readNmtoken(true);
  1270. type = getAttributeType(name, aname);
  1271. // Parse '='
  1272. parseEq();
  1273. // Read the value, normalizing whitespace
  1274. // unless it is CDATA.
  1275. if (handler.stringInterning)
  1276. {
  1277. if (type == "CDATA" || type == null)
  1278. {
  1279. value = readLiteral(flags);
  1280. }
  1281. else
  1282. {
  1283. value = readLiteral(flags | LIT_NORMALIZE);
  1284. }
  1285. }
  1286. else
  1287. {
  1288. if (type == null || type.equals("CDATA"))
  1289. {
  1290. value = readLiteral(flags);
  1291. }
  1292. else
  1293. {
  1294. value = readLiteral(flags | LIT_NORMALIZE);
  1295. }
  1296. }
  1297. // WFC: no duplicate attributes
  1298. for (int i = 0; i < tagAttributePos; i++)
  1299. {
  1300. if (aname.equals(tagAttributes [i]))
  1301. {
  1302. error("duplicate attribute", aname, null);
  1303. }
  1304. }
  1305. // Inform the handler about the
  1306. // attribute.
  1307. handler.attribute(aname, value, true);
  1308. dataBufferPos = 0;
  1309. // Note that the attribute has been
  1310. // specified.
  1311. if (tagAttributePos == tagAttributes.length)
  1312. {
  1313. String newAttrib[] = new String[tagAttributes.length * 2];
  1314. System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos);
  1315. tagAttributes = newAttrib;
  1316. }
  1317. tagAttributes[tagAttributePos++] = aname;
  1318. }
  1319. /**
  1320. * Parse an equals sign surrounded by optional whitespace.
  1321. * <pre>
  1322. * [25] Eq ::= S? '=' S?
  1323. * </pre>
  1324. */
  1325. private void parseEq()
  1326. throws SAXException, IOException
  1327. {
  1328. skipWhitespace();
  1329. require('=');
  1330. skipWhitespace();
  1331. }
  1332. /**
  1333. * Parse an end tag.
  1334. * <pre>
  1335. * [42] ETag ::= '</' Name S? '>'
  1336. * </pre>
  1337. * <p>NOTE: parseContent () chains to here, we already read the
  1338. * "&lt;/".
  1339. */
  1340. private void parseETag()
  1341. throws Exception
  1342. {
  1343. require(currentElement);
  1344. skipWhitespace();
  1345. require('>');
  1346. handler.endElement(currentElement);
  1347. // not re-reporting any SAXException re bogus end tags,
  1348. // even though that diagnostic might be clearer ...
  1349. }
  1350. /**
  1351. * Parse the content of an element.
  1352. * <pre>
  1353. * [43] content ::= (element | CharData | Reference
  1354. * | CDSect | PI | Comment)*
  1355. * [67] Reference ::= EntityRef | CharRef
  1356. * </pre>
  1357. * <p> NOTE: consumes ETtag.
  1358. */
  1359. private void parseContent()
  1360. throws Exception
  1361. {
  1362. char c;
  1363. while (true)
  1364. {
  1365. // consume characters (or ignorable whitspace) until delimiter
  1366. parseCharData();
  1367. // Handle delimiters
  1368. c = readCh();
  1369. switch (c)
  1370. {
  1371. case '&': // Found "&"
  1372. c = readCh();
  1373. if (c == '#')
  1374. {
  1375. parseCharRef();
  1376. }
  1377. else
  1378. {
  1379. unread(c);
  1380. parseEntityRef(true);
  1381. }
  1382. isDirtyCurrentElement = true;
  1383. break;
  1384. case '<': // Found "<"
  1385. dataBufferFlush();
  1386. c = readCh();
  1387. switch (c)
  1388. {
  1389. case '!': // Found "<!"
  1390. c = readCh();
  1391. switch (c)
  1392. {
  1393. case '-': // Found "<!-"
  1394. require('-');
  1395. isDirtyCurrentElement = false;
  1396. parseComment();
  1397. break;
  1398. case '[': // Found "<!["
  1399. isDirtyCurrentElement = false;
  1400. require("CDATA[");
  1401. handler.startCDATA();
  1402. inCDATA = true;
  1403. parseCDSect();
  1404. inCDATA = false;
  1405. handler.endCDATA();
  1406. break;
  1407. default:
  1408. error("expected comment or CDATA section", c, null);
  1409. break;
  1410. }
  1411. break;
  1412. case '?': // Found "<?"
  1413. isDirtyCurrentElement = false;
  1414. parsePI();
  1415. break;
  1416. case '/': // Found "</"
  1417. isDirtyCurrentElement = false;
  1418. parseETag();
  1419. return;
  1420. default: // Found "<" followed by something else
  1421. isDirtyCurrentElement = false;
  1422. unread(c);
  1423. parseElement(false);
  1424. break;
  1425. }
  1426. }
  1427. }
  1428. }
  1429. /**
  1430. * Parse an element type declaration.
  1431. * <pre>
  1432. * [45] elementdecl ::= '&lt;!ELEMENT' S Name S contentspec S? '&gt;'
  1433. * </pre>
  1434. * <p> NOTE: the '&lt;!ELEMENT' has already been read.
  1435. */
  1436. private void parseElementDecl()
  1437. throws Exception
  1438. {
  1439. String name;
  1440. requireWhitespace();
  1441. // Read the element type name.
  1442. name = readNmtoken(true);
  1443. requireWhitespace();
  1444. // Read the content model.
  1445. parseContentspec(name);
  1446. skipWhitespace();
  1447. require('>');
  1448. }
  1449. /**
  1450. * Content specification.
  1451. * <pre>
  1452. * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
  1453. * </pre>
  1454. */
  1455. private void parseContentspec(String name)
  1456. throws Exception
  1457. {
  1458. // FIXME: move elementDecl() into setElement(), pass EMTPY/ANY ...
  1459. if (tryRead("EMPTY"))
  1460. {
  1461. setElement(name, CONTENT_EMPTY, null, null);
  1462. if (!skippedPE)
  1463. {
  1464. handler.getDeclHandler().elementDecl(name, "EMPTY");
  1465. }
  1466. return;
  1467. }
  1468. else if (tryRead("ANY"))
  1469. {
  1470. setElement(name, CONTENT_ANY, null, null);
  1471. if (!skippedPE)
  1472. {
  1473. handler.getDeclHandler().elementDecl(name, "ANY");
  1474. }
  1475. return;
  1476. }
  1477. else
  1478. {
  1479. String model;
  1480. char[] saved;
  1481. require('(');
  1482. saved = readBuffer;
  1483. dataBufferAppend('(');
  1484. skipWhitespace();
  1485. if (tryRead("#PCDATA"))
  1486. {
  1487. dataBufferAppend("#PCDATA");
  1488. parseMixed(saved);
  1489. model = dataBufferToString();
  1490. setElement(name, CONTENT_MIXED, model, null);
  1491. }
  1492. else
  1493. {
  1494. parseElements(saved);
  1495. model = dataBufferToString();
  1496. setElement(name, CONTENT_ELEMENTS, model, null);
  1497. }
  1498. if (!skippedPE)
  1499. {
  1500. handler.getDeclHandler().elementDecl(name, model);
  1501. }
  1502. }
  1503. }
  1504. /**
  1505. * Parse an element-content model.
  1506. * <pre>
  1507. * [47] elements ::= (choice | seq) ('?' | '*' | '+')?
  1508. * [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')'
  1509. * [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')'
  1510. * </pre>
  1511. *
  1512. * <p> NOTE: the opening '(' and S have already been read.
  1513. *
  1514. * @param saved Buffer for entity that should have the terminal ')'
  1515. */
  1516. private void parseElements(char[] saved)
  1517. throws Exception
  1518. {
  1519. char c;
  1520. char sep;
  1521. // Parse the first content particle
  1522. skipWhitespace();
  1523. parseCp();
  1524. // Check for end or for a separator.
  1525. skipWhitespace();
  1526. c = readCh();
  1527. switch (c)
  1528. {
  1529. case ')':
  1530. // VC: Proper Group/PE Nesting
  1531. if (readBuffer != saved)
  1532. {
  1533. handler.verror("Illegal Group/PE nesting");
  1534. }
  1535. dataBufferAppend(')');
  1536. c = readCh();
  1537. switch (c)
  1538. {
  1539. case '*':
  1540. case '+':
  1541. case '?':
  1542. dataBufferAppend(c);
  1543. break;
  1544. default:
  1545. unread(c);
  1546. }
  1547. return;
  1548. case ',': // Register the separator.
  1549. case '|':
  1550. sep = c;
  1551. dataBufferAppend(c);
  1552. break;
  1553. default:
  1554. error("bad separator in content model", c, null);
  1555. return;
  1556. }
  1557. // Parse the rest of the content model.
  1558. while (true)
  1559. {
  1560. skipWhitespace();
  1561. parseCp();
  1562. skipWhitespace();
  1563. c = readCh();
  1564. if (c == ')')
  1565. {
  1566. // VC: Proper Group/PE Nesting
  1567. if (readBuffer != saved)
  1568. {
  1569. handler.verror("Illegal Group/PE nesting");
  1570. }
  1571. dataBufferAppend(')');
  1572. break;
  1573. }
  1574. else if (c != sep)
  1575. {
  1576. error("bad separator in content model", c, null);
  1577. return;
  1578. }
  1579. else
  1580. {
  1581. dataBufferAppend(c);
  1582. }
  1583. }
  1584. // Check for the occurrence indicator.
  1585. c = readCh();
  1586. switch (c)
  1587. {
  1588. case '?':
  1589. case '*':
  1590. case '+':
  1591. dataBufferAppend(c);
  1592. return;
  1593. default:
  1594. unread(c);
  1595. return;
  1596. }
  1597. }
  1598. /**
  1599. * Parse a content particle.
  1600. * <pre>
  1601. * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')?
  1602. * </pre>
  1603. */
  1604. private void parseCp()
  1605. throws Exception
  1606. {
  1607. if (tryRead('('))
  1608. {
  1609. dataBufferAppend('(');
  1610. parseElements(readBuffer);
  1611. }
  1612. else
  1613. {
  1614. dataBufferAppend(readNmtoken(true));
  1615. char c = readCh();
  1616. switch (c)
  1617. {
  1618. case '?':
  1619. case '*':
  1620. case '+':
  1621. dataBufferAppend(c);
  1622. break;
  1623. default:
  1624. unread(c);
  1625. break;
  1626. }
  1627. }
  1628. }
  1629. /**
  1630. * Parse mixed content.
  1631. * <pre>
  1632. * [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*'
  1633. * | '(' S? ('#PCDATA') S? ')'
  1634. * </pre>
  1635. *
  1636. * @param saved Buffer for entity that should have the terminal ')'
  1637. */
  1638. private void parseMixed(char[] saved)
  1639. throws Exception
  1640. {
  1641. // Check for PCDATA alone.
  1642. skipWhitespace();
  1643. if (tryRead(')'))
  1644. {
  1645. // VC: Proper Group/PE Nesting
  1646. if (readBuffer != saved)
  1647. {
  1648. handler.verror("Illegal Group/PE nesting");
  1649. }
  1650. dataBufferAppend(")*");
  1651. tryRead('*');
  1652. return;
  1653. }
  1654. // Parse mixed content.
  1655. skipWhitespace();
  1656. while (!tryRead(")"))
  1657. {
  1658. require('|');
  1659. dataBufferAppend('|');
  1660. skipWhitespace();
  1661. dataBufferAppend(readNmtoken(true));
  1662. skipWhitespace();
  1663. }
  1664. // VC: Proper Group/PE Nesting
  1665. if (readBuffer != saved)
  1666. {
  1667. handler.verror("Illegal Group/PE nesting");
  1668. }
  1669. require('*');
  1670. dataBufferAppend(")*");
  1671. }
  1672. /**
  1673. * Parse an attribute list declaration.
  1674. * <pre>
  1675. * [52] AttlistDecl ::= '&lt;!ATTLIST' S Name AttDef* S? '&gt;'
  1676. * </pre>
  1677. * <p>NOTE: the '&lt;!ATTLIST' has already been read.
  1678. */
  1679. private void parseAttlistDecl()
  1680. throws Exception
  1681. {
  1682. String elementName;
  1683. requireWhitespace();
  1684. elementName = readNmtoken(true);
  1685. boolean white = tryWhitespace();
  1686. while (!tryRead('>'))
  1687. {
  1688. if (!white)
  1689. {
  1690. error("whitespace required before attribute definition");
  1691. }
  1692. parseAttDef(elementName);
  1693. white = tryWhitespace();
  1694. }
  1695. }
  1696. /**
  1697. * Parse a single attribute definition.
  1698. * <pre>
  1699. * [53] AttDef ::= S Name S AttType S DefaultDecl
  1700. * </pre>
  1701. */
  1702. private void parseAttDef(String elementName)
  1703. throws Exception
  1704. {
  1705. String name;
  1706. String type;
  1707. String enumer = null;
  1708. // Read the attribute name.
  1709. name = readNmtoken(true);
  1710. // Read the attribute type.
  1711. requireWhitespace();
  1712. type = readAttType();
  1713. // Get the string of enumerated values if necessary.
  1714. if (handler.stringInterning)
  1715. {
  1716. if ("ENUMERATION" == type || "NOTATION" == type)
  1717. {
  1718. enumer = dataBufferToString();
  1719. }
  1720. }
  1721. else
  1722. {
  1723. if ("ENUMERATION".equals(type) || "NOTATION".equals(type))
  1724. {
  1725. enumer = dataBufferToString();
  1726. }
  1727. }
  1728. // Read the default value.
  1729. requireWhitespace();
  1730. parseDefault(elementName, name, type, enumer);
  1731. }
  1732. /**
  1733. * Parse the attribute type.
  1734. * <pre>
  1735. * [54] AttType ::= StringType | TokenizedType | EnumeratedType
  1736. * [55] StringType ::= 'CDATA'
  1737. * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY'
  1738. * | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS'
  1739. * [57] EnumeratedType ::= NotationType | Enumeration
  1740. * </pre>
  1741. */
  1742. private String readAttType()
  1743. throws Exception
  1744. {
  1745. if (tryRead('('))
  1746. {
  1747. parseEnumeration(false);
  1748. return "ENUMERATION";
  1749. }
  1750. else
  1751. {
  1752. String typeString = readNmtoken(true);
  1753. if (handler.stringInterning)
  1754. {
  1755. if ("NOTATION" == typeString)
  1756. {
  1757. parseNotationType();
  1758. return typeString;
  1759. }
  1760. else if ("CDATA" == typeString
  1761. || "ID" == typeString
  1762. || "IDREF" == typeString
  1763. || "IDREFS" == typeString
  1764. || "ENTITY" == typeString
  1765. || "ENTITIES" == typeString
  1766. || "NMTOKEN" == typeString
  1767. || "NMTOKENS" == typeString)
  1768. {
  1769. return typeString;
  1770. }
  1771. }
  1772. else
  1773. {
  1774. if ("NOTATION".equals(typeString))
  1775. {
  1776. parseNotationType();
  1777. return typeString;
  1778. }
  1779. else if ("CDATA".equals(typeString)
  1780. || "ID".equals(typeString)
  1781. || "IDREF".equals(typeString)
  1782. || "IDREFS".equals(typeString)
  1783. || "ENTITY".equals(typeString)
  1784. || "ENTITIES".equals(typeString)
  1785. || "NMTOKEN".equals(typeString)
  1786. || "NMTOKENS".equals(typeString))
  1787. {
  1788. return typeString;
  1789. }
  1790. }
  1791. error("illegal attribute type", typeString, null);
  1792. return null;
  1793. }
  1794. }
  1795. /**
  1796. * Parse an enumeration.
  1797. * <pre>
  1798. * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
  1799. * </pre>
  1800. * <p>NOTE: the '(' has already been read.
  1801. */
  1802. private void parseEnumeration(boolean isNames)
  1803. throws Exception
  1804. {
  1805. dataBufferAppend('(');
  1806. // Read the first token.
  1807. skipWhitespace();
  1808. dataBufferAppend(readNmtoken(isNames));
  1809. // Read the remaining tokens.
  1810. skipWhitespace();
  1811. while (!tryRead(')'))
  1812. {
  1813. require('|');
  1814. dataBufferAppend('|');
  1815. skipWhitespace();
  1816. dataBufferAppend(readNmtoken (isNames));
  1817. skipWhitespace();
  1818. }
  1819. dataBufferAppend(')');
  1820. }
  1821. /**
  1822. * Parse a notation type for an attribute.
  1823. * <pre>
  1824. * [58] NotationType ::= 'NOTATION' S '(' S? NameNtoks
  1825. * (S? '|' S? name)* S? ')'
  1826. * </pre>
  1827. * <p>NOTE: the 'NOTATION' has already been read
  1828. */
  1829. private void parseNotationType()
  1830. throws Exception
  1831. {
  1832. requireWhitespace();
  1833. require('(');
  1834. parseEnumeration(true);
  1835. }
  1836. /**
  1837. * Parse the default value for an attribute.
  1838. * <pre>
  1839. * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
  1840. * | (('#FIXED' S)? AttValue)
  1841. * </pre>
  1842. */
  1843. private void parseDefault(String elementName, String name,
  1844. String type, String enumer)
  1845. throws Exception
  1846. {
  1847. int valueType = ATTRIBUTE_DEFAULT_SPECIFIED;
  1848. String value = null;
  1849. int flags = LIT_ATTRIBUTE;
  1850. boolean saved = expandPE;
  1851. String defaultType = null;
  1852. // LIT_ATTRIBUTE forces '<' checks now (ASAP) and turns whitespace
  1853. // chars to spaces (doesn't matter when that's done if it doesn't
  1854. // interfere with char refs expanding to whitespace).
  1855. if (!skippedPE)
  1856. {
  1857. flags |= LIT_ENTITY_REF;
  1858. if (handler.stringInterning)
  1859. {
  1860. if ("CDATA" != type)
  1861. {
  1862. flags |= LIT_NORMALIZE;
  1863. }
  1864. }
  1865. else
  1866. {
  1867. if (!"CDATA".equals(type))
  1868. {
  1869. flags |= LIT_NORMALIZE;
  1870. }
  1871. }
  1872. }
  1873. expandPE = false;
  1874. if (tryRead('#'))
  1875. {
  1876. if (tryRead("FIXED"))
  1877. {
  1878. defaultType = "#FIXED";
  1879. valueType = ATTRIBUTE_DEFAULT_FIXED;
  1880. requireWhitespace();
  1881. value = readLiteral(flags);
  1882. }
  1883. else if (tryRead("REQUIRED"))
  1884. {
  1885. defaultType = "#REQUIRED";
  1886. valueType = ATTRIBUTE_DEFAULT_REQUIRED;
  1887. }
  1888. else if (tryRead("IMPLIED"))
  1889. {
  1890. defaultType = "#IMPLIED";
  1891. valueType = ATTRIBUTE_DEFAULT_IMPLIED;
  1892. }
  1893. else
  1894. {
  1895. error("illegal keyword for attribute default value");
  1896. }
  1897. }
  1898. else
  1899. {
  1900. value = readLiteral(flags);
  1901. }
  1902. expandPE = saved;
  1903. setAttribute(elementName, name, type, enumer, value, valueType);
  1904. if (handler.stringInterning)
  1905. {
  1906. if ("ENUMERATION" == type)
  1907. {
  1908. type = enumer;
  1909. }
  1910. else if ("NOTATION" == type)
  1911. {
  1912. type = "NOTATION " + enumer;
  1913. }
  1914. }
  1915. else
  1916. {
  1917. if ("ENUMERATION".equals(type))
  1918. {
  1919. type = enumer;
  1920. }
  1921. else if ("NOTATION".equals(type))
  1922. {
  1923. type = "NOTATION " + enumer;
  1924. }
  1925. }
  1926. if (!skippedPE)
  1927. {
  1928. handler.getDeclHandler().attributeDecl(elementName, name, type,
  1929. defaultType, value);
  1930. }
  1931. }
  1932. /**
  1933. * Parse a conditional section.
  1934. * <pre>
  1935. * [61] conditionalSect ::= includeSect || ignoreSect
  1936. * [62] includeSect ::= '&lt;![' S? 'INCLUDE' S? '['
  1937. * extSubsetDecl ']]&gt;'
  1938. * [63] ignoreSect ::= '&lt;![' S? 'IGNORE' S? '['
  1939. * ignoreSectContents* ']]&gt;'
  1940. * [64] ignoreSectContents ::= Ignore
  1941. * ('&lt;![' ignoreSectContents* ']]&gt;' Ignore )*
  1942. * [65] Ignore ::= Char* - (Char* ( '&lt;![' | ']]&gt;') Char* )
  1943. * </pre>
  1944. * <p> NOTE: the '&gt;![' has already been read.
  1945. */
  1946. private void parseConditionalSect(char[] saved)
  1947. throws Exception
  1948. {
  1949. skipWhitespace();
  1950. if (tryRead("INCLUDE"))
  1951. {
  1952. skipWhitespace();
  1953. require('[');
  1954. // VC: Proper Conditional Section/PE Nesting
  1955. if (readBuffer != saved)
  1956. {
  1957. handler.verror("Illegal Conditional Section/PE nesting");
  1958. }
  1959. skipWhitespace();
  1960. while (!tryRead("]]>"))
  1961. {
  1962. parseMarkupdecl();
  1963. skipWhitespace();
  1964. }
  1965. }
  1966. else if (tryRead("IGNORE"))
  1967. {
  1968. skipWhitespace();
  1969. require('[');
  1970. // VC: Proper Conditional Section/PE Nesting
  1971. if (readBuffer != saved)
  1972. {
  1973. handler.verror("Illegal Conditional Section/PE nesting");
  1974. }
  1975. int nesting = 1;
  1976. char c;
  1977. expandPE = false;
  1978. for (int nest = 1; nest > 0; )
  1979. {
  1980. c = readCh();
  1981. switch (c)
  1982. {
  1983. case '<':
  1984. if (tryRead("!["))
  1985. {
  1986. nest++;
  1987. }
  1988. break;
  1989. case ']':
  1990. if (tryRead("]>"))
  1991. {
  1992. nest--;
  1993. }
  1994. }
  1995. }
  1996. expandPE = true;
  1997. }
  1998. else
  1999. {
  2000. error("conditional section must begin with INCLUDE or IGNORE");
  2001. }
  2002. }
  2003. private void parseCharRef()
  2004. throws SAXException, IOException
  2005. {
  2006. parseCharRef(true /* do flushDataBuffer by default */);
  2007. }
  2008. /**
  2009. * Try to read a character reference without consuming data from buffer.
  2010. * <pre>
  2011. * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
  2012. * </pre>
  2013. * <p>NOTE: the '&#' has already been read.
  2014. */
  2015. private void tryReadCharRef()
  2016. throws SAXException, IOException
  2017. {
  2018. int value = 0;
  2019. char c;
  2020. if (tryRead('x'))
  2021. {
  2022. loop1:
  2023. while (true)
  2024. {
  2025. c = readCh();
  2026. if (c == ';')
  2027. {
  2028. break loop1;
  2029. }
  2030. else
  2031. {
  2032. int n = Character.digit(c, 16);
  2033. if (n == -1)
  2034. {
  2035. error("illegal character in character reference", c, null);
  2036. break loop1;
  2037. }
  2038. value *= 16;
  2039. value += n;
  2040. }
  2041. }
  2042. }
  2043. else
  2044. {
  2045. loop2:
  2046. while (true)
  2047. {
  2048. c = readCh();
  2049. if (c == ';')
  2050. {
  2051. break loop2;
  2052. }
  2053. else
  2054. {
  2055. int n = Character.digit(c, 10);
  2056. if (n == -1)
  2057. {
  2058. error("illegal character in character reference", c, null);
  2059. break loop2;
  2060. }
  2061. value *= 10;
  2062. value += n;
  2063. }
  2064. }
  2065. }
  2066. // check for character refs being legal XML
  2067. if ((value < 0x0020
  2068. && ! (value == '\n' || value == '\t' || value == '\r'))
  2069. || (value >= 0xD800 && value <= 0xDFFF)
  2070. || value == 0xFFFE || value == 0xFFFF
  2071. || value > 0x0010ffff)
  2072. {
  2073. error("illegal XML character reference U+"
  2074. + Integer.toHexString(value));
  2075. }
  2076. // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
  2077. // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
  2078. if (value > 0x0010ffff)
  2079. {
  2080. // too big for surrogate
  2081. error("character reference " + value + " is too large for UTF-16",
  2082. Integer.toString(value), null);
  2083. }
  2084. }
  2085. /**
  2086. * Read and interpret a character reference.
  2087. * <pre>
  2088. * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
  2089. * </pre>
  2090. * <p>NOTE: the '&#' has already been read.
  2091. */
  2092. private void parseCharRef(boolean doFlush)
  2093. throws SAXException, IOException
  2094. {
  2095. int value = 0;
  2096. char c;
  2097. if (tryRead('x'))
  2098. {
  2099. loop1:
  2100. while (true)
  2101. {
  2102. c = readCh();
  2103. if (c == ';')
  2104. {
  2105. break loop1;
  2106. }
  2107. else
  2108. {
  2109. int n = Character.digit(c, 16);
  2110. if (n == -1)
  2111. {
  2112. error("illegal character in character reference", c, null);
  2113. break loop1;
  2114. }
  2115. value *= 16;
  2116. value += n;
  2117. }
  2118. }
  2119. }
  2120. else
  2121. {
  2122. loop2:
  2123. while (true)
  2124. {
  2125. c = readCh();
  2126. if (c == ';')
  2127. {
  2128. break loop2;
  2129. }
  2130. else
  2131. {
  2132. int n = Character.digit(c, 10);
  2133. if (n == -1)
  2134. {
  2135. error("illegal character in character reference", c, null);
  2136. break loop2;
  2137. }
  2138. value *= 10;
  2139. value += c - '0';
  2140. }
  2141. }
  2142. }
  2143. // check for character refs being legal XML
  2144. if ((value < 0x0020
  2145. && ! (value == '\n' || value == '\t' || value == '\r'))
  2146. || (value >= 0xD800 && value <= 0xDFFF)
  2147. || value == 0xFFFE || value == 0xFFFF
  2148. || value > 0x0010ffff)
  2149. {
  2150. error("illegal XML character reference U+"
  2151. + Integer.toHexString(value));
  2152. }
  2153. // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
  2154. // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
  2155. if (value <= 0x0000ffff)
  2156. {
  2157. // no surrogates needed
  2158. dataBufferAppend((char) value);
  2159. }
  2160. else if (value <= 0x0010ffff)
  2161. {
  2162. value -= 0x10000;
  2163. // > 16 bits, surrogate needed
  2164. dataBufferAppend((char) (0xd800 | (value >> 10)));
  2165. dataBufferAppend((char) (0xdc00 | (value & 0x0003ff)));
  2166. }
  2167. else
  2168. {
  2169. // too big for surrogate
  2170. error("character reference " + value + " is too large for UTF-16",
  2171. Integer.toString(value), null);
  2172. }
  2173. if (doFlush)
  2174. {
  2175. dataBufferFlush();
  2176. }
  2177. }
  2178. /**
  2179. * Parse and expand an entity reference.
  2180. * <pre>
  2181. * [68] EntityRef ::= '&' Name ';'
  2182. * </pre>
  2183. * <p>NOTE: the '&amp;' has already been read.
  2184. * @param externalAllowed External entities are allowed here.
  2185. */
  2186. private void parseEntityRef(boolean externalAllowed)
  2187. throws SAXException, IOException
  2188. {
  2189. String name;
  2190. name = readNmtoken(true);
  2191. require(';');
  2192. switch (getEntityType(name))
  2193. {
  2194. case ENTITY_UNDECLARED:
  2195. // NOTE: XML REC describes amazingly convoluted handling for
  2196. // this case. Nothing as meaningful as being a WFness error
  2197. // unless the processor might _legitimately_ not have seen a
  2198. // declaration ... which is what this implements.
  2199. String message;
  2200. message = "reference to undeclared general entity " + name;
  2201. if (skippedPE && !docIsStandalone)
  2202. {
  2203. handler.verror(message);
  2204. // we don't know this entity, and it might be external...
  2205. if (externalAllowed)
  2206. {
  2207. handler.skippedEntity(name);
  2208. }
  2209. }
  2210. else
  2211. {
  2212. error(message);
  2213. }
  2214. break;
  2215. case ENTITY_INTERNAL:
  2216. pushString(name, getEntityValue(name));
  2217. //workaround for possible input pop before marking
  2218. //the buffer reading position
  2219. char t = readCh();
  2220. unread(t);
  2221. int bufferPosMark = readBufferPos;
  2222. int end = readBufferPos + getEntityValue(name).length();
  2223. for (int k = readBufferPos; k < end; k++)
  2224. {
  2225. t = readCh();
  2226. if (t == '&')
  2227. {
  2228. t = readCh();
  2229. if (t == '#')
  2230. {
  2231. //try to match a character ref
  2232. tryReadCharRef();
  2233. //everything has been read
  2234. if (readBufferPos >= end)
  2235. {
  2236. break;
  2237. }
  2238. k = readBufferPos;
  2239. continue;
  2240. }
  2241. else if (Character.isLetter(t))
  2242. {
  2243. //looks like an entity ref
  2244. unread(t);
  2245. readNmtoken(true);
  2246. require(';');
  2247. //everything has been read
  2248. if (readBufferPos >= end)
  2249. {
  2250. break;
  2251. }
  2252. k = readBufferPos;
  2253. continue;
  2254. }
  2255. error(" malformed entity reference");
  2256. }
  2257. }
  2258. readBufferPos = bufferPosMark;
  2259. break;
  2260. case ENTITY_TEXT:
  2261. if (externalAllowed)
  2262. {
  2263. pushURL(false, name, getEntityIds(name),
  2264. null, null, null, true);
  2265. }
  2266. else
  2267. {
  2268. error("reference to external entity in attribute value.",
  2269. name, null);
  2270. }
  2271. break;
  2272. case ENTITY_NDATA:
  2273. if (externalAllowed)
  2274. {
  2275. error("unparsed entity reference in content", name, null);
  2276. }
  2277. else
  2278. {
  2279. error("reference to external entity in attribute value.",
  2280. name, null);
  2281. }
  2282. break;
  2283. default:
  2284. throw new RuntimeException();
  2285. }
  2286. }
  2287. /**
  2288. * Parse and expand a parameter entity reference.
  2289. * <pre>
  2290. * [69] PEReference ::= '%' Name ';'
  2291. * </pre>
  2292. * <p>NOTE: the '%' has already been read.
  2293. */
  2294. private void parsePEReference()
  2295. throws SAXException, IOException
  2296. {
  2297. String name;
  2298. name = "%" + readNmtoken(true);
  2299. require(';');
  2300. switch (getEntityType(name))
  2301. {
  2302. case ENTITY_UNDECLARED:
  2303. // VC: Entity Declared
  2304. handler.verror("reference to undeclared parameter entity " + name);
  2305. // we should disable handling of all subsequent declarations
  2306. // unless this is a standalone document (info discarded)
  2307. break;
  2308. case ENTITY_INTERNAL:
  2309. if (inLiteral)
  2310. {
  2311. pushString(name, getEntityValue(name));
  2312. }
  2313. else
  2314. {
  2315. pushString(name, ' ' + getEntityValue(name) + ' ');
  2316. }
  2317. break;
  2318. case ENTITY_TEXT:
  2319. if (!inLiteral)
  2320. {
  2321. pushString(null, " ");
  2322. }
  2323. pushURL(true, name, getEntityIds(name), null, null, null, true);
  2324. if (!inLiteral)
  2325. {
  2326. pushString(null, " ");
  2327. }
  2328. break;
  2329. }
  2330. }
  2331. /**
  2332. * Parse an entity declaration.
  2333. * <pre>
  2334. * [70] EntityDecl ::= GEDecl | PEDecl
  2335. * [71] GEDecl ::= '&lt;!ENTITY' S Name S EntityDef S? '&gt;'
  2336. * [72] PEDecl ::= '&lt;!ENTITY' S '%' S Name S PEDef S? '&gt;'
  2337. * [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
  2338. * [74] PEDef ::= EntityValue | ExternalID
  2339. * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
  2340. * | 'PUBLIC' S PubidLiteral S SystemLiteral
  2341. * [76] NDataDecl ::= S 'NDATA' S Name
  2342. * </pre>
  2343. * <p>NOTE: the '&lt;!ENTITY' has already been read.
  2344. */
  2345. private void parseEntityDecl()
  2346. throws Exception
  2347. {
  2348. boolean peFlag = false;
  2349. int flags = 0;
  2350. // Check for a parameter entity.
  2351. expandPE = false;
  2352. requireWhitespace();
  2353. if (tryRead('%'))
  2354. {
  2355. peFlag = true;
  2356. requireWhitespace();
  2357. }
  2358. expandPE = true;
  2359. // Read the entity name, and prepend
  2360. // '%' if necessary.
  2361. String name = readNmtoken(true);
  2362. //NE08
  2363. if (name.indexOf(':') >= 0)
  2364. {
  2365. error("Illegal character(':') in entity name ", name, null);
  2366. }
  2367. if (peFlag)
  2368. {
  2369. name = "%" + name;
  2370. }
  2371. // Read the entity value.
  2372. requireWhitespace();
  2373. char c = readCh();
  2374. unread (c);
  2375. if (c == '"' || c == '\'')
  2376. {
  2377. // Internal entity ... replacement text has expanded refs
  2378. // to characters and PEs, but not to general entities
  2379. String value = readLiteral(flags);
  2380. setInternalEntity(name, value);
  2381. }
  2382. else
  2383. {
  2384. // Read the external IDs
  2385. ExternalIdentifiers ids = readExternalIds(false, false);
  2386. // Check for NDATA declaration.
  2387. boolean white = tryWhitespace();
  2388. if (!peFlag && tryRead("NDATA"))
  2389. {
  2390. if (!white)
  2391. {
  2392. error("whitespace required before NDATA");
  2393. }
  2394. requireWhitespace();
  2395. String notationName = readNmtoken(true);
  2396. if (!skippedPE)
  2397. {
  2398. setExternalEntity(name, ENTITY_NDATA, ids, notationName);
  2399. handler.unparsedEntityDecl(name, ids.publicId, ids.systemId,
  2400. ids.baseUri, notationName);
  2401. }
  2402. }
  2403. else if (!skippedPE)
  2404. {
  2405. setExternalEntity(name, ENTITY_TEXT, ids, null);
  2406. handler.getDeclHandler()
  2407. .externalEntityDecl(name, ids.publicId,
  2408. handler.resolveURIs()
  2409. // FIXME: ASSUMES not skipped
  2410. // "false" forces error on bad URI
  2411. ? handler.absolutize(ids.baseUri,
  2412. ids.systemId,
  2413. false)
  2414. : ids.systemId);
  2415. }
  2416. }
  2417. // Finish the declaration.
  2418. skipWhitespace();
  2419. require('>');
  2420. }
  2421. /**
  2422. * Parse a notation declaration.
  2423. * <pre>
  2424. * [82] NotationDecl ::= '&lt;!NOTATION' S Name S
  2425. * (ExternalID | PublicID) S? '&gt;'
  2426. * [83] PublicID ::= 'PUBLIC' S PubidLiteral
  2427. * </pre>
  2428. * <P>NOTE: the '&lt;!NOTATION' has already been read.
  2429. */
  2430. private void parseNotationDecl()
  2431. throws Exception
  2432. {
  2433. String nname;
  2434. ExternalIdentifiers ids;
  2435. requireWhitespace();
  2436. nname = readNmtoken(true);
  2437. //NE08
  2438. if (nname.indexOf(':') >= 0)
  2439. {
  2440. error("Illegal character(':') in notation name ", nname, null);
  2441. }
  2442. requireWhitespace();
  2443. // Read the external identifiers.
  2444. ids = readExternalIds(true, false);
  2445. // Register the notation.
  2446. setNotation(nname, ids);
  2447. skipWhitespace();
  2448. require('>');
  2449. }
  2450. /**
  2451. * Parse character data.
  2452. * <pre>
  2453. * [14] CharData ::= [^&lt;&amp;]* - ([^&lt;&amp;]* ']]&gt;' [^&lt;&amp;]*)
  2454. * </pre>
  * <p>Characters are scanned in place in the read buffer and handed to
  * the handler straight from it: as ignorable whitespace while the
  * current element has element-only content, is not yet marked dirty and
  * nothing but whitespace has been seen, and as character data otherwise.
  * Scanning stops at the first '&amp;' or '&lt;', which is left in the
  * buffer, or at a "]]&gt;" sequence, which is reported through error().
  2455. */
  2456. private void parseCharData()
  2457. throws Exception
  2458. {
  2459. char c;
  // 0 = buffer exhausted, keep scanning; 1 = stopped at '&' or '<';
  // 2 = stopped at "]]>"
  2460. int state = 0;
  2461. boolean pureWhite = false;
  2462. // assert (dataBufferPos == 0);
  2463. // are we expecting pure whitespace? it might be dirty...
  2464. if ((currentElementContent == CONTENT_ELEMENTS) && !isDirtyCurrentElement)
  2465. {
  2466. pureWhite = true;
  2467. }
  2468. // always report right out of readBuffer
  2469. // to minimize (pointless) buffer copies
  2470. while (true)
  2471. {
  // line/column deltas for the stretch of buffer scanned in this pass
  2472. int lineAugment = 0;
  2473. int columnAugment = 0;
  2474. int i;
  2475. loop:
  2476. for (i = readBufferPos; i < readBufferLength; i++)
  2477. {
  2478. switch (c = readBuffer[i])
  2479. {
  2480. case '\n':
  2481. lineAugment++;
  2482. columnAugment = 0;
  2483. // pureWhite unmodified
  2484. break;
  2485. case '\r': // should not happen!!
  2486. case '\t':
  2487. case ' ':
  2488. // pureWhite unmodified
  2489. columnAugment++;
  2490. break;
  2491. case '&':
  2492. case '<':
  2493. columnAugment++;
  2494. // pureWhite unmodified
  2495. // CLEAN end of text sequence
  2496. state = 1;
  2497. break loop;
  2498. case ']':
  2499. // that's not a whitespace char, and
  2500. // can not terminate pure whitespace either
  2501. pureWhite = false;
  // "]]>" is only detected when all three characters are already
  // in the buffer
  2502. if ((i + 2) < readBufferLength)
  2503. {
  2504. if (readBuffer [i + 1] == ']'
  2505. && readBuffer [i + 2] == '>')
  2506. {
  2507. // ERROR end of text sequence
  2508. state = 2;
  2509. break loop;
  2510. }
  2511. }
  2512. else
  2513. {
  2514. // FIXME missing two end-of-buffer cases
  2515. }
  2516. columnAugment++;
  2517. break;
  2518. default:
  // reject characters below U+0020 or above U+FFFD, and for
  // XML 1.1 also U+007F..U+009F other than U+0085
  2519. if ((c < 0x0020 || c > 0xFFFD)
  2520. || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085)
  2521. && xmlVersion == XML_11))
  2522. {
  2523. error("illegal XML character U+"
  2524. + Integer.toHexString(c));
  2525. }
  2526. // that's not a whitespace char
  2527. pureWhite = false;
  2528. columnAugment++;
  2529. }
  2530. }
  2531. // report text thus far
  2532. if (lineAugment > 0)
  2533. {
  2534. line += lineAugment;
  2535. column = columnAugment;
  2536. }
  2537. else
  2538. {
  2539. column += columnAugment;
  2540. }
  2541. // report characters/whitespace
  2542. int length = i - readBufferPos;
  2543. if (length != 0)
  2544. {
  2545. if (pureWhite)
  2546. {
  2547. handler.ignorableWhitespace(readBuffer,
  2548. readBufferPos, length);
  2549. }
  2550. else
  2551. {
  2552. handler.charData(readBuffer, readBufferPos, length);
  2553. }
  // consume what was reported; the terminating character stays unread
  2554. readBufferPos = i;
  2555. }
  2556. if (state != 0)
  2557. {
  2558. break;
  2559. }
  2560. // fill next buffer from this entity, or
  2561. // pop stack and continue with previous entity
  2562. unread(readCh());
  2563. }
  2564. if (!pureWhite)
  2565. {
  2566. isDirtyCurrentElement = true;
  2567. }
  2568. // finish, maybe with error
  2569. if (state != 1) // scan stopped on "]]>" rather than on '&' or '<'
  2570. {
  2571. error("character data may not contain ']]>'");
  2572. }
  2573. }
  2574. //////////////////////////////////////////////////////////////////////
  2575. // High-level reading and scanning methods.
  2576. //////////////////////////////////////////////////////////////////////
  2577. /**
  2578. * Require whitespace characters.
  2579. */
  2580. private void requireWhitespace()
  2581. throws SAXException, IOException
  2582. {
  2583. char c = readCh();
  2584. if (isWhitespace(c))
  2585. {
  2586. skipWhitespace();
  2587. }
  2588. else
  2589. {
  2590. error("whitespace required", c, null);
  2591. }
  2592. }
  2593. /**
  2594. * Skip whitespace characters.
  2595. * <pre>
  2596. * [3] S ::= (#x20 | #x9 | #xd | #xa)+
  2597. * </pre>
  2598. */
  2599. private void skipWhitespace()
  2600. throws SAXException, IOException
  2601. {
  2602. // Start with a little cheat. Most of
  2603. // the time, the white space will fall
  2604. // within the current read buffer; if
  2605. // not, then fall through.
  2606. if (USE_CHEATS)
  2607. {
  2608. int lineAugment = 0;
  2609. int columnAugment = 0;
  2610. loop:
  2611. for (int i = readBufferPos; i < readBufferLength; i++)
  2612. {
  2613. switch (readBuffer[i])
  2614. {
  2615. case ' ':
  2616. case '\t':
  2617. case '\r':
  2618. columnAugment++;
  2619. break;
  2620. case '\n':
  2621. lineAugment++;
  2622. columnAugment = 0;
  2623. break;
  2624. case '%':
  2625. if (expandPE)
  2626. {
  2627. break loop;
  2628. }
  2629. // else fall through...
  2630. default:
  2631. readBufferPos = i;
  2632. if (lineAugment > 0)
  2633. {
  2634. line += lineAugment;
  2635. column = columnAugment;
  2636. }
  2637. else
  2638. {
  2639. column += columnAugment;
  2640. }
  2641. return;
  2642. }
  2643. }
  2644. }
  2645. // OK, do it the slow way.
  2646. char c = readCh ();
  2647. while (isWhitespace(c))
  2648. {
  2649. c = readCh();
  2650. }
  2651. unread(c);
  2652. }
  2653. /**
  2654. * Read a name or (when parsing an enumeration) name token.
  2655. * <pre>
  2656. * [5] Name ::= (Letter | '_' | ':') (NameChar)*
  2657. * [7] Nmtoken ::= (NameChar)+
  2658. * </pre>
  *
  * <p>When USE_CHEATS is set the token is first looked for entirely
  * inside the current read buffer; if no terminating character is found
  * there, the token is re-read character by character through readCh()
  * into the name buffer.
  *
  * @param isName true to additionally check the first character
  *   against the name-start rules.
  * @return the token, as returned by intern().
  2659. */
  2660. private String readNmtoken(boolean isName)
  2661. throws SAXException, IOException
  2662. {
  2663. char c;
  // Fast path: scan the read buffer in place, without copying.
  2664. if (USE_CHEATS)
  2665. {
  2666. loop:
  2667. for (int i = readBufferPos; i < readBufferLength; i++)
  2668. {
  2669. c = readBuffer[i];
  2670. switch (c)
  2671. {
  2672. case '%':
  2673. if (expandPE)
  2674. {
  // PE references are left to the slow path below
  2675. break loop;
  2676. }
  2677. // else fall through...
  2678. // What may legitimately come AFTER a name/nmtoken?
  2679. case '<': case '>': case '&':
  2680. case ',': case '|': case '*': case '+': case '?':
  2681. case ')':
  2682. case '=':
  2683. case '\'': case '"':
  2684. case '[':
  2685. case ' ': case '\t': case '\r': case '\n':
  2686. case ';':
  2687. case '/':
  2688. int start = readBufferPos;
  2689. if (i == start)
  2690. {
  2691. error("name expected", readBuffer[i], null);
  2692. }
  // consume the token; the terminator itself stays unread
  2693. readBufferPos = i;
  2694. return intern(readBuffer, start, i - start);
  2695. default:
  2696. // FIXME ... per IBM's OASIS test submission, these:
  2697. // ? U+06dd
  2698. // Combining U+309B
  2699. //these switches are kind of ugly but at least we won't
  2700. //have to go over the whole list for each char
  // Explicit rejections for the first character of a Name; the
  // outer switch selects on the high byte, the inner ones on
  // bits 4-7 (c2).
  2701. if (isName && i == readBufferPos)
  2702. {
  2703. char c2 = (char) (c & 0x00f0);
  2704. switch (c & 0xff00)
  2705. {
  2706. //starting with 01
  2707. case 0x0100:
  2708. switch (c2)
  2709. {
  2710. case 0x0030:
  2711. if (c == 0x0132 || c == 0x0133 || c == 0x013f)
  2712. {
  2713. error("Not a name start character, U+"
  2714. + Integer.toHexString(c));
  2715. }
  2716. break;
  2717. case 0x0040:
  2718. if (c == 0x0140 || c == 0x0149)
  2719. {
  2720. error("Not a name start character, U+"
  2721. + Integer.toHexString(c));
  2722. }
  2723. break;
  2724. case 0x00c0:
  2725. if (c == 0x01c4 || c == 0x01cc)
  2726. {
  2727. error("Not a name start character, U+"
  2728. + Integer.toHexString(c));
  2729. }
  2730. break;
  2731. case 0x00f0:
  2732. if (c == 0x01f1 || c == 0x01f3)
  2733. {
  2734. error("Not a name start character, U+"
  2735. + Integer.toHexString(c));
  2736. }
  2737. break;
  // NOTE(review): with c2 == 0x00b0 neither comparison below can
  // be true (both constants have 0xf in bits 4-7); this looks like
  // a copy of the 0x00f0 case -- confirm the intended code points.
  2738. case 0x00b0:
  2739. if (c == 0x01f1 || c == 0x01f3)
  2740. {
  2741. error("Not a name start character, U+"
  2742. + Integer.toHexString(c));
  2743. }
  2744. break;
  2745. default:
  2746. if (c == 0x017f)
  2747. {
  2748. error("Not a name start character, U+"
  2749. + Integer.toHexString(c));
  2750. }
  2751. }
  2752. break;
  2753. //starting with 11
  2754. case 0x1100:
  2755. switch (c2)
  2756. {
  2757. case 0x0000:
  2758. if (c == 0x1104 || c == 0x1108 ||
  2759. c == 0x110a || c == 0x110d)
  2760. {
  2761. error("Not a name start character, U+"
  2762. + Integer.toHexString(c));
  2763. }
  2764. break;
  2765. case 0x0030:
  2766. if (c == 0x113b || c == 0x113f)
  2767. {
  2768. error("Not a name start character, U+"
  2769. + Integer.toHexString(c));
  2770. }
  2771. break;
  2772. case 0x0040:
  2773. if (c == 0x1141 || c == 0x114d
  2774. || c == 0x114f )
  2775. {
  2776. error("Not a name start character, U+"
  2777. + Integer.toHexString(c));
  2778. }
  2779. break;
  2780. case 0x0050:
  2781. if (c == 0x1151 || c == 0x1156)
  2782. {
  2783. error("Not a name start character, U+"
  2784. + Integer.toHexString(c));
  2785. }
  2786. break;
  2787. case 0x0060:
  2788. if (c == 0x1162 || c == 0x1164
  2789. || c == 0x1166 || c == 0x116b
  2790. || c == 0x116f)
  2791. {
  2792. error("Not a name start character, U+"
  2793. + Integer.toHexString(c));
  2794. }
  2795. break;
  // NOTE(review): the comparison with 0x116f below can never be
  // true in this case (c2 is 0x00b0 here); it is already covered
  // by the 0x0060 case above.
  2796. case 0x00b0:
  2797. if (c == 0x11b6 || c == 0x11b9
  2798. || c == 0x11bb || c == 0x116f)
  2799. {
  2800. error("Not a name start character, U+"
  2801. + Integer.toHexString(c));
  2802. }
  2803. break;
  2804. default:
  2805. if (c == 0x1174 || c == 0x119f
  2806. || c == 0x11ac || c == 0x11c3
  2807. || c == 0x11f1)
  2808. {
  2809. error("Not a name start character, U+"
  2810. + Integer.toHexString(c));
  2811. }
  2812. }
  2813. break;
  2814. default:
  2815. if (c == 0x0e46 || c == 0x1011
  2816. || c == 0x212f || c == 0x0587
  2817. || c == 0x0230 )
  2818. {
  2819. error("Not a name start character, U+"
  2820. + Integer.toHexString(c));
  2821. }
  2822. }
  2823. }
  2824. // punt on exact tests from Appendix A; approximate
  2825. // them using the Unicode ID start/part rules
  2826. if (i == readBufferPos && isName)
  2827. {
  2828. if (!Character.isUnicodeIdentifierStart(c)
  2829. && c != ':' && c != '_')
  2830. {
  2831. error("Not a name start character, U+"
  2832. + Integer.toHexString(c));
  2833. }
  2834. }
  2835. else if (!Character.isUnicodeIdentifierPart(c)
  2836. && c != '-' && c != ':' && c != '_' && c != '.'
  2837. && !isExtender(c))
  2838. {
  2839. error("Not a name character, U+"
  2840. + Integer.toHexString(c));
  2841. }
  2842. }
  2843. }
  2844. }
  // Slow path: readBufferPos was not advanced above, so the token is
  // read again from its start, one character at a time.
  2845. nameBufferPos = 0;
  2846. // Read the first character.
  2847. while (true)
  2848. {
  2849. c = readCh();
  2850. switch (c)
  2851. {
  2852. case '%':
  2853. case '<': case '>': case '&':
  2854. case ',': case '|': case '*': case '+': case '?':
  2855. case ')':
  2856. case '=':
  2857. case '\'': case '"':
  2858. case '[':
  2859. case ' ': case '\t': case '\n': case '\r':
  2860. case ';':
  2861. case '/':
  // terminator: push it back and return what was collected
  2862. unread(c);
  2863. if (nameBufferPos == 0)
  2864. {
  2865. error ("name expected");
  2866. }
  2867. // punt on exact tests from Appendix A, but approximate them
  2868. if (isName
  2869. && !Character.isUnicodeIdentifierStart(nameBuffer[0])
  2870. && ":_".indexOf(nameBuffer[0]) == -1)
  2871. {
  2872. error("Not a name start character, U+"
  2873. + Integer.toHexString(nameBuffer[0]));
  2874. }
  2875. String s = intern(nameBuffer, 0, nameBufferPos);
  2876. nameBufferPos = 0;
  2877. return s;
  2878. default:
  2879. // punt on exact tests from Appendix A, but approximate them
  // (the first character of a Name is checked when the token ends)
  2880. if ((nameBufferPos != 0 || !isName)
  2881. && !Character.isUnicodeIdentifierPart(c)
  2882. && ":-_.".indexOf(c) == -1
  2883. && !isExtender(c))
  2884. {
  2885. error("Not a name character, U+"
  2886. + Integer.toHexString(c));
  2887. }
  // grow the name buffer before storing when it is full
  2888. if (nameBufferPos >= nameBuffer.length)
  2889. {
  2890. nameBuffer =
  2891. (char[]) extendArray(nameBuffer,
  2892. nameBuffer.length, nameBufferPos);
  2893. }
  2894. nameBuffer[nameBufferPos++] = c;
  2895. }
  2896. }
  2897. }
  2898. private static boolean isExtender(char c)
  2899. {
  2900. // [88] Extender ::= ...
  2901. return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
  2902. || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005
  2903. || (c >= 0x3031 && c <= 0x3035)
  2904. || (c >= 0x309d && c <= 0x309e)
  2905. || (c >= 0x30fc && c <= 0x30fe);
  2906. }
  2907. /**
  2908. * Read a literal. With matching single or double quotes as
  2909. * delimiters (and not embedded!) this is used to parse:
  2910. * <pre>
  2911. * [9] EntityValue ::= ... ([^%&amp;] | PEReference | Reference)* ...
  2912. * [10] AttValue ::= ... ([^&lt;&amp;] | Reference)* ...
  2913. * [11] SystemLiteral ::= ... (URLchar - "'")* ...
  2914. * [12] PubidLiteral ::= ... (PubidChar - "'")* ...
  2915. * </pre>
  2916. * as well as the quoted strings in XML and text declarations
  2917. * (for version, encoding, and standalone) which have their
  2918. * own constraints.
  *
  * @param flags bit mask of LIT_* constants; this method tests
  *   LIT_DISABLE_PE, LIT_ATTRIBUTE, LIT_PUBID, LIT_DISABLE_CREF,
  *   LIT_ENTITY_REF, LIT_DISABLE_EREF and LIT_NORMALIZE.
  * @return the literal's content taken from the data buffer, or null
  *   when the next character is not a quote and error() returns.
  2919. */
  2920. private String readLiteral(int flags)
  2921. throws SAXException, IOException
  2922. {
  2923. char delim, c;
  2924. int startLine = line;
  // parser state that is overridden while the literal is being read
  2925. boolean saved = expandPE;
  2926. boolean savedReport = doReport;
  2927. // Find the first delimiter.
  2928. delim = readCh();
  2929. if (delim != '"' && delim != '\'')
  2930. {
  2931. error("expected '\"' or \"'\"", delim, null);
  2932. return null;
  2933. }
  2934. inLiteral = true;
  2935. if ((flags & LIT_DISABLE_PE) != 0)
  2936. {
  2937. expandPE = false;
  2938. }
  2939. doReport = false;
  2940. // Each level of input source has its own buffer; remember
  2941. // ours, so we won't read the ending delimiter from any
  2942. // other input source, regardless of entity processing.
  2943. char[] ourBuf = readBuffer;
  2944. // Read the literal.
  2945. try
  2946. {
  2947. c = readCh();
  // NOTE(review): ampRead is assigned below but never read in this
  // method.
  2948. boolean ampRead = false;
  2949. loop:
  2950. while (! (c == delim && readBuffer == ourBuf))
  2951. {
  2952. switch (c)
  2953. {
  2954. // attributes and public ids are normalized
  2955. // in almost the same ways
  2956. case '\n':
  2957. case '\r':
  2958. if ((flags & (LIT_ATTRIBUTE | LIT_PUBID)) != 0)
  2959. {
  2960. c = ' ';
  2961. }
  2962. break;
  2963. case '\t':
  2964. if ((flags & LIT_ATTRIBUTE) != 0)
  2965. {
  2966. c = ' ';
  2967. }
  2968. break;
  2969. case '&':
  2970. c = readCh();
  2971. // Char refs are expanded immediately, except for
  2972. // all the cases where it's deferred.
  2973. if (c == '#')
  2974. {
  2975. if ((flags & LIT_DISABLE_CREF) != 0)
  2976. {
  // keep the reference as text: '&' is appended here and the
  // '#' by the append that follows the switch
  2977. dataBufferAppend('&');
  2978. break;
  2979. }
  2980. parseCharRef(false /* Do not do flushDataBuffer */);
  2981. // exotic WFness risk: this is an entity literal,
  2982. // dataBuffer [dataBufferPos - 1] == '&', and
  2983. // following chars are a _partial_ entity/char ref
  2984. // It looks like an entity ref ...
  2985. }
  2986. else
  2987. {
  2988. unread(c);
  2989. // Expand it?
  2990. if ((flags & LIT_ENTITY_REF) > 0)
  2991. {
  2992. parseEntityRef(false);
  2993. if (String.valueOf(readBuffer).equals("&#38;"))
  2994. {
  2995. ampRead = true;
  2996. }
  2997. //Is it just data?
  2998. }
  2999. else if ((flags & LIT_DISABLE_EREF) != 0)
  3000. {
  3001. dataBufferAppend('&');
  3002. // OK, it will be an entity ref -- expanded later.
  3003. }
  3004. else
  3005. {
  3006. String name = readNmtoken(true);
  3007. require(';');
  3008. dataBufferAppend('&');
  3009. dataBufferAppend(name);
  3010. dataBufferAppend(';');
  3011. }
  3012. }
  // the reference has been dealt with; go on with the next
  // character without appending c
  3013. c = readCh();
  3014. continue loop;
  3015. case '<':
  3016. // and why? Perhaps so "&foo;" expands the same
  3017. // inside and outside an attribute?
  3018. if ((flags & LIT_ATTRIBUTE) != 0)
  3019. {
  3020. error("attribute values may not contain '<'");
  3021. }
  3022. break;
  3023. // We don't worry about case '%' and PE refs, readCh does.
  3024. default:
  3025. break;
  3026. }
  3027. dataBufferAppend(c);
  3028. c = readCh();
  3029. }
  3030. }
  3031. catch (EOFException e)
  3032. {
  3033. error("end of input while looking for delimiter (started on line "
  3034. + startLine + ')', null, Character.toString(delim));
  3035. }
  // NOTE(review): this restore is not in a finally block, so it is
  // skipped when any exception leaves the try block or the handler above.
  3036. inLiteral = false;
  3037. expandPE = saved;
  3038. doReport = savedReport;
  3039. // Normalise whitespace if necessary.
  3040. if ((flags & LIT_NORMALIZE) > 0)
  3041. {
  3042. dataBufferNormalize();
  3043. }
  3044. // Return the value.
  3045. return dataBufferToString();
  3046. }
  3047. /**
  3048. * Try reading external identifiers.
  3049. * A system identifier is not required for notations.
  3050. * @param inNotation Are we parsing a notation decl?
  3051. * @param isSubset Parsing external subset decl (may be omitted)?
  3052. * @return A three-member String array containing the identifiers,
  3053. * or nulls. Order: public, system, baseURI.
  3054. */
  3055. private ExternalIdentifiers readExternalIds(boolean inNotation,
  3056. boolean isSubset)
  3057. throws Exception
  3058. {
  3059. char c;
  3060. ExternalIdentifiers ids = new ExternalIdentifiers();
  3061. int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
  3062. if (tryRead("PUBLIC"))
  3063. {
  3064. requireWhitespace();
  3065. ids.publicId = readLiteral(LIT_NORMALIZE | LIT_PUBID | flags);
  3066. if (inNotation)
  3067. {
  3068. skipWhitespace();
  3069. c = readCh();
  3070. unread(c);
  3071. if (c == '"' || c == '\'')
  3072. {
  3073. ids.systemId = readLiteral(flags);
  3074. }
  3075. }
  3076. else
  3077. {
  3078. requireWhitespace();
  3079. ids.systemId = readLiteral(flags);
  3080. }
  3081. for (int i = 0; i < ids.publicId.length(); i++)
  3082. {
  3083. c = ids.publicId.charAt(i);
  3084. if (c >= 'a' && c <= 'z')
  3085. {
  3086. continue;
  3087. }
  3088. if (c >= 'A' && c <= 'Z')
  3089. {
  3090. continue;
  3091. }
  3092. if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf(c) != -1)
  3093. {
  3094. continue;
  3095. }
  3096. error("illegal PUBLIC id character U+"
  3097. + Integer.toHexString(c));
  3098. }
  3099. }
  3100. else if (tryRead("SYSTEM"))
  3101. {
  3102. requireWhitespace();
  3103. ids.systemId = readLiteral(flags);
  3104. }
  3105. else if (!isSubset)
  3106. {
  3107. error("missing SYSTEM or PUBLIC keyword");
  3108. }
  3109. if (ids.systemId != null)
  3110. {
  3111. if (ids.systemId.indexOf('#') != -1)
  3112. {
  3113. handler.verror("SYSTEM id has a URI fragment: " + ids.systemId);
  3114. }
  3115. ids.baseUri = handler.getSystemId();
  3116. if (ids.baseUri == null && uriWarnings)
  3117. {
  3118. handler.warn("No base URI; hope URI is absolute: "
  3119. + ids.systemId);
  3120. }
  3121. }
  3122. return ids;
  3123. }
  3124. /**
  3125. * Test if a character is whitespace.
  3126. * <pre>
  3127. * [3] S ::= (#x20 | #x9 | #xd | #xa)+
  3128. * </pre>
  3129. * @param c The character to test.
  3130. * @return true if the character is whitespace.
  3131. */
  3132. private final boolean isWhitespace(char c)
  3133. {
  3134. if (c > 0x20)
  3135. {
  3136. return false;
  3137. }
  3138. if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d)
  3139. {
  3140. return true;
  3141. }
  3142. return false; // illegal ...
  3143. }
  3144. //////////////////////////////////////////////////////////////////////
  3145. // Utility routines.
  3146. //////////////////////////////////////////////////////////////////////
  3147. /**
  3148. * Add a character to the data buffer.
  3149. */
  3150. private void dataBufferAppend(char c)
  3151. {
  3152. // Expand buffer if necessary.
  3153. if (dataBufferPos >= dataBuffer.length)
  3154. {
  3155. dataBuffer = (char[]) extendArray(dataBuffer,
  3156. dataBuffer.length, dataBufferPos);
  3157. }
  3158. dataBuffer[dataBufferPos++] = c;
  3159. }
  3160. /**
  3161. * Add a string to the data buffer.
  3162. */
  3163. private void dataBufferAppend(String s)
  3164. {
  3165. dataBufferAppend(s.toCharArray(), 0, s.length());
  3166. }
  3167. /**
  3168. * Append (part of) a character array to the data buffer.
  3169. */
  3170. private void dataBufferAppend(char[] ch, int start, int length)
  3171. {
  3172. dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length,
  3173. dataBufferPos + length);
  3174. System.arraycopy(ch, start, dataBuffer, dataBufferPos, length);
  3175. dataBufferPos += length;
  3176. }
  3177. /**
  3178. * Normalise space characters in the data buffer.
  3179. */
  3180. private void dataBufferNormalize()
  3181. {
  3182. int i = 0;
  3183. int j = 0;
  3184. int end = dataBufferPos;
  3185. // Skip spaces at the start.
  3186. while (j < end && dataBuffer[j] == ' ')
  3187. {
  3188. j++;
  3189. }
  3190. // Skip whitespace at the end.
  3191. while (end > j && dataBuffer[end - 1] == ' ')
  3192. {
  3193. end --;
  3194. }
  3195. // Start copying to the left.
  3196. while (j < end)
  3197. {
  3198. char c = dataBuffer[j++];
  3199. // Normalise all other spaces to
  3200. // a single space.
  3201. if (c == ' ')
  3202. {
  3203. while (j < end && dataBuffer[j++] == ' ')
  3204. {
  3205. continue;
  3206. }
  3207. dataBuffer[i++] = ' ';
  3208. dataBuffer[i++] = dataBuffer[j - 1];
  3209. }
  3210. else
  3211. {
  3212. dataBuffer[i++] = c;
  3213. }
  3214. }
  3215. // The new length is <= the old one.
  3216. dataBufferPos = i;
  3217. }
  3218. /**
  3219. * Convert the data buffer to a string.
  3220. */
  3221. private String dataBufferToString()
  3222. {
  3223. String s = new String(dataBuffer, 0, dataBufferPos);
  3224. dataBufferPos = 0;
  3225. return s;
  3226. }
  3227. /**
  3228. * Flush the contents of the data buffer to the handler, as
  3229. * appropriate, and reset the buffer for new input.
  3230. */
  3231. private void dataBufferFlush()
  3232. throws SAXException
  3233. {
  3234. if (currentElementContent == CONTENT_ELEMENTS
  3235. && dataBufferPos > 0
  3236. && !inCDATA)
  3237. {
  3238. // We can't just trust the buffer to be whitespace, there
  3239. // are (error) cases when it isn't
  3240. for (int i = 0; i < dataBufferPos; i++)
  3241. {
  3242. if (!isWhitespace(dataBuffer[i]))
  3243. {
  3244. handler.charData(dataBuffer, 0, dataBufferPos);
  3245. dataBufferPos = 0;
  3246. }
  3247. }
  3248. if (dataBufferPos > 0)
  3249. {
  3250. handler.ignorableWhitespace(dataBuffer, 0, dataBufferPos);
  3251. dataBufferPos = 0;
  3252. }
  3253. }
  3254. else if (dataBufferPos > 0)
  3255. {
  3256. handler.charData(dataBuffer, 0, dataBufferPos);
  3257. dataBufferPos = 0;
  3258. }
  3259. }
  3260. /**
  3261. * Require a string to appear, or throw an exception.
  3262. * <p><em>Precondition:</em> Entity expansion is not required.
  3263. * <p><em>Precondition:</em> data buffer has no characters that
  3264. * will get sent to the application.
  3265. */
  3266. private void require(String delim)
  3267. throws SAXException, IOException
  3268. {
  3269. int length = delim.length();
  3270. char[] ch;
  3271. if (length < dataBuffer.length)
  3272. {
  3273. ch = dataBuffer;
  3274. delim.getChars(0, length, ch, 0);
  3275. }
  3276. else
  3277. {
  3278. ch = delim.toCharArray();
  3279. }
  3280. if (USE_CHEATS && length <= (readBufferLength - readBufferPos))
  3281. {
  3282. int offset = readBufferPos;
  3283. for (int i = 0; i < length; i++, offset++)
  3284. {
  3285. if (ch[i] != readBuffer[offset])
  3286. {
  3287. error ("required string", null, delim);
  3288. }
  3289. }
  3290. readBufferPos = offset;
  3291. }
  3292. else
  3293. {
  3294. for (int i = 0; i < length; i++)
  3295. {
  3296. require(ch[i]);
  3297. }
  3298. }
  3299. }
  3300. /**
  3301. * Require a character to appear, or throw an exception.
  3302. */
  3303. private void require(char delim)
  3304. throws SAXException, IOException
  3305. {
  3306. char c = readCh();
  3307. if (c != delim)
  3308. {
  3309. error("required character", c, Character.toString(delim));
  3310. }
  3311. }
  3312. /**
  3313. * Create an interned string from a character array.
  3314. * &AElig;lfred uses this method to create an interned version
  3315. * of all names and name tokens, so that it can test equality
  3316. * with <code>==</code> instead of <code>String.equals ()</code>.
  3317. *
  3318. * <p>This is much more efficient than constructing a non-interned
  3319. * string first, and then interning it.
  3320. *
  3321. * @param ch an array of characters for building the string.
  3322. * @param start the starting position in the array.
  3323. * @param length the number of characters to place in the string.
  3324. * @return an interned string.
  3325. * @see #intern (String)
  3326. * @see java.lang.String#intern
  3327. */
  3328. public String intern(char[] ch, int start, int length)
  3329. {
  3330. int index = 0;
  3331. int hash = 0;
  3332. Object[] bucket;
  3333. // Generate a hash code. This is a widely used string hash,
  3334. // often attributed to Brian Kernighan.
  3335. for (int i = start; i < start + length; i++)
  3336. {
  3337. hash = 31 * hash + ch[i];
  3338. }
  3339. hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH;
  3340. // Get the bucket -- consists of {array,String} pairs
  3341. if ((bucket = symbolTable[hash]) == null)
  3342. {
  3343. // first string in this bucket
  3344. bucket = new Object[8];
  3345. // Search for a matching tuple, and
  3346. // return the string if we find one.
  3347. }
  3348. else
  3349. {
  3350. while (index < bucket.length)
  3351. {
  3352. char[] chFound = (char[]) bucket[index];
  3353. // Stop when we hit an empty entry.
  3354. if (chFound == null)
  3355. {
  3356. break;
  3357. }
  3358. // If they're the same length, check for a match.
  3359. if (chFound.length == length)
  3360. {
  3361. for (int i = 0; i < chFound.length; i++)
  3362. {
  3363. // continue search on failure
  3364. if (ch[start + i] != chFound[i])
  3365. {
  3366. break;
  3367. }
  3368. else if (i == length - 1)
  3369. {
  3370. // That's it, we have a match!
  3371. return (String) bucket[index + 1];
  3372. }
  3373. }
  3374. }
  3375. index += 2;
  3376. }
  3377. // Not found -- we'll have to add it.
  3378. // Do we have to grow the bucket?
  3379. bucket = (Object[]) extendArray(bucket, bucket.length, index);
  3380. }
  3381. symbolTable[hash] = bucket;
  3382. // OK, add it to the end of the bucket -- "local" interning.
  3383. // Intern "globally" to let applications share interning benefits.
  3384. // That is, "!=" and "==" work on our strings, not just equals().
  3385. String s = new String(ch, start, length).intern();
  3386. bucket[index] = s.toCharArray();
  3387. bucket[index + 1] = s;
  3388. return s;
  3389. }
  3390. /**
  3391. * Ensure the capacity of an array, allocating a new one if
  3392. * necessary. Usually extends only for name hash collisions.
  3393. */
  3394. private Object extendArray(Object array, int currentSize, int requiredSize)
  3395. {
  3396. if (requiredSize < currentSize)
  3397. {
  3398. return array;
  3399. }
  3400. else
  3401. {
  3402. Object newArray = null;
  3403. int newSize = currentSize * 2;
  3404. if (newSize <= requiredSize)
  3405. {
  3406. newSize = requiredSize + 1;
  3407. }
  3408. if (array instanceof char[])
  3409. {
  3410. newArray = new char[newSize];
  3411. }
  3412. else if (array instanceof Object[])
  3413. {
  3414. newArray = new Object[newSize];
  3415. }
  3416. else
  3417. {
  3418. throw new RuntimeException();
  3419. }
  3420. System.arraycopy(array, 0, newArray, 0, currentSize);
  3421. return newArray;
  3422. }
  3423. }
  3424. //////////////////////////////////////////////////////////////////////
  3425. // XML query routines.
  3426. //////////////////////////////////////////////////////////////////////
  3427. boolean isStandalone()
  3428. {
  3429. return docIsStandalone;
  3430. }
  3431. //
  3432. // Elements
  3433. //
  3434. private int getContentType(ElementDecl element, int defaultType)
  3435. {
  3436. int retval;
  3437. if (element == null)
  3438. {
  3439. return defaultType;
  3440. }
  3441. retval = element.contentType;
  3442. if (retval == CONTENT_UNDECLARED)
  3443. {
  3444. retval = defaultType;
  3445. }
  3446. return retval;
  3447. }
  3448. /**
  3449. * Look up the content type of an element.
  3450. * @param name The element type name.
  3451. * @return An integer constant representing the content type.
  3452. * @see #CONTENT_UNDECLARED
  3453. * @see #CONTENT_ANY
  3454. * @see #CONTENT_EMPTY
  3455. * @see #CONTENT_MIXED
  3456. * @see #CONTENT_ELEMENTS
  3457. */
  3458. public int getElementContentType(String name)
  3459. {
  3460. ElementDecl element = (ElementDecl) elementInfo.get(name);
  3461. return getContentType(element, CONTENT_UNDECLARED);
  3462. }
  3463. /**
  3464. * Register an element.
  3465. * Array format:
  3466. * [0] element type name
  3467. * [1] content model (mixed, elements only)
  3468. * [2] attribute hash table
  3469. */
  3470. private void setElement(String name, int contentType,
  3471. String contentModel, HashMap attributes)
  3472. throws SAXException
  3473. {
  3474. if (skippedPE)
  3475. {
  3476. return;
  3477. }
  3478. ElementDecl element = (ElementDecl) elementInfo.get(name);
  3479. // first <!ELEMENT ...> or <!ATTLIST ...> for this type?
  3480. if (element == null)
  3481. {
  3482. element = new ElementDecl();
  3483. element.contentType = contentType;
  3484. element.contentModel = contentModel;
  3485. element.attributes = attributes;
  3486. elementInfo.put(name, element);
  3487. return;
  3488. }
  3489. // <!ELEMENT ...> declaration?
  3490. if (contentType != CONTENT_UNDECLARED)
  3491. {
  3492. // ... following an associated <!ATTLIST ...>
  3493. if (element.contentType == CONTENT_UNDECLARED)
  3494. {
  3495. element.contentType = contentType;
  3496. element.contentModel = contentModel;
  3497. }
  3498. else
  3499. {
  3500. // VC: Unique Element Type Declaration
  3501. handler.verror("multiple declarations for element type: "
  3502. + name);
  3503. }
  3504. }
  3505. // first <!ATTLIST ...>, before <!ELEMENT ...> ?
  3506. else if (attributes != null)
  3507. {
  3508. element.attributes = attributes;
  3509. }
  3510. }
  3511. /**
  3512. * Look up the attribute hash table for an element.
  3513. * The hash table is the second item in the element array.
  3514. */
  3515. private HashMap getElementAttributes(String name)
  3516. {
  3517. ElementDecl element = (ElementDecl) elementInfo.get(name);
  3518. return (element == null) ? null : element.attributes;
  3519. }
  3520. //
  3521. // Attributes
  3522. //
  3523. /**
  3524. * Get the declared attributes for an element type.
  3525. * @param elname The name of the element type.
  3526. * @return An iterator over all the attributes declared for
  3527. * a specific element type. The results will be valid only
  3528. * after the DTD (if any) has been parsed.
  3529. * @see #getAttributeType
  3530. * @see #getAttributeEnumeration
  3531. * @see #getAttributeDefaultValueType
  3532. * @see #getAttributeDefaultValue
  3533. * @see #getAttributeExpandedValue
  3534. */
  3535. private Iterator declaredAttributes(ElementDecl element)
  3536. {
  3537. HashMap attlist;
  3538. if (element == null)
  3539. {
  3540. return null;
  3541. }
  3542. if ((attlist = element.attributes) == null)
  3543. {
  3544. return null;
  3545. }
  3546. return attlist.keySet().iterator();
  3547. }
  3548. /**
  3549. * Get the declared attributes for an element type.
  3550. * @param elname The name of the element type.
  3551. * @return An iterator over all the attributes declared for
  3552. * a specific element type. The results will be valid only
  3553. * after the DTD (if any) has been parsed.
  3554. * @see #getAttributeType
  3555. * @see #getAttributeEnumeration
  3556. * @see #getAttributeDefaultValueType
  3557. * @see #getAttributeDefaultValue
  3558. * @see #getAttributeExpandedValue
  3559. */
  3560. public Iterator declaredAttributes(String elname)
  3561. {
  3562. return declaredAttributes((ElementDecl) elementInfo.get(elname));
  3563. }
  3564. /**
  3565. * Retrieve the declared type of an attribute.
  3566. * @param name The name of the associated element.
  3567. * @param aname The name of the attribute.
  3568. * @return An interend string denoting the type, or null
  3569. * indicating an undeclared attribute.
  3570. */
  3571. public String getAttributeType(String name, String aname)
  3572. {
  3573. AttributeDecl attribute = getAttribute(name, aname);
  3574. return (attribute == null) ? null : attribute.type;
  3575. }
  3576. /**
  3577. * Retrieve the allowed values for an enumerated attribute type.
  3578. * @param name The name of the associated element.
  3579. * @param aname The name of the attribute.
  3580. * @return A string containing the token list.
  3581. */
  3582. public String getAttributeEnumeration(String name, String aname)
  3583. {
  3584. AttributeDecl attribute = getAttribute(name, aname);
  3585. // assert: attribute.enumeration is "ENUMERATION" or "NOTATION"
  3586. return (attribute == null) ? null : attribute.enumeration;
  3587. }
  3588. /**
  3589. * Retrieve the default value of a declared attribute.
  3590. * @param name The name of the associated element.
  3591. * @param aname The name of the attribute.
  3592. * @return The default value, or null if the attribute was
  3593. * #IMPLIED or simply undeclared and unspecified.
  3594. * @see #getAttributeExpandedValue
  3595. */
  3596. public String getAttributeDefaultValue(String name, String aname)
  3597. {
  3598. AttributeDecl attribute = getAttribute(name, aname);
  3599. return (attribute == null) ? null : attribute.value;
  3600. }
  3601. /*
  3602. // FIXME: Leaving this in, until W3C finally resolves the confusion
  3603. // between parts of the XML 2nd REC about when entity declarations
  3604. // are guaranteed to be known. Current code matches what section 5.1
  3605. // (conformance) describes, but some readings of the self-contradicting
  3606. // text in 4.1 (the "Entity Declared" WFC and VC) seem to expect that
  3607. // attribute expansion/normalization must be deferred in some cases
  3608. // (just TRY to identify them!).
  3609. * Retrieve the expanded value of a declared attribute.
  3610. * <p>General entities (and char refs) will be expanded (once).
  3611. * @param name The name of the associated element.
  3612. * @param aname The name of the attribute.
  3613. * @return The expanded default value, or null if the attribute was
  3614. * #IMPLIED or simply undeclared
  3615. * @see #getAttributeDefaultValue
  3616. public String getAttributeExpandedValue (String name, String aname)
  3617. throws Exception
  3618. {
  3619. AttributeDecl attribute = getAttribute (name, aname);
  3620. if (attribute == null) {
  3621. return null;
  3622. } else if (attribute.defaultValue == null && attribute.value != null) {
  3623. // we MUST use the same buf for both quotes else the literal
  3624. // can't be properly terminated
  3625. char buf [] = new char [1];
  3626. int flags = LIT_ENTITY_REF | LIT_ATTRIBUTE;
  3627. String type = getAttributeType (name, aname);
  3628. if (type != "CDATA" && type != null)
  3629. flags |= LIT_NORMALIZE;
  3630. buf [0] = '"';
  3631. pushCharArray (null, buf, 0, 1);
  3632. pushString (null, attribute.value);
  3633. pushCharArray (null, buf, 0, 1);
  3634. attribute.defaultValue = readLiteral (flags);
  3635. }
  3636. return attribute.defaultValue;
  3637. }
  3638. */
  3639. /**
  3640. * Retrieve the default value mode of a declared attribute.
  3641. * @see #ATTRIBUTE_DEFAULT_SPECIFIED
  3642. * @see #ATTRIBUTE_DEFAULT_IMPLIED
  3643. * @see #ATTRIBUTE_DEFAULT_REQUIRED
  3644. * @see #ATTRIBUTE_DEFAULT_FIXED
  3645. */
  3646. public int getAttributeDefaultValueType(String name, String aname)
  3647. {
  3648. AttributeDecl attribute = getAttribute(name, aname);
  3649. return (attribute == null) ? ATTRIBUTE_DEFAULT_UNDECLARED :
  3650. attribute.valueType;
  3651. }
  3652. /**
  3653. * Register an attribute declaration for later retrieval.
  3654. * Format:
  3655. * - String type
  3656. * - String default value
  3657. * - int value type
  3658. * - enumeration
  3659. * - processed default value
  3660. */
  3661. private void setAttribute(String elName, String name, String type,
  3662. String enumeration, String value, int valueType)
  3663. throws Exception
  3664. {
  3665. HashMap attlist;
  3666. if (skippedPE)
  3667. {
  3668. return;
  3669. }
  3670. // Create a new hashtable if necessary.
  3671. attlist = getElementAttributes(elName);
  3672. if (attlist == null)
  3673. {
  3674. attlist = new HashMap();
  3675. }
  3676. // ignore multiple attribute declarations!
  3677. if (attlist.get(name) != null)
  3678. {
  3679. // warn ...
  3680. return;
  3681. }
  3682. else
  3683. {
  3684. AttributeDecl attribute = new AttributeDecl();
  3685. attribute.type = type;
  3686. attribute.value = value;
  3687. attribute.valueType = valueType;
  3688. attribute.enumeration = enumeration;
  3689. attlist.put(name, attribute);
  3690. // save; but don't overwrite any existing <!ELEMENT ...>
  3691. setElement(elName, CONTENT_UNDECLARED, null, attlist);
  3692. }
  3693. }
  3694. /**
  3695. * Retrieve the attribute declaration for the given element name and name.
  3696. */
  3697. private AttributeDecl getAttribute(String elName, String name)
  3698. {
  3699. HashMap attlist = getElementAttributes(elName);
  3700. return (attlist == null) ? null : (AttributeDecl) attlist.get(name);
  3701. }
  3702. //
  3703. // Entities
  3704. //
  3705. /**
  3706. * Find the type of an entity.
  3707. * @returns An integer constant representing the entity type.
  3708. * @see #ENTITY_UNDECLARED
  3709. * @see #ENTITY_INTERNAL
  3710. * @see #ENTITY_NDATA
  3711. * @see #ENTITY_TEXT
  3712. */
  3713. public int getEntityType(String ename)
  3714. {
  3715. EntityInfo entity = (EntityInfo) entityInfo.get(ename);
  3716. return (entity == null) ? ENTITY_UNDECLARED : entity.type;
  3717. }
  3718. /**
  3719. * Return an external entity's identifiers.
  3720. * @param ename The name of the external entity.
  3721. * @return The entity's public identifier, system identifier, and base URI.
  3722. * Null if the entity was not declared as an external entity.
  3723. * @see #getEntityType
  3724. */
  3725. public ExternalIdentifiers getEntityIds(String ename)
  3726. {
  3727. EntityInfo entity = (EntityInfo) entityInfo.get(ename);
  3728. return (entity == null) ? null : entity.ids;
  3729. }
  3730. /**
  3731. * Return an internal entity's replacement text.
  3732. * @param ename The name of the internal entity.
  3733. * @return The entity's replacement text, or null if
  3734. * the entity was not declared as an internal entity.
  3735. * @see #getEntityType
  3736. */
  3737. public String getEntityValue(String ename)
  3738. {
  3739. EntityInfo entity = (EntityInfo) entityInfo.get(ename);
  3740. return (entity == null) ? null : entity.value;
  3741. }
  3742. /**
  3743. * Register an entity declaration for later retrieval.
  3744. */
  3745. private void setInternalEntity(String eName, String value)
  3746. throws SAXException
  3747. {
  3748. if (skippedPE)
  3749. {
  3750. return;
  3751. }
  3752. if (entityInfo.get(eName) == null)
  3753. {
  3754. EntityInfo entity = new EntityInfo();
  3755. entity.type = ENTITY_INTERNAL;
  3756. entity.value = value;
  3757. entityInfo.put(eName, entity);
  3758. }
  3759. if (handler.stringInterning)
  3760. {
  3761. if ("lt" == eName || "gt" == eName || "quot" == eName
  3762. || "apos" == eName || "amp" == eName)
  3763. {
  3764. return;
  3765. }
  3766. }
  3767. else
  3768. {
  3769. if ("lt".equals(eName) || "gt".equals(eName) || "quot".equals(eName)
  3770. || "apos".equals(eName) || "amp".equals(eName))
  3771. {
  3772. return;
  3773. }
  3774. }
  3775. handler.getDeclHandler().internalEntityDecl(eName, value);
  3776. }
  3777. /**
  3778. * Register an external entity declaration for later retrieval.
  3779. */
  3780. private void setExternalEntity(String eName, int eClass,
  3781. ExternalIdentifiers ids, String nName)
  3782. {
  3783. if (entityInfo.get(eName) == null)
  3784. {
  3785. EntityInfo entity = new EntityInfo();
  3786. entity.type = eClass;
  3787. entity.ids = ids;
  3788. entity.notationName = nName;
  3789. entityInfo.put(eName, entity);
  3790. }
  3791. }
  3792. //
  3793. // Notations.
  3794. //
  3795. /**
  3796. * Report a notation declaration, checking for duplicates.
  3797. */
  3798. private void setNotation(String nname, ExternalIdentifiers ids)
  3799. throws SAXException
  3800. {
  3801. if (skippedPE)
  3802. {
  3803. return;
  3804. }
  3805. handler.notationDecl(nname, ids.publicId, ids.systemId, ids.baseUri);
  3806. if (notationInfo.get(nname) == null)
  3807. {
  3808. notationInfo.put(nname, nname);
  3809. }
  3810. else
  3811. {
  3812. // VC: Unique Notation Name
  3813. handler.verror("Duplicate notation name decl: " + nname);
  3814. }
  3815. }
  3816. //
  3817. // Location.
  3818. //
  3819. /**
  3820. * Return the current line number.
  3821. */
  3822. public int getLineNumber()
  3823. {
  3824. return line;
  3825. }
  3826. /**
  3827. * Return the current column number.
  3828. */
  3829. public int getColumnNumber()
  3830. {
  3831. return column;
  3832. }
  3833. //////////////////////////////////////////////////////////////////////
  3834. // High-level I/O.
  3835. //////////////////////////////////////////////////////////////////////
  3836. /**
  3837. * Read a single character from the readBuffer.
  3838. * <p>The readDataChunk () method maintains the buffer.
  3839. * <p>If we hit the end of an entity, try to pop the stack and
  3840. * keep going.
  3841. * <p> (This approach doesn't really enforce XML's rules about
  3842. * entity boundaries, but this is not currently a validating
  3843. * parser).
  3844. * <p>This routine also attempts to keep track of the current
  3845. * position in external entities, but it's not entirely accurate.
  3846. * @return The next available input character.
  3847. * @see #unread (char)
  3848. * @see #readDataChunk
  3849. * @see #readBuffer
  3850. * @see #line
  3851. * @return The next character from the current input source.
  3852. */
  3853. private char readCh()
  3854. throws SAXException, IOException
  3855. {
  3856. // As long as there's nothing in the
  3857. // read buffer, try reading more data
  3858. // (for an external entity) or popping
  3859. // the entity stack (for either).
  3860. while (readBufferPos >= readBufferLength)
  3861. {
  3862. switch (sourceType)
  3863. {
  3864. case INPUT_READER:
  3865. case INPUT_STREAM:
  3866. readDataChunk();
  3867. while (readBufferLength < 1)
  3868. {
  3869. popInput();
  3870. if (readBufferLength < 1)
  3871. {
  3872. readDataChunk();
  3873. }
  3874. }
  3875. break;
  3876. default:
  3877. popInput();
  3878. break;
  3879. }
  3880. }
  3881. char c = readBuffer[readBufferPos++];
  3882. if (c == '\n')
  3883. {
  3884. line++;
  3885. column = 0;
  3886. }
  3887. else
  3888. {
  3889. if (c == '<')
  3890. {
  3891. /* the most common return to parseContent () ... NOP */
  3892. }
  3893. else if (((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD)
  3894. || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085)
  3895. && xmlVersion == XML_11))
  3896. {
  3897. error("illegal XML character U+" + Integer.toHexString(c));
  3898. }
  3899. // If we're in the DTD and in a context where PEs get expanded,
  3900. // do so ... 1/14/2000 errata identify those contexts. There
  3901. // are also spots in the internal subset where PE refs are fatal
  3902. // errors, hence yet another flag.
  3903. else if (c == '%' && expandPE)
  3904. {
  3905. if (peIsError)
  3906. {
  3907. error("PE reference within decl in internal subset.");
  3908. }
  3909. parsePEReference();
  3910. return readCh();
  3911. }
  3912. column++;
  3913. }
  3914. return c;
  3915. }
  3916. /**
  3917. * Push a single character back onto the current input stream.
  3918. * <p>This method usually pushes the character back onto
  3919. * the readBuffer.
  3920. * <p>I don't think that this would ever be called with
  3921. * readBufferPos = 0, because the methods always reads a character
  3922. * before unreading it, but just in case, I've added a boundary
  3923. * condition.
  3924. * @param c The character to push back.
  3925. * @see #readCh
  3926. * @see #unread (char[])
  3927. * @see #readBuffer
  3928. */
  3929. private void unread(char c)
  3930. throws SAXException
  3931. {
  3932. // Normal condition.
  3933. if (c == '\n')
  3934. {
  3935. line--;
  3936. column = -1;
  3937. }
  3938. if (readBufferPos > 0)
  3939. {
  3940. readBuffer[--readBufferPos] = c;
  3941. }
  3942. else
  3943. {
  3944. pushString(null, Character.toString(c));
  3945. }
  3946. }
  3947. /**
  3948. * Push a char array back onto the current input stream.
  3949. * <p>NOTE: you must <em>never</em> push back characters that you
  3950. * haven't actually read: use pushString () instead.
  3951. * @see #readCh
  3952. * @see #unread (char)
  3953. * @see #readBuffer
  3954. * @see #pushString
  3955. */
  3956. private void unread(char[] ch, int length)
  3957. throws SAXException
  3958. {
  3959. for (int i = 0; i < length; i++)
  3960. {
  3961. if (ch[i] == '\n')
  3962. {
  3963. line--;
  3964. column = -1;
  3965. }
  3966. }
  3967. if (length < readBufferPos)
  3968. {
  3969. readBufferPos -= length;
  3970. }
  3971. else
  3972. {
  3973. pushCharArray(null, ch, 0, length);
  3974. }
  3975. }
  3976. /**
  3977. * Push, or skip, a new external input source.
  3978. * The source will be some kind of parsed entity, such as a PE
  3979. * (including the external DTD subset) or content for the body.
  3980. *
  3981. * @param url The java.net.URL object for the entity.
  3982. * @see SAXDriver#resolveEntity
  3983. * @see #pushString
  3984. * @see #sourceType
  3985. * @see #pushInput
  3986. * @see #detectEncoding
  3987. * @see #sourceType
  3988. * @see #readBuffer
  3989. */
  3990. private void pushURL(boolean isPE,
  3991. String ename,
  3992. ExternalIdentifiers ids,
  3993. Reader reader,
  3994. InputStream stream,
  3995. String encoding,
  3996. boolean doResolve)
  3997. throws SAXException, IOException
  3998. {
  3999. boolean ignoreEncoding;
  4000. String systemId;
  4001. InputSource source;
  4002. if (!isPE)
  4003. {
  4004. dataBufferFlush();
  4005. }
  4006. scratch.setPublicId(ids.publicId);
  4007. scratch.setSystemId(ids.systemId);
  4008. // See if we should skip or substitute the entity.
  4009. // If we're not skipping, resolving reports startEntity()
  4010. // and updates the (handler's) stack of URIs.
  4011. if (doResolve)
  4012. {
  4013. // assert (stream == null && reader == null && encoding == null)
  4014. source = handler.resolveEntity(isPE, ename, scratch, ids.baseUri);
  4015. if (source == null)
  4016. {
  4017. handler.warn("skipping entity: " + ename);
  4018. handler.skippedEntity(ename);
  4019. if (isPE)
  4020. {
  4021. skippedPE = true;
  4022. }
  4023. return;
  4024. }
  4025. // we might be using alternate IDs/encoding
  4026. systemId = source.getSystemId();
  4027. // The following warning and setting systemId was deleted bcause
  4028. // the application has the option of not setting systemId
  4029. // provided that it has set the characte/byte stream.
  4030. /*
  4031. if (systemId == null) {
  4032. handler.warn ("missing system ID, using " + ids.systemId);
  4033. systemId = ids.systemId;
  4034. }
  4035. */
  4036. }
  4037. else
  4038. {
  4039. // "[document]", or "[dtd]" via getExternalSubset()
  4040. scratch.setCharacterStream(reader);
  4041. scratch.setByteStream(stream);
  4042. scratch.setEncoding(encoding);
  4043. source = scratch;
  4044. systemId = ids.systemId;
  4045. if (handler.stringInterning)
  4046. {
  4047. handler.startExternalEntity(ename, systemId,
  4048. "[document]" == ename);
  4049. }
  4050. else
  4051. {
  4052. handler.startExternalEntity(ename, systemId,
  4053. "[document]".equals(ename));
  4054. }
  4055. }
  4056. // we may have been given I/O streams directly
  4057. if (source.getCharacterStream() != null)
  4058. {
  4059. if (source.getByteStream() != null)
  4060. error("InputSource has two streams!");
  4061. reader = source.getCharacterStream();
  4062. }
  4063. else if (source.getByteStream() != null)
  4064. {
  4065. encoding = source.getEncoding();
  4066. if (encoding == null)
  4067. {
  4068. stream = source.getByteStream();
  4069. }
  4070. else
  4071. {
  4072. try
  4073. {
  4074. reader = new InputStreamReader(source.getByteStream(),
  4075. encoding);
  4076. }
  4077. catch (IOException e)
  4078. {
  4079. stream = source.getByteStream();
  4080. }
  4081. }
  4082. }
  4083. else if (systemId == null)
  4084. {
  4085. error("InputSource has no URI!");
  4086. }
  4087. scratch.setCharacterStream(null);
  4088. scratch.setByteStream(null);
  4089. scratch.setEncoding(null);
  4090. // Push the existing status.
  4091. pushInput(ename);
  4092. // Create a new read buffer.
  4093. // (Note the four-character margin)
  4094. readBuffer = new char[READ_BUFFER_MAX + 4];
  4095. readBufferPos = 0;
  4096. readBufferLength = 0;
  4097. readBufferOverflow = -1;
  4098. is = null;
  4099. line = 1;
  4100. column = 0;
  4101. currentByteCount = 0;
  4102. // If there's an explicit character stream, just
  4103. // ignore encoding declarations.
  4104. if (reader != null)
  4105. {
  4106. sourceType = INPUT_READER;
  4107. this.reader = reader;
  4108. tryEncodingDecl(true);
  4109. return;
  4110. }
  4111. // Else we handle the conversion, and need to ensure
  4112. // it's done right.
  4113. sourceType = INPUT_STREAM;
  4114. if (stream != null)
  4115. {
  4116. is = stream;
  4117. }
  4118. else
  4119. {
  4120. // We have to open our own stream to the URL.
  4121. URL url = new URL(systemId);
  4122. externalEntity = url.openConnection();
  4123. externalEntity.connect();
  4124. is = externalEntity.getInputStream();
  4125. }
  4126. // If we get to here, there must be
  4127. // an InputStream available.
  4128. if (!is.markSupported())
  4129. {
  4130. is = new BufferedInputStream(is);
  4131. }
  4132. // Get any external encoding label.
  4133. if (encoding == null && externalEntity != null)
  4134. {
  4135. // External labels can be untrustworthy; filesystems in
  4136. // particular often have the wrong default for content
  4137. // that wasn't locally originated. Those we autodetect.
  4138. if (!"file".equals(externalEntity.getURL().getProtocol()))
  4139. {
  4140. int temp;
  4141. // application/xml;charset=something;otherAttr=...
  4142. // ... with many variants on 'something'
  4143. encoding = externalEntity.getContentType();
  4144. // MHK code (fix for Saxon 5.5.1/007):
  4145. // protect against encoding==null
  4146. if (encoding == null)
  4147. {
  4148. temp = -1;
  4149. }
  4150. else
  4151. {
  4152. temp = encoding.indexOf("charset");
  4153. }
  4154. // RFC 2376 sez MIME text defaults to ASCII, but since the
  4155. // JDK will create a MIME type out of thin air, we always
  4156. // autodetect when there's no explicit charset attribute.
  4157. if (temp < 0)
  4158. {
  4159. encoding = null; // autodetect
  4160. }
  4161. else
  4162. {
  4163. // only this one attribute
  4164. if ((temp = encoding.indexOf(';')) > 0)
  4165. {
  4166. encoding = encoding.substring(0, temp);
  4167. }
  4168. if ((temp = encoding.indexOf('=', temp + 7)) > 0)
  4169. {
  4170. encoding = encoding.substring(temp + 1);
  4171. // attributes can have comment fields (RFC 822)
  4172. if ((temp = encoding.indexOf('(')) > 0)
  4173. {
  4174. encoding = encoding.substring(0, temp);
  4175. }
  4176. // ... and values may be quoted
  4177. if ((temp = encoding.indexOf('"')) > 0)
  4178. {
  4179. encoding =
  4180. encoding.substring(temp + 1,
  4181. encoding.indexOf('"', temp + 2));
  4182. }
  4183. encoding = encoding.trim();
  4184. }
  4185. else
  4186. {
  4187. handler.warn("ignoring illegal MIME attribute: "
  4188. + encoding);
  4189. encoding = null;
  4190. }
  4191. }
  4192. }
  4193. }
  4194. // if we got an external encoding label, use it ...
  4195. if (encoding != null)
  4196. {
  4197. this.encoding = ENCODING_EXTERNAL;
  4198. setupDecoding(encoding);
  4199. ignoreEncoding = true;
  4200. // ... else autodetect from first bytes.
  4201. }
  4202. else
  4203. {
  4204. detectEncoding();
  4205. ignoreEncoding = false;
  4206. }
  4207. // Read any XML or text declaration.
  4208. // If we autodetected, it may tell us the "real" encoding.
  4209. try
  4210. {
  4211. tryEncodingDecl(ignoreEncoding);
  4212. }
  4213. catch (UnsupportedEncodingException x)
  4214. {
  4215. encoding = x.getMessage();
  4216. // if we don't handle the declared encoding,
  4217. // try letting a JVM InputStreamReader do it
  4218. try
  4219. {
  4220. if (sourceType != INPUT_STREAM)
  4221. {
  4222. throw x;
  4223. }
  4224. is.reset();
  4225. readBufferPos = 0;
  4226. readBufferLength = 0;
  4227. readBufferOverflow = -1;
  4228. line = 1;
  4229. currentByteCount = column = 0;
  4230. sourceType = INPUT_READER;
  4231. this.reader = new InputStreamReader(is, encoding);
  4232. is = null;
  4233. tryEncodingDecl(true);
  4234. }
  4235. catch (IOException e)
  4236. {
  4237. error("unsupported text encoding",
  4238. encoding,
  4239. null);
  4240. }
  4241. }
  4242. }
  4243. /**
  4244. * Check for an encoding declaration. This is the second part of the
  4245. * XML encoding autodetection algorithm, relying on detectEncoding to
  4246. * get to the point that this part can read any encoding declaration
  4247. * in the document (using only US-ASCII characters).
  4248. *
  4249. * <p> Because this part starts to fill parser buffers with this data,
  4250. * it's tricky to setup a reader so that Java's built-in decoders can be
  4251. * used for the character encodings that aren't built in to this parser
  4252. * (such as EUC-JP, KOI8-R, Big5, etc).
  4253. *
  4254. * @return any encoding in the declaration, uppercased; or null
  4255. * @see detectEncoding
  4256. */
  4257. private String tryEncodingDecl(boolean ignoreEncoding)
  4258. throws SAXException, IOException
  4259. {
  4260. // Read the XML/text declaration.
  4261. if (tryRead("<?xml"))
  4262. {
  4263. if (tryWhitespace())
  4264. {
  4265. if (inputStack.size() > 0)
  4266. {
  4267. return parseTextDecl(ignoreEncoding);
  4268. }
  4269. else
  4270. {
  4271. return parseXMLDecl(ignoreEncoding);
  4272. }
  4273. }
  4274. else
  4275. {
  4276. // <?xml-stylesheet ...?> or similar
  4277. unread('l');
  4278. unread('m');
  4279. unread('x');
  4280. unread('?');
  4281. unread('<');
  4282. }
  4283. }
  4284. return null;
  4285. }
  4286. /**
  4287. * Attempt to detect the encoding of an entity.
  4288. * <p>The trick here (as suggested in the XML standard) is that
  4289. * any entity not in UTF-8, or in UCS-2 with a byte-order mark,
  4290. * <b>must</b> begin with an XML declaration or an encoding
  4291. * declaration; we simply have to look for "&lt;?xml" in various
  4292. * encodings.
  4293. * <p>This method has no way to distinguish among 8-bit encodings.
  4294. * Instead, it sets up for UTF-8, then (possibly) revises its assumption
  4295. * later in setupDecoding (). Any ASCII-derived 8-bit encoding
  4296. * should work, but most will be rejected later by setupDecoding ().
  4297. * @see #tryEncoding (byte[], byte, byte, byte, byte)
  4298. * @see #tryEncoding (byte[], byte, byte)
  4299. * @see #setupDecoding
  4300. */
  4301. private void detectEncoding()
  4302. throws SAXException, IOException
  4303. {
  4304. byte[] signature = new byte[4];
  4305. // Read the first four bytes for
  4306. // autodetection.
  4307. is.mark(4);
  4308. is.read(signature);
  4309. is.reset();
  4310. //
  4311. // FIRST: four byte encodings (who uses these?)
  4312. //
  4313. if (tryEncoding(signature, (byte) 0x00, (byte) 0x00,
  4314. (byte) 0x00, (byte) 0x3c))
  4315. {
  4316. // UCS-4 must begin with "<?xml"
  4317. // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234)
  4318. // "UTF-32BE"
  4319. encoding = ENCODING_UCS_4_1234;
  4320. }
  4321. else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00,
  4322. (byte) 0x00, (byte) 0x00))
  4323. {
  4324. // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321)
  4325. // "UTF-32LE"
  4326. encoding = ENCODING_UCS_4_4321;
  4327. }
  4328. else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00,
  4329. (byte) 0x3c, (byte) 0x00))
  4330. {
  4331. // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143)
  4332. encoding = ENCODING_UCS_4_2143;
  4333. }
  4334. else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
  4335. (byte) 0x00, (byte) 0x00))
  4336. {
  4337. // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421)
  4338. encoding = ENCODING_UCS_4_3412;
  4339. // 00 00 fe ff UCS_4_1234 (with BOM)
  4340. // ff fe 00 00 UCS_4_4321 (with BOM)
  4341. }
  4342. //
  4343. // SECOND: two byte encodings
  4344. // note ... with 1/14/2000 errata the XML spec identifies some
  4345. // more "broken UTF-16" autodetection cases, with no XML decl,
  4346. // which we don't handle here (that's legal too).
  4347. //
  4348. else if (tryEncoding(signature, (byte) 0xfe, (byte) 0xff))
  4349. {
  4350. // UCS-2 with a byte-order marker. (UTF-16)
  4351. // 0xfe 0xff: UCS-2, big-endian (12)
  4352. encoding = ENCODING_UCS_2_12;
  4353. is.read(); is.read();
  4354. }
  4355. else if (tryEncoding(signature, (byte) 0xff, (byte) 0xfe))
  4356. {
  4357. // UCS-2 with a byte-order marker. (UTF-16)
  4358. // 0xff 0xfe: UCS-2, little-endian (21)
  4359. encoding = ENCODING_UCS_2_21;
  4360. is.read(); is.read();
  4361. }
  4362. else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
  4363. (byte) 0x00, (byte) 0x3f))
  4364. {
  4365. // UTF-16BE (otherwise, malformed UTF-16)
  4366. // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark
  4367. encoding = ENCODING_UCS_2_12;
  4368. error("no byte-order mark for UCS-2 entity");
  4369. }
  4370. else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00,
  4371. (byte) 0x3f, (byte) 0x00))
  4372. {
  4373. // UTF-16LE (otherwise, malformed UTF-16)
  4374. // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark
  4375. encoding = ENCODING_UCS_2_21;
  4376. error("no byte-order mark for UCS-2 entity");
  4377. }
  4378. //
  4379. // THIRD: ASCII-derived encodings, fixed and variable lengths
  4380. //
  4381. else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x3f,
  4382. (byte) 0x78, (byte) 0x6d))
  4383. {
  4384. // ASCII derived
  4385. // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING)
  4386. encoding = ENCODING_UTF_8;
  4387. prefetchASCIIEncodingDecl();
  4388. }
  4389. else if (signature[0] == (byte) 0xef
  4390. && signature[1] == (byte) 0xbb
  4391. && signature[2] == (byte) 0xbf)
  4392. {
  4393. // 0xef 0xbb 0xbf: UTF-8 BOM (not part of document text)
  4394. // this un-needed notion slipped into XML 2nd ed through a
  4395. // "non-normative" erratum; now required by MSFT and UDDI,
  4396. // and E22 made it normative.
  4397. encoding = ENCODING_UTF_8;
  4398. is.read(); is.read(); is.read();
  4399. }
  4400. else
  4401. {
  4402. // 4c 6f a7 94 ... we don't understand EBCDIC flavors
  4403. // ... but we COULD at least kick in some fixed code page
  4404. // (default) UTF-8 without encoding/XML declaration
  4405. encoding = ENCODING_UTF_8;
  4406. }
  4407. }
  4408. /**
  4409. * Check for a four-byte signature.
  4410. * <p>Utility routine for detectEncoding ().
  4411. * <p>Always looks for some part of "<?XML" in a specific encoding.
  4412. * @param sig The first four bytes read.
  4413. * @param b1 The first byte of the signature
  4414. * @param b2 The second byte of the signature
  4415. * @param b3 The third byte of the signature
  4416. * @param b4 The fourth byte of the signature
  4417. * @see #detectEncoding
  4418. */
  4419. private static boolean tryEncoding(byte[] sig, byte b1, byte b2,
  4420. byte b3, byte b4)
  4421. {
  4422. return (sig[0] == b1 && sig[1] == b2
  4423. && sig[2] == b3 && sig[3] == b4);
  4424. }
  4425. /**
  4426. * Check for a two-byte signature.
  4427. * <p>Looks for a UCS-2 byte-order mark.
  4428. * <p>Utility routine for detectEncoding ().
  4429. * @param sig The first four bytes read.
  4430. * @param b1 The first byte of the signature
  4431. * @param b2 The second byte of the signature
  4432. * @see #detectEncoding
  4433. */
  4434. private static boolean tryEncoding(byte[] sig, byte b1, byte b2)
  4435. {
  4436. return ((sig[0] == b1) && (sig[1] == b2));
  4437. }
  4438. /**
  4439. * This method pushes a string back onto input.
  4440. * <p>It is useful either as the expansion of an internal entity,
  4441. * or for backtracking during the parse.
  4442. * <p>Call pushCharArray () to do the actual work.
  4443. * @param s The string to push back onto input.
  4444. * @see #pushCharArray
  4445. */
  4446. private void pushString(String ename, String s)
  4447. throws SAXException
  4448. {
  4449. char[] ch = s.toCharArray();
  4450. pushCharArray(ename, ch, 0, ch.length);
  4451. }
  4452. /**
  4453. * Push a new internal input source.
  4454. * <p>This method is useful for expanding an internal entity,
  4455. * or for unreading a string of characters. It creates a new
  4456. * readBuffer containing the characters in the array, instead
  4457. * of characters converted from an input byte stream.
  4458. * @param ch The char array to push.
  4459. * @see #pushString
  4460. * @see #pushURL
  4461. * @see #readBuffer
  4462. * @see #sourceType
  4463. * @see #pushInput
  4464. */
  4465. private void pushCharArray(String ename, char[] ch, int start, int length)
  4466. throws SAXException
  4467. {
  4468. // Push the existing status
  4469. pushInput(ename);
  4470. if (ename != null && doReport)
  4471. {
  4472. dataBufferFlush();
  4473. handler.startInternalEntity(ename);
  4474. }
  4475. sourceType = INPUT_INTERNAL;
  4476. readBuffer = ch;
  4477. readBufferPos = start;
  4478. readBufferLength = length;
  4479. readBufferOverflow = -1;
  4480. }
  4481. /**
  4482. * Save the current input source onto the stack.
  4483. * <p>This method saves all of the global variables associated with
  4484. * the current input source, so that they can be restored when a new
  4485. * input source has finished. It also tests for entity recursion.
  4486. * <p>The method saves the following global variables onto a stack
  4487. * using a fixed-length array:
  4488. * <ol>
  4489. * <li>sourceType
  4490. * <li>externalEntity
  4491. * <li>readBuffer
  4492. * <li>readBufferPos
  4493. * <li>readBufferLength
  4494. * <li>line
  4495. * <li>encoding
  4496. * </ol>
  4497. * @param ename The name of the entity (if any) causing the new input.
  4498. * @see #popInput
  4499. * @see #sourceType
  4500. * @see #externalEntity
  4501. * @see #readBuffer
  4502. * @see #readBufferPos
  4503. * @see #readBufferLength
  4504. * @see #line
  4505. * @see #encoding
  4506. */
  4507. private void pushInput(String ename)
  4508. throws SAXException
  4509. {
  4510. // Check for entity recursion.
  4511. if (ename != null)
  4512. {
  4513. Iterator entities = entityStack.iterator();
  4514. while (entities.hasNext())
  4515. {
  4516. String e = (String) entities.next();
  4517. if (e != null && e == ename)
  4518. {
  4519. error("recursive reference to entity", ename, null);
  4520. }
  4521. }
  4522. }
  4523. entityStack.addLast(ename);
  4524. // Don't bother if there is no current input.
  4525. if (sourceType == INPUT_NONE)
  4526. {
  4527. return;
  4528. }
  4529. // Set up a snapshot of the current
  4530. // input source.
  4531. Input input = new Input();
  4532. input.sourceType = sourceType;
  4533. input.externalEntity = externalEntity;
  4534. input.readBuffer = readBuffer;
  4535. input.readBufferPos = readBufferPos;
  4536. input.readBufferLength = readBufferLength;
  4537. input.line = line;
  4538. input.encoding = encoding;
  4539. input.readBufferOverflow = readBufferOverflow;
  4540. input.is = is;
  4541. input.currentByteCount = currentByteCount;
  4542. input.column = column;
  4543. input.reader = reader;
  4544. // Push it onto the stack.
  4545. inputStack.addLast(input);
  4546. }
  4547. /**
  4548. * Restore a previous input source.
  4549. * <p>This method restores all of the global variables associated with
  4550. * the current input source.
  4551. * @exception java.io.EOFException
  4552. * If there are no more entries on the input stack.
  4553. * @see #pushInput
  4554. * @see #sourceType
  4555. * @see #externalEntity
  4556. * @see #readBuffer
  4557. * @see #readBufferPos
  4558. * @see #readBufferLength
  4559. * @see #line
  4560. * @see #encoding
  4561. */
  4562. private void popInput()
  4563. throws SAXException, IOException
  4564. {
  4565. String ename = (String) entityStack.removeLast();
  4566. if (ename != null && doReport)
  4567. {
  4568. dataBufferFlush();
  4569. }
  4570. switch (sourceType)
  4571. {
  4572. case INPUT_STREAM:
  4573. handler.endExternalEntity(ename);
  4574. is.close();
  4575. break;
  4576. case INPUT_READER:
  4577. handler.endExternalEntity(ename);
  4578. reader.close();
  4579. break;
  4580. case INPUT_INTERNAL:
  4581. if (ename != null && doReport)
  4582. {
  4583. handler.endInternalEntity(ename);
  4584. }
  4585. break;
  4586. }
  4587. // Throw an EOFException if there
  4588. // is nothing else to pop.
  4589. if (inputStack.isEmpty())
  4590. {
  4591. throw new EOFException("no more input");
  4592. }
  4593. Input input = (Input) inputStack.removeLast();
  4594. sourceType = input.sourceType;
  4595. externalEntity = input.externalEntity;
  4596. readBuffer = input.readBuffer;
  4597. readBufferPos = input.readBufferPos;
  4598. readBufferLength = input.readBufferLength;
  4599. line = input.line;
  4600. encoding = input.encoding;
  4601. readBufferOverflow = input.readBufferOverflow;
  4602. is = input.is;
  4603. currentByteCount = input.currentByteCount;
  4604. column = input.column;
  4605. reader = input.reader;
  4606. }
  4607. /**
  4608. * Return true if we can read the expected character.
  4609. * <p>Note that the character will be removed from the input stream
  4610. * on success, but will be put back on failure. Do not attempt to
  4611. * read the character again if the method succeeds.
  4612. * @param delim The character that should appear next. For a
  4613. * insensitive match, you must supply this in upper-case.
  4614. * @return true if the character was successfully read, or false if
  4615. * it was not.
  4616. * @see #tryRead (String)
  4617. */
  4618. private boolean tryRead(char delim)
  4619. throws SAXException, IOException
  4620. {
  4621. char c;
  4622. // Read the character
  4623. c = readCh();
  4624. // Test for a match, and push the character
  4625. // back if the match fails.
  4626. if (c == delim)
  4627. {
  4628. return true;
  4629. }
  4630. else
  4631. {
  4632. unread(c);
  4633. return false;
  4634. }
  4635. }
  4636. /**
  4637. * Return true if we can read the expected string.
  4638. * <p>This is simply a convenience method.
  4639. * <p>Note that the string will be removed from the input stream
  4640. * on success, but will be put back on failure. Do not attempt to
  4641. * read the string again if the method succeeds.
  4642. * <p>This method will push back a character rather than an
  4643. * array whenever possible (probably the majority of cases).
  4644. * @param delim The string that should appear next.
  4645. * @return true if the string was successfully read, or false if
  4646. * it was not.
  4647. * @see #tryRead (char)
  4648. */
  4649. private boolean tryRead(String delim)
  4650. throws SAXException, IOException
  4651. {
  4652. return tryRead(delim.toCharArray());
  4653. }
  4654. private boolean tryRead(char[] ch)
  4655. throws SAXException, IOException
  4656. {
  4657. char c;
  4658. // Compare the input, character-
  4659. // by character.
  4660. for (int i = 0; i < ch.length; i++)
  4661. {
  4662. c = readCh();
  4663. if (c != ch[i])
  4664. {
  4665. unread(c);
  4666. if (i != 0)
  4667. {
  4668. unread(ch, i);
  4669. }
  4670. return false;
  4671. }
  4672. }
  4673. return true;
  4674. }
  4675. /**
  4676. * Return true if we can read some whitespace.
  4677. * <p>This is simply a convenience method.
  4678. * <p>This method will push back a character rather than an
  4679. * array whenever possible (probably the majority of cases).
  4680. * @return true if whitespace was found.
  4681. */
  4682. private boolean tryWhitespace()
  4683. throws SAXException, IOException
  4684. {
  4685. char c;
  4686. c = readCh();
  4687. if (isWhitespace(c))
  4688. {
  4689. skipWhitespace();
  4690. return true;
  4691. }
  4692. else
  4693. {
  4694. unread(c);
  4695. return false;
  4696. }
  4697. }
  4698. /**
  4699. * Read all data until we find the specified string.
  4700. * This is useful for scanning CDATA sections and PIs.
  4701. * <p>This is inefficient right now, since it calls tryRead ()
  4702. * for every character.
  4703. * @param delim The string delimiter
  4704. * @see #tryRead (String, boolean)
  4705. * @see #readCh
  4706. */
  4707. private void parseUntil(String delim)
  4708. throws SAXException, IOException
  4709. {
  4710. parseUntil(delim.toCharArray());
  4711. }
  4712. private void parseUntil(char[] delim)
  4713. throws SAXException, IOException
  4714. {
  4715. char c;
  4716. int startLine = line;
  4717. try
  4718. {
  4719. while (!tryRead(delim))
  4720. {
  4721. c = readCh();
  4722. dataBufferAppend(c);
  4723. }
  4724. }
  4725. catch (EOFException e)
  4726. {
  4727. error("end of input while looking for delimiter "
  4728. + "(started on line " + startLine
  4729. + ')', null, new String(delim));
  4730. }
  4731. }
  4732. //////////////////////////////////////////////////////////////////////
  4733. // Low-level I/O.
  4734. //////////////////////////////////////////////////////////////////////
  4735. /**
  4736. * Prefetch US-ASCII XML/text decl from input stream into read buffer.
  4737. * Doesn't buffer more than absolutely needed, so that when an encoding
  4738. * decl says we need to create an InputStreamReader, we can discard our
  4739. * buffer and reset(). Caller knows the first chars of the decl exist
  4740. * in the input stream.
  4741. */
  4742. private void prefetchASCIIEncodingDecl()
  4743. throws SAXException, IOException
  4744. {
  4745. int ch;
  4746. readBufferPos = readBufferLength = 0;
  4747. is.mark(readBuffer.length);
  4748. while (true)
  4749. {
  4750. ch = is.read();
  4751. readBuffer[readBufferLength++] = (char) ch;
  4752. switch (ch)
  4753. {
  4754. case (int) '>':
  4755. return;
  4756. case -1:
  4757. error("file ends before end of XML or encoding declaration.",
  4758. null, "?>");
  4759. }
  4760. if (readBuffer.length == readBufferLength)
  4761. {
  4762. error("unfinished XML or encoding declaration");
  4763. }
  4764. }
  4765. }
  4766. /**
  4767. * Read a chunk of data from an external input source.
  4768. * <p>This is simply a front-end that fills the rawReadBuffer
  4769. * with bytes, then calls the appropriate encoding handler.
  4770. * @see #encoding
  4771. * @see #rawReadBuffer
  4772. * @see #readBuffer
  4773. * @see #filterCR
  4774. * @see #copyUtf8ReadBuffer
  4775. * @see #copyIso8859_1ReadBuffer
  4776. * @see #copyUcs_2ReadBuffer
  4777. * @see #copyUcs_4ReadBuffer
  4778. */
  4779. private void readDataChunk()
  4780. throws SAXException, IOException
  4781. {
  4782. int count;
  4783. // See if we have any overflow (filterCR sets for CR at end)
  4784. if (readBufferOverflow > -1)
  4785. {
  4786. readBuffer[0] = (char) readBufferOverflow;
  4787. readBufferOverflow = -1;
  4788. readBufferPos = 1;
  4789. sawCR = true;
  4790. }
  4791. else
  4792. {
  4793. readBufferPos = 0;
  4794. sawCR = false;
  4795. }
  4796. // input from a character stream.
  4797. if (sourceType == INPUT_READER)
  4798. {
  4799. count = reader.read(readBuffer,
  4800. readBufferPos, READ_BUFFER_MAX - readBufferPos);
  4801. if (count < 0)
  4802. {
  4803. readBufferLength = readBufferPos;
  4804. }
  4805. else
  4806. {
  4807. readBufferLength = readBufferPos + count;
  4808. }
  4809. if (readBufferLength > 0)
  4810. {
  4811. filterCR(count >= 0);
  4812. }
  4813. sawCR = false;
  4814. return;
  4815. }
  4816. // Read as many bytes as possible into the raw buffer.
  4817. count = is.read(rawReadBuffer, 0, READ_BUFFER_MAX);
  4818. // Dispatch to an encoding-specific reader method to populate
  4819. // the readBuffer. In most parser speed profiles, these routines
  4820. // show up at the top of the CPU usage chart.
  4821. if (count > 0)
  4822. {
  4823. switch (encoding)
  4824. {
  4825. // one byte builtins
  4826. case ENCODING_ASCII:
  4827. copyIso8859_1ReadBuffer(count, (char) 0x0080);
  4828. break;
  4829. case ENCODING_UTF_8:
  4830. copyUtf8ReadBuffer(count);
  4831. break;
  4832. case ENCODING_ISO_8859_1:
  4833. copyIso8859_1ReadBuffer(count, (char) 0);
  4834. break;
  4835. // two byte builtins
  4836. case ENCODING_UCS_2_12:
  4837. copyUcs2ReadBuffer(count, 8, 0);
  4838. break;
  4839. case ENCODING_UCS_2_21:
  4840. copyUcs2ReadBuffer(count, 0, 8);
  4841. break;
  4842. // four byte builtins
  4843. case ENCODING_UCS_4_1234:
  4844. copyUcs4ReadBuffer(count, 24, 16, 8, 0);
  4845. break;
  4846. case ENCODING_UCS_4_4321:
  4847. copyUcs4ReadBuffer(count, 0, 8, 16, 24);
  4848. break;
  4849. case ENCODING_UCS_4_2143:
  4850. copyUcs4ReadBuffer(count, 16, 24, 0, 8);
  4851. break;
  4852. case ENCODING_UCS_4_3412:
  4853. copyUcs4ReadBuffer(count, 8, 0, 24, 16);
  4854. break;
  4855. }
  4856. }
  4857. else
  4858. {
  4859. readBufferLength = readBufferPos;
  4860. }
  4861. readBufferPos = 0;
  4862. // Filter out all carriage returns if we've seen any
  4863. // (including any saved from a previous read)
  4864. if (sawCR)
  4865. {
  4866. filterCR(count >= 0);
  4867. sawCR = false;
  4868. // must actively report EOF, lest some CRs get lost.
  4869. if (readBufferLength == 0 && count >= 0)
  4870. {
  4871. readDataChunk();
  4872. }
  4873. }
  4874. if (count > 0)
  4875. {
  4876. currentByteCount += count;
  4877. }
  4878. }
  4879. /**
  4880. * Filter carriage returns in the read buffer.
  4881. * CRLF becomes LF; CR becomes LF.
  4882. * @param moreData true iff more data might come from the same source
  4883. * @see #readDataChunk
  4884. * @see #readBuffer
  4885. * @see #readBufferOverflow
  4886. */
  4887. private void filterCR(boolean moreData)
  4888. {
  4889. int i, j;
  4890. readBufferOverflow = -1;
  4891. loop:
  4892. for (i = j = readBufferPos; j < readBufferLength; i++, j++)
  4893. {
  4894. switch (readBuffer[j])
  4895. {
  4896. case '\r':
  4897. if (j == readBufferLength - 1)
  4898. {
  4899. if (moreData)
  4900. {
  4901. readBufferOverflow = '\r';
  4902. readBufferLength--;
  4903. }
  4904. else // CR at end of buffer
  4905. {
  4906. readBuffer[i++] = '\n';
  4907. }
  4908. break loop;
  4909. }
  4910. else if (readBuffer[j + 1] == '\n')
  4911. {
  4912. j++;
  4913. }
  4914. readBuffer[i] = '\n';
  4915. break;
  4916. case '\n':
  4917. default:
  4918. readBuffer[i] = readBuffer[j];
  4919. break;
  4920. }
  4921. }
  4922. readBufferLength = i;
  4923. }
  4924. /**
  4925. * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters.
  4926. * <p>When readDataChunk () calls this method, the raw bytes are in
  4927. * rawReadBuffer, and the final characters will appear in
  4928. * readBuffer.
  4929. * <p>Note that as of Unicode 3.1, good practice became a requirement,
  4930. * so that each Unicode character has exactly one UTF-8 representation.
  4931. * @param count The number of bytes to convert.
  4932. * @see #readDataChunk
  4933. * @see #rawReadBuffer
  4934. * @see #readBuffer
  4935. * @see #getNextUtf8Byte
  4936. */
  4937. private void copyUtf8ReadBuffer(int count)
  4938. throws SAXException, IOException
  4939. {
  4940. int i = 0;
  4941. int j = readBufferPos;
  4942. int b1;
  4943. char c = 0;
  4944. /*
  4945. // check once, so the runtime won't (if it's smart enough)
  4946. if (count < 0 || count > rawReadBuffer.length)
  4947. throw new ArrayIndexOutOfBoundsException (Integer.toString (count));
  4948. */
  4949. while (i < count)
  4950. {
  4951. b1 = rawReadBuffer[i++];
  4952. // Determine whether we are dealing
  4953. // with a one-, two-, three-, or four-
  4954. // byte sequence.
  4955. if (b1 < 0)
  4956. {
  4957. if ((b1 & 0xe0) == 0xc0)
  4958. {
  4959. // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
  4960. c = (char) (((b1 & 0x1f) << 6)
  4961. | getNextUtf8Byte(i++, count));
  4962. if (c < 0x0080)
  4963. {
  4964. encodingError("Illegal two byte UTF-8 sequence",
  4965. c, 0);
  4966. }
  4967. //Sec 2.11
  4968. // [1] the two-character sequence #xD #xA
  4969. // [2] the two-character sequence #xD #x85
  4970. if ((c == 0x0085 || c == 0x000a) && sawCR)
  4971. {
  4972. continue;
  4973. }
  4974. // Sec 2.11
  4975. // [3] the single character #x85
  4976. if (c == 0x0085 && xmlVersion == XML_11)
  4977. {
  4978. readBuffer[j++] = '\r';
  4979. }
  4980. }
  4981. else if ((b1 & 0xf0) == 0xe0)
  4982. {
  4983. // 3-byte sequence:
  4984. // zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
  4985. // most CJKV characters
  4986. c = (char) (((b1 & 0x0f) << 12) |
  4987. (getNextUtf8Byte(i++, count) << 6) |
  4988. getNextUtf8Byte(i++, count));
  4989. //sec 2.11
  4990. //[4] the single character #x2028
  4991. if (c == 0x2028 && xmlVersion == XML_11)
  4992. {
  4993. readBuffer[j++] = '\r';
  4994. sawCR = true;
  4995. continue;
  4996. }
  4997. if (c < 0x0800 || (c >= 0xd800 && c <= 0xdfff))
  4998. {
  4999. encodingError("Illegal three byte UTF-8 sequence",
  5000. c, 0);
  5001. }
  5002. }
  5003. else if ((b1 & 0xf8) == 0xf0)
  5004. {
  5005. // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx
  5006. // = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
  5007. // (uuuuu = wwww + 1)
  5008. // "Surrogate Pairs" ... from the "Astral Planes"
  5009. // Unicode 3.1 assigned the first characters there
  5010. int iso646 = b1 & 07;
  5011. iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count);
  5012. iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count);
  5013. iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count);
  5014. if (iso646 <= 0xffff)
  5015. {
  5016. encodingError("Illegal four byte UTF-8 sequence",
  5017. iso646, 0);
  5018. }
  5019. else
  5020. {
  5021. if (iso646 > 0x0010ffff)
  5022. {
  5023. encodingError("UTF-8 value out of range for Unicode",
  5024. iso646, 0);
  5025. }
  5026. iso646 -= 0x010000;
  5027. readBuffer[j++] = (char) (0xd800 | (iso646 >> 10));
  5028. readBuffer[j++] = (char) (0xdc00 | (iso646 & 0x03ff));
  5029. continue;
  5030. }
  5031. }
  5032. else
  5033. {
  5034. // The five and six byte encodings aren't supported;
  5035. // they exceed the Unicode (and XML) range.
  5036. encodingError("unsupported five or six byte UTF-8 sequence",
  5037. 0xff & b1, i);
  5038. // NOTREACHED
  5039. c = 0;
  5040. }
  5041. }
  5042. else
  5043. {
  5044. // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx
  5045. // (US-ASCII character, "common" case, one branch to here)
  5046. c = (char) b1;
  5047. }
  5048. readBuffer[j++] = c;
  5049. if (c == '\r')
  5050. {
  5051. sawCR = true;
  5052. }
  5053. }
  5054. // How many characters have we read?
  5055. readBufferLength = j;
  5056. }
  5057. /**
  5058. * Return the next byte value in a UTF-8 sequence.
  5059. * If it is not possible to get a byte from the current
  5060. * entity, throw an exception.
  5061. * @param pos The current position in the rawReadBuffer.
  5062. * @param count The number of bytes in the rawReadBuffer
  5063. * @return The significant six bits of a non-initial byte in
  5064. * a UTF-8 sequence.
  5065. * @exception EOFException If the sequence is incomplete.
  5066. */
  5067. private int getNextUtf8Byte(int pos, int count)
  5068. throws SAXException, IOException
  5069. {
  5070. int val;
  5071. // Take a character from the buffer
  5072. // or from the actual input stream.
  5073. if (pos < count)
  5074. {
  5075. val = rawReadBuffer[pos];
  5076. }
  5077. else
  5078. {
  5079. val = is.read();
  5080. if (val == -1)
  5081. {
  5082. encodingError("unfinished multi-byte UTF-8 sequence at EOF",
  5083. -1, pos);
  5084. }
  5085. }
  5086. // Check for the correct bits at the start.
  5087. if ((val & 0xc0) != 0x80)
  5088. {
  5089. encodingError("bad continuation of multi-byte UTF-8 sequence",
  5090. val, pos + 1);
  5091. }
  5092. // Return the significant bits.
  5093. return (val & 0x3f);
  5094. }
  5095. /**
  5096. * Convert a buffer of US-ASCII or ISO-8859-1-encoded bytes into
  5097. * UTF-16 characters.
  5098. *
  5099. * <p>When readDataChunk () calls this method, the raw bytes are in
  5100. * rawReadBuffer, and the final characters will appear in
  5101. * readBuffer.
  5102. *
  5103. * @param count The number of bytes to convert.
  5104. * @param mask For ASCII conversion, 0x7f; else, 0xff.
  5105. * @see #readDataChunk
  5106. * @see #rawReadBuffer
  5107. * @see #readBuffer
  5108. */
  5109. private void copyIso8859_1ReadBuffer(int count, char mask)
  5110. throws IOException
  5111. {
  5112. int i, j;
  5113. for (i = 0, j = readBufferPos; i < count; i++, j++)
  5114. {
  5115. char c = (char) (rawReadBuffer[i] & 0xff);
  5116. if ((c & mask) != 0)
  5117. {
  5118. throw new CharConversionException("non-ASCII character U+"
  5119. + Integer.toHexString(c));
  5120. }
  5121. if (c == 0x0085 && xmlVersion == XML_11)
  5122. {
  5123. c = '\r';
  5124. }
  5125. readBuffer[j] = c;
  5126. if (c == '\r')
  5127. {
  5128. sawCR = true;
  5129. }
  5130. }
  5131. readBufferLength = j;
  5132. }
  5133. /**
  5134. * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters
  5135. * (as used in Java string manipulation).
  5136. *
  5137. * <p>When readDataChunk () calls this method, the raw bytes are in
  5138. * rawReadBuffer, and the final characters will appear in
  5139. * readBuffer.
  5140. * @param count The number of bytes to convert.
  5141. * @param shift1 The number of bits to shift byte 1.
  5142. * @param shift2 The number of bits to shift byte 2
  5143. * @see #readDataChunk
  5144. * @see #rawReadBuffer
  5145. * @see #readBuffer
  5146. */
  5147. private void copyUcs2ReadBuffer(int count, int shift1, int shift2)
  5148. throws SAXException
  5149. {
  5150. int j = readBufferPos;
  5151. if (count > 0 && (count % 2) != 0)
  5152. {
  5153. encodingError("odd number of bytes in UCS-2 encoding", -1, count);
  5154. }
  5155. // The loops are faster with less internal brancing; hence two
  5156. if (shift1 == 0)
  5157. { // "UTF-16-LE"
  5158. for (int i = 0; i < count; i += 2)
  5159. {
  5160. char c = (char) (rawReadBuffer[i + 1] << 8);
  5161. c |= 0xff & rawReadBuffer[i];
  5162. readBuffer[j++] = c;
  5163. if (c == '\r')
  5164. {
  5165. sawCR = true;
  5166. }
  5167. }
  5168. }
  5169. else
  5170. { // "UTF-16-BE"
  5171. for (int i = 0; i < count; i += 2)
  5172. {
  5173. char c = (char) (rawReadBuffer[i] << 8);
  5174. c |= 0xff & rawReadBuffer[i + 1];
  5175. readBuffer[j++] = c;
  5176. if (c == '\r')
  5177. {
  5178. sawCR = true;
  5179. }
  5180. }
  5181. }
  5182. readBufferLength = j;
  5183. }
  5184. /**
  5185. * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters.
  5186. *
  5187. * <p>When readDataChunk () calls this method, the raw bytes are in
  5188. * rawReadBuffer, and the final characters will appear in
  5189. * readBuffer.
  5190. * <p>Java has Unicode chars, and this routine uses surrogate pairs
  5191. * for ISO-10646 values between 0x00010000 and 0x000fffff. An
  5192. * exception is thrown if the ISO-10646 character has no Unicode
  5193. * representation.
  5194. *
  5195. * @param count The number of bytes to convert.
  5196. * @param shift1 The number of bits to shift byte 1.
  5197. * @param shift2 The number of bits to shift byte 2
  5198. * @param shift3 The number of bits to shift byte 2
  5199. * @param shift4 The number of bits to shift byte 2
  5200. * @see #readDataChunk
  5201. * @see #rawReadBuffer
  5202. * @see #readBuffer
  5203. */
  5204. private void copyUcs4ReadBuffer(int count, int shift1, int shift2,
  5205. int shift3, int shift4)
  5206. throws SAXException
  5207. {
  5208. int j = readBufferPos;
  5209. if (count > 0 && (count % 4) != 0)
  5210. {
  5211. encodingError("number of bytes in UCS-4 encoding " +
  5212. "not divisible by 4",
  5213. -1, count);
  5214. }
  5215. for (int i = 0; i < count; i += 4)
  5216. {
  5217. int value = (((rawReadBuffer [i] & 0xff) << shift1) |
  5218. ((rawReadBuffer [i + 1] & 0xff) << shift2) |
  5219. ((rawReadBuffer [i + 2] & 0xff) << shift3) |
  5220. ((rawReadBuffer [i + 3] & 0xff) << shift4));
  5221. if (value < 0x0000ffff)
  5222. {
  5223. readBuffer [j++] = (char) value;
  5224. if (value == (int) '\r')
  5225. {
  5226. sawCR = true;
  5227. }
  5228. }
  5229. else if (value < 0x0010ffff)
  5230. {
  5231. value -= 0x010000;
  5232. readBuffer[j++] = (char) (0xd8 | ((value >> 10) & 0x03ff));
  5233. readBuffer[j++] = (char) (0xdc | (value & 0x03ff));
  5234. }
  5235. else
  5236. {
  5237. encodingError("UCS-4 value out of range for Unicode",
  5238. value, i);
  5239. }
  5240. }
  5241. readBufferLength = j;
  5242. }
  5243. /**
  5244. * Report a character encoding error.
  5245. */
  5246. private void encodingError(String message, int value, int offset)
  5247. throws SAXException
  5248. {
  5249. if (value != -1)
  5250. {
  5251. message = message + " (character code: 0x" +
  5252. Integer.toHexString(value) + ')';
  5253. error(message);
  5254. }
  5255. }
  5256. //////////////////////////////////////////////////////////////////////
  5257. // Local Variables.
  5258. //////////////////////////////////////////////////////////////////////
  5259. /**
  5260. * Re-initialize the variables for each parse.
  5261. */
  5262. private void initializeVariables()
  5263. {
  5264. // First line
  5265. line = 1;
  5266. column = 0;
  5267. // Set up the buffers for data and names
  5268. dataBufferPos = 0;
  5269. dataBuffer = new char[DATA_BUFFER_INITIAL];
  5270. nameBufferPos = 0;
  5271. nameBuffer = new char[NAME_BUFFER_INITIAL];
  5272. // Set up the DTD hash tables
  5273. elementInfo = new HashMap();
  5274. entityInfo = new HashMap();
  5275. notationInfo = new HashMap();
  5276. skippedPE = false;
  5277. // Set up the variables for the current
  5278. // element context.
  5279. currentElement = null;
  5280. currentElementContent = CONTENT_UNDECLARED;
  5281. // Set up the input variables
  5282. sourceType = INPUT_NONE;
  5283. inputStack = new LinkedList();
  5284. entityStack = new LinkedList();
  5285. externalEntity = null;
  5286. tagAttributePos = 0;
  5287. tagAttributes = new String[100];
  5288. rawReadBuffer = new byte[READ_BUFFER_MAX];
  5289. readBufferOverflow = -1;
  5290. scratch = new InputSource();
  5291. inLiteral = false;
  5292. expandPE = false;
  5293. peIsError = false;
  5294. doReport = false;
  5295. inCDATA = false;
  5296. symbolTable = new Object[SYMBOL_TABLE_LENGTH][];
  5297. }
  5298. static class ExternalIdentifiers
  5299. {
  5300. String publicId;
  5301. String systemId;
  5302. String baseUri;
  5303. ExternalIdentifiers()
  5304. {
  5305. }
  5306. ExternalIdentifiers(String publicId, String systemId, String baseUri)
  5307. {
  5308. this.publicId = publicId;
  5309. this.systemId = systemId;
  5310. this.baseUri = baseUri;
  5311. }
  5312. }
  5313. static class EntityInfo
  5314. {
  5315. int type;
  5316. ExternalIdentifiers ids;
  5317. String value;
  5318. String notationName;
  5319. }
  5320. static class AttributeDecl
  5321. {
  5322. String type;
  5323. String value;
  5324. int valueType;
  5325. String enumeration;
  5326. String defaultValue;
  5327. }
  5328. static class ElementDecl
  5329. {
  5330. int contentType;
  5331. String contentModel;
  5332. HashMap attributes;
  5333. }
  5334. static class Input
  5335. {
  5336. int sourceType;
  5337. URLConnection externalEntity;
  5338. char[] readBuffer;
  5339. int readBufferPos;
  5340. int readBufferLength;
  5341. int line;
  5342. int encoding;
  5343. int readBufferOverflow;
  5344. InputStream is;
  5345. int currentByteCount;
  5346. int column;
  5347. Reader reader;
  5348. }
  5349. }