HTML5.php 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  1. <?php
  2. namespace Masterminds;
  3. use Masterminds\HTML5\Parser\DOMTreeBuilder;
  4. use Masterminds\HTML5\Parser\Scanner;
  5. use Masterminds\HTML5\Parser\Tokenizer;
  6. use Masterminds\HTML5\Serializer\OutputRules;
  7. use Masterminds\HTML5\Serializer\Traverser;
  8. /**
  9. * This class offers convenience methods for parsing and serializing HTML5.
  10. * It is roughly designed to mirror the \DOMDocument native class.
  11. */
  12. class HTML5
  13. {
  14. /**
  15. * Global options for the parser and serializer.
  16. *
  17. * @var array
  18. */
  19. private $defaultOptions = array(
  20. // Whether the serializer should aggressively encode all characters as entities.
  21. 'encode_entities' => false,
  22. // Prevents the parser from automatically assigning the HTML5 namespace to the DOM document.
  23. 'disable_html_ns' => false,
  24. );
  25. protected $errors = array();
  26. public function __construct(array $defaultOptions = array())
  27. {
  28. $this->defaultOptions = array_merge($this->defaultOptions, $defaultOptions);
  29. }
  30. /**
  31. * Get the current default options.
  32. *
  33. * @return array
  34. */
  35. public function getOptions()
  36. {
  37. return $this->defaultOptions;
  38. }
  39. /**
  40. * Load and parse an HTML file.
  41. *
  42. * This will apply the HTML5 parser, which is tolerant of many
  43. * varieties of HTML, including XHTML 1, HTML 4, and well-formed HTML
  44. * 3. Note that in these cases, not all of the old data will be
  45. * preserved. For example, XHTML's XML declaration will be removed.
  46. *
  47. * The rules governing parsing are set out in the HTML 5 spec.
  48. *
  49. * @param string|resource $file The path to the file to parse. If this is a resource, it is
  50. * assumed to be an open stream whose pointer is set to the first
  51. * byte of input.
  52. * @param array $options Configuration options when parsing the HTML.
  53. *
  54. * @return \DOMDocument A DOM document. These object type is defined by the libxml
  55. * library, and should have been included with your version of PHP.
  56. */
  57. public function load($file, array $options = array())
  58. {
  59. // Handle the case where file is a resource.
  60. if (is_resource($file)) {
  61. return $this->parse(stream_get_contents($file), $options);
  62. }
  63. return $this->parse(file_get_contents($file), $options);
  64. }
  65. /**
  66. * Parse a HTML Document from a string.
  67. *
  68. * Take a string of HTML 5 (or earlier) and parse it into a
  69. * DOMDocument.
  70. *
  71. * @param string $string A html5 document as a string.
  72. * @param array $options Configuration options when parsing the HTML.
  73. *
  74. * @return \DOMDocument A DOM document. DOM is part of libxml, which is included with
  75. * almost all distribtions of PHP.
  76. */
  77. public function loadHTML($string, array $options = array())
  78. {
  79. return $this->parse($string, $options);
  80. }
  81. /**
  82. * Convenience function to load an HTML file.
  83. *
  84. * This is here to provide backwards compatibility with the
  85. * PHP DOM implementation. It simply calls load().
  86. *
  87. * @param string $file The path to the file to parse. If this is a resource, it is
  88. * assumed to be an open stream whose pointer is set to the first
  89. * byte of input.
  90. * @param array $options Configuration options when parsing the HTML.
  91. *
  92. * @return \DOMDocument A DOM document. These object type is defined by the libxml
  93. * library, and should have been included with your version of PHP.
  94. */
  95. public function loadHTMLFile($file, array $options = array())
  96. {
  97. return $this->load($file, $options);
  98. }
  99. /**
  100. * Parse a HTML fragment from a string.
  101. *
  102. * @param string $string the HTML5 fragment as a string
  103. * @param array $options Configuration options when parsing the HTML
  104. *
  105. * @return \DOMDocumentFragment A DOM fragment. The DOM is part of libxml, which is included with
  106. * almost all distributions of PHP.
  107. */
  108. public function loadHTMLFragment($string, array $options = array())
  109. {
  110. return $this->parseFragment($string, $options);
  111. }
  112. /**
  113. * Return all errors encountered into parsing phase.
  114. *
  115. * @return array
  116. */
  117. public function getErrors()
  118. {
  119. return $this->errors;
  120. }
  121. /**
  122. * Return true it some errors were encountered into parsing phase.
  123. *
  124. * @return bool
  125. */
  126. public function hasErrors()
  127. {
  128. return count($this->errors) > 0;
  129. }
  130. /**
  131. * Parse an input string.
  132. *
  133. * @param string $input
  134. * @param array $options
  135. *
  136. * @return \DOMDocument
  137. */
  138. public function parse($input, array $options = array())
  139. {
  140. $this->errors = array();
  141. $options = array_merge($this->defaultOptions, $options);
  142. $events = new DOMTreeBuilder(false, $options);
  143. $scanner = new Scanner($input, !empty($options['encoding']) ? $options['encoding'] : 'UTF-8');
  144. $parser = new Tokenizer($scanner, $events, !empty($options['xmlNamespaces']) ? Tokenizer::CONFORMANT_XML : Tokenizer::CONFORMANT_HTML);
  145. $parser->parse();
  146. $this->errors = $events->getErrors();
  147. return $events->document();
  148. }
  149. /**
  150. * Parse an input stream where the stream is a fragment.
  151. *
  152. * Lower-level loading function. This requires an input stream instead
  153. * of a string, file, or resource.
  154. *
  155. * @param string $input The input data to parse in the form of a string.
  156. * @param array $options An array of options.
  157. *
  158. * @return \DOMDocumentFragment
  159. */
  160. public function parseFragment($input, array $options = array())
  161. {
  162. $options = array_merge($this->defaultOptions, $options);
  163. $events = new DOMTreeBuilder(true, $options);
  164. $scanner = new Scanner($input, !empty($options['encoding']) ? $options['encoding'] : 'UTF-8');
  165. $parser = new Tokenizer($scanner, $events, !empty($options['xmlNamespaces']) ? Tokenizer::CONFORMANT_XML : Tokenizer::CONFORMANT_HTML);
  166. $parser->parse();
  167. $this->errors = $events->getErrors();
  168. return $events->fragment();
  169. }
  170. /**
  171. * Save a DOM into a given file as HTML5.
  172. *
  173. * @param mixed $dom The DOM to be serialized.
  174. * @param string|resource $file The filename to be written or resource to write to.
  175. * @param array $options Configuration options when serializing the DOM. These include:
  176. * - encode_entities: Text written to the output is escaped by default and not all
  177. * entities are encoded. If this is set to true all entities will be encoded.
  178. * Defaults to false.
  179. */
  180. public function save($dom, $file, $options = array())
  181. {
  182. $close = true;
  183. if (is_resource($file)) {
  184. $stream = $file;
  185. $close = false;
  186. } else {
  187. $stream = fopen($file, 'wb');
  188. }
  189. $options = array_merge($this->defaultOptions, $options);
  190. $rules = new OutputRules($stream, $options);
  191. $trav = new Traverser($dom, $stream, $rules, $options);
  192. $trav->walk();
  193. /*
  194. * release the traverser to avoid cyclic references and allow PHP to free memory without waiting for gc_collect_cycles
  195. */
  196. $rules->unsetTraverser();
  197. if ($close) {
  198. fclose($stream);
  199. }
  200. }
  201. /**
  202. * Convert a DOM into an HTML5 string.
  203. *
  204. * @param mixed $dom The DOM to be serialized.
  205. * @param array $options Configuration options when serializing the DOM. These include:
  206. * - encode_entities: Text written to the output is escaped by default and not all
  207. * entities are encoded. If this is set to true all entities will be encoded.
  208. * Defaults to false.
  209. *
  210. * @return string A HTML5 documented generated from the DOM.
  211. */
  212. public function saveHTML($dom, $options = array())
  213. {
  214. $stream = fopen('php://temp', 'wb');
  215. $this->save($dom, $stream, array_merge($this->defaultOptions, $options));
  216. $html = stream_get_contents($stream, -1, 0);
  217. fclose($stream);
  218. return $html;
  219. }
  220. }