<?php

/**
 * This module implements a VERY limited parser that finds <link> tags
 * in the head of HTML or XHTML documents and parses out their
 * attributes according to the OpenID spec. It is a liberal parser,
 * but it requires these things from the data in order to work:
 *
 * - There must be an open <html> tag
 *
 * - There must be an open <head> tag inside of the <html> tag
 *
 * - Only <link>s that are found inside of the <head> tag are parsed
 *   (this is by design)
 *
 * - The parser follows the OpenID specification in resolving the
 *   attributes of the link tags. This means that the attributes DO
 *   NOT get resolved as they would by an XML or HTML parser. In
 *   particular, only certain entities get replaced, and href
 *   attributes do not get resolved relative to a base URL.
 *
 * From http://openid.net/specs.bml:
 *
 * - The openid.server URL MUST be an absolute URL. OpenID consumers
 *   MUST NOT attempt to resolve relative URLs.
 *
 * - The openid.server URL MUST NOT include entities other than &amp;,
 *   &lt;, &gt;, and &quot;.
 *
 * The parser ignores SGML comments and <![CDATA[blocks]]>. Both kinds
 * of quoting are allowed for attributes.
 *
 * The parser deals with invalid markup in these ways:
 *
 * - Tag names are not case-sensitive
 *
 * - The <html> tag is accepted even when it is not at the top level
 *
 * - The <head> tag is accepted even when it is not a direct child of
 *   the <html> tag, but a <html> tag must be an ancestor of the
 *   <head> tag
 *
 * - <link> tags are accepted even when they are not direct children
 *   of the <head> tag, but a <head> tag must be an ancestor of the
 *   <link> tag
 *
 * - If there is no closing tag for an open <html> or <head> tag, the
 *   remainder of the document is viewed as being inside of the
 *   tag. If there is no closing tag for a <link> tag, the link tag is
 *   treated as a short tag. Exceptions to this rule are that <html>
 *   closes <html> and <body> or <head> closes <head>
 *
 * - Attributes of the <link> tag are not required to be quoted.
 *
 * - In the case of duplicated attribute names, the attribute coming
 *   last in the tag will be the value returned.
 *
 * - Any text that does not parse as an attribute within a link tag
 *   will be ignored. (e.g. <link pumpkin rel='openid.server' /> will
 *   ignore pumpkin)
 *
 * - If there is more than one <html> or <head> tag, the parser only
 *   looks inside of the first one.
 *
 * - The contents of <script> tags are ignored entirely, except
 *   unclosed <script> tags. Unclosed <script> tags are ignored.
 *
 * - Any other invalid markup is ignored, including unclosed SGML
 *   comments and unclosed <![CDATA[blocks.
 *
 * PHP versions 4 and 5
 *
 * LICENSE: See the COPYING file included in this distribution.
 *
 * @access private
 * @package OpenID
 * @author JanRain, Inc. <openid@janrain.com>
 * @copyright 2005-2008 Janrain, Inc.
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache
 */

/**
 * Require Auth_OpenID::arrayGet().
 */
require_once "Auth/OpenID.php";

  84. class Auth_OpenID_Parse {
  85. /**
  86. * Specify some flags for use with regex matching.
  87. */
  88. var $_re_flags = "si";
  89. /**
  90. * Stuff to remove before we start looking for tags
  91. */
  92. var $_removed_re =
  93. "<!--.*?-->|<!\[CDATA\[.*?\]\]>|<script\b(?!:)[^>]*>.*?<\/script>";
  94. /**
  95. * Starts with the tag name at a word boundary, where the tag name
  96. * is not a namespace
  97. */
  98. var $_tag_expr = "<%s\b(?!:)([^>]*?)(?:\/>|>(.*)(?:<\/?%s\s*>|\Z))";
  99. var $_attr_find = '\b(\w+)=("[^"]*"|\'[^\']*\'|[^\'"\s\/<>]+)';
  100. var $_open_tag_expr = "<%s\b";
  101. var $_close_tag_expr = "<((\/%s\b)|(%s[^>\/]*\/))>";
  102. function Auth_OpenID_Parse()
  103. {
  104. $this->_link_find = sprintf("/<link\b(?!:)([^>]*)(?!<)>/%s",
  105. $this->_re_flags);
  106. $this->_entity_replacements = array(
  107. 'amp' => '&',
  108. 'lt' => '<',
  109. 'gt' => '>',
  110. 'quot' => '"'
  111. );
  112. $this->_attr_find = sprintf("/%s/%s",
  113. $this->_attr_find,
  114. $this->_re_flags);
  115. $this->_removed_re = sprintf("/%s/%s",
  116. $this->_removed_re,
  117. $this->_re_flags);
  118. $this->_ent_replace =
  119. sprintf("&(%s);", implode("|",
  120. $this->_entity_replacements));
  121. }
  122. /**
  123. * Returns a regular expression that will match a given tag in an
  124. * SGML string.
  125. */
  126. function tagMatcher($tag_name, $close_tags = null)
  127. {
  128. $expr = $this->_tag_expr;
  129. if ($close_tags) {
  130. $options = implode("|", array_merge(array($tag_name), $close_tags));
  131. $closer = sprintf("(?:%s)", $options);
  132. } else {
  133. $closer = $tag_name;
  134. }
  135. $expr = sprintf($expr, $tag_name, $closer);
  136. return sprintf("/%s/%s", $expr, $this->_re_flags);
  137. }
  138. function openTag($tag_name)
  139. {
  140. $expr = sprintf($this->_open_tag_expr, $tag_name);
  141. return sprintf("/%s/%s", $expr, $this->_re_flags);
  142. }
  143. function closeTag($tag_name)
  144. {
  145. $expr = sprintf($this->_close_tag_expr, $tag_name, $tag_name);
  146. return sprintf("/%s/%s", $expr, $this->_re_flags);
  147. }
  148. function htmlBegin($s)
  149. {
  150. $matches = array();
  151. $result = preg_match($this->openTag('html'), $s,
  152. $matches, PREG_OFFSET_CAPTURE);
  153. if ($result === false || !$matches) {
  154. return false;
  155. }
  156. // Return the offset of the first match.
  157. return $matches[0][1];
  158. }
  159. function htmlEnd($s)
  160. {
  161. $matches = array();
  162. $result = preg_match($this->closeTag('html'), $s,
  163. $matches, PREG_OFFSET_CAPTURE);
  164. if ($result === false || !$matches) {
  165. return false;
  166. }
  167. // Return the offset of the first match.
  168. return $matches[count($matches) - 1][1];
  169. }
  170. function headFind()
  171. {
  172. return $this->tagMatcher('head', array('body', 'html'));
  173. }
  174. function replaceEntities($str)
  175. {
  176. foreach ($this->_entity_replacements as $old => $new) {
  177. $str = preg_replace(sprintf("/&%s;/", $old), $new, $str);
  178. }
  179. return $str;
  180. }
  181. function removeQuotes($str)
  182. {
  183. $matches = array();
  184. $double = '/^"(.*)"$/';
  185. $single = "/^\'(.*)\'$/";
  186. if (preg_match($double, $str, $matches)) {
  187. return $matches[1];
  188. } else if (preg_match($single, $str, $matches)) {
  189. return $matches[1];
  190. } else {
  191. return $str;
  192. }
  193. }
  194. function match($regexp, $text, &$match)
  195. {
  196. if (preg_match($regexp, $text, $match)) {
  197. return true;
  198. }
  199. return false;
  200. }
  201. /**
  202. * Find all link tags in a string representing a HTML document and
  203. * return a list of their attributes.
  204. *
  205. * @todo This is quite ineffective and may fail with the default
  206. * pcre.backtrack_limit of 100000 in PHP 5.2, if $html is big.
  207. * It should rather use stripos (in PHP5) or strpos()+strtoupper()
  208. * in PHP4 to manage this.
  209. *
  210. * @param string $html The text to parse
  211. * @return array $list An array of arrays of attributes, one for each
  212. * link tag
  213. */
  214. function parseLinkAttrs($html)
  215. {
  216. $stripped = preg_replace($this->_removed_re,
  217. "",
  218. $html);
  219. $html_begin = $this->htmlBegin($stripped);
  220. $html_end = $this->htmlEnd($stripped);
  221. if ($html_begin === false) {
  222. return array();
  223. }
  224. if ($html_end === false) {
  225. $html_end = strlen($stripped);
  226. }
  227. $stripped = substr($stripped, $html_begin,
  228. $html_end - $html_begin);
  229. // Workaround to prevent PREG_BACKTRACK_LIMIT_ERROR:
  230. $old_btlimit = ini_set( 'pcre.backtrack_limit', -1 );
  231. // Try to find the <HEAD> tag.
  232. $head_re = $this->headFind();
  233. $head_match = array();
  234. if (!$this->match($head_re, $stripped, $head_match)) {
  235. ini_set( 'pcre.backtrack_limit', $old_btlimit );
  236. return array();
  237. }
  238. $link_data = array();
  239. $link_matches = array();
  240. if (!preg_match_all($this->_link_find, $head_match[0],
  241. $link_matches)) {
  242. ini_set( 'pcre.backtrack_limit', $old_btlimit );
  243. return array();
  244. }
  245. foreach ($link_matches[0] as $link) {
  246. $attr_matches = array();
  247. preg_match_all($this->_attr_find, $link, $attr_matches);
  248. $link_attrs = array();
  249. foreach ($attr_matches[0] as $index => $full_match) {
  250. $name = $attr_matches[1][$index];
  251. $value = $this->replaceEntities(
  252. $this->removeQuotes($attr_matches[2][$index]));
  253. $link_attrs[strtolower($name)] = $value;
  254. }
  255. $link_data[] = $link_attrs;
  256. }
  257. ini_set( 'pcre.backtrack_limit', $old_btlimit );
  258. return $link_data;
  259. }
  260. function relMatches($rel_attr, $target_rel)
  261. {
  262. // Does this target_rel appear in the rel_str?
  263. // XXX: TESTME
  264. $rels = preg_split("/\s+/", trim($rel_attr));
  265. foreach ($rels as $rel) {
  266. $rel = strtolower($rel);
  267. if ($rel == $target_rel) {
  268. return 1;
  269. }
  270. }
  271. return 0;
  272. }
  273. function linkHasRel($link_attrs, $target_rel)
  274. {
  275. // Does this link have target_rel as a relationship?
  276. // XXX: TESTME
  277. $rel_attr = Auth_OpeniD::arrayGet($link_attrs, 'rel', null);
  278. return ($rel_attr && $this->relMatches($rel_attr,
  279. $target_rel));
  280. }
  281. function findLinksRel($link_attrs_list, $target_rel)
  282. {
  283. // Filter the list of link attributes on whether it has
  284. // target_rel as a relationship.
  285. // XXX: TESTME
  286. $result = array();
  287. foreach ($link_attrs_list as $attr) {
  288. if ($this->linkHasRel($attr, $target_rel)) {
  289. $result[] = $attr;
  290. }
  291. }
  292. return $result;
  293. }
  294. function findFirstHref($link_attrs_list, $target_rel)
  295. {
  296. // Return the value of the href attribute for the first link
  297. // tag in the list that has target_rel as a relationship.
  298. // XXX: TESTME
  299. $matches = $this->findLinksRel($link_attrs_list,
  300. $target_rel);
  301. if (!$matches) {
  302. return null;
  303. }
  304. $first = $matches[0];
  305. return Auth_OpenID::arrayGet($first, 'href', null);
  306. }
  307. }
  308. function Auth_OpenID_legacy_discover($html_text, $server_rel,
  309. $delegate_rel)
  310. {
  311. $p = new Auth_OpenID_Parse();
  312. $link_attrs = $p->parseLinkAttrs($html_text);
  313. $server_url = $p->findFirstHref($link_attrs,
  314. $server_rel);
  315. if ($server_url === null) {
  316. return false;
  317. } else {
  318. $delegate_url = $p->findFirstHref($link_attrs,
  319. $delegate_rel);
  320. return array($delegate_url, $server_url);
  321. }
  322. }