feeddiscovery.php 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292
  1. <?php
  2. /*
  3. * StatusNet - the distributed open-source microblogging tool
  4. * Copyright (C) 2009, StatusNet, Inc.
  5. *
  6. * This program is free software: you can redistribute it and/or modify
  7. * it under the terms of the GNU Affero General Public License as published by
  8. * the Free Software Foundation, either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU Affero General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Affero General Public License
  17. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. /**
  20. * @package FeedSubPlugin
  21. * @maintainer Brion Vibber <brion@status.net>
  22. */
  // Stop immediately (exit status 1) unless the STATUSNET constant is defined,
  // so this file cannot be executed on its own.
  defined('STATUSNET') || exit(1);
  /**
   * Thrown by FeedDiscovery::discoverFromURL() when the HTTP request for the
   * given URL fails (an HTTP_Request2_Exception was caught); the message is
   * the message of the underlying HTTP exception.
   */
  class FeedSubBadURLException extends FeedSubException {}
  /**
   * Thrown by FeedDiscovery::initFromResponse() when the HTTP response is not
   * OK; constructed with the response status.
   */
  class FeedSubBadResponseException extends FeedSubException {}
  /**
   * Thrown by FeedDiscovery::initFromResponse() when the response body is
   * empty; constructed with the URL the response came from.
   */
  class FeedSubEmptyException extends FeedSubException {}
  /**
   * Thrown by FeedDiscovery::discoverFromHTML() when the HTML body cannot be
   * loaded into a DOMDocument.
   */
  class FeedSubBadHTMLException extends FeedSubException {}
  /**
   * Thrown by FeedDiscovery::initFromResponse() when the response's
   * Content-Type is not one of the accepted XML/RSS/Atom types; constructed
   * with the offending Content-Type value.
   */
  class FeedSubUnrecognizedTypeException extends FeedSubException {}
  /**
   * Thrown by FeedDiscovery::discoverFromURL() when an HTML page contains no
   * usable feed autodiscovery link; constructed with the page URL.
   */
  class FeedSubNoFeedException extends FeedSubException {}
  /**
   * Thrown by FeedDiscovery::init() when the body is not loadable XML, or its
   * document element is neither an <rss> containing a <channel> nor a <feed>;
   * constructed with the source URL.
   */
  class FeedSubBadXmlException extends FeedSubException {}
  /**
   * Not thrown anywhere in this file.
   * NOTE(review): presumably signals a feed without a PuSH hub link (see
   * FeedDiscovery::getHubLink()) -- confirm against the callers that use it.
   */
  class FeedSubNoHubException extends FeedSubException {}
  /**
   * Given a web page or feed URL, discover the final location of the feed
   * and load its current contents.
   *
   * After a successful discoverFromURL(), discoverFromFeedURL(),
   * initFromResponse() or init() call, the public properties describe the
   * feed that was loaded. If init() fails, the properties are left untouched.
   *
   * @example
   *   $feed = new FeedDiscovery();
   *   if ($feed->discoverFromURL($url)) {
   *       print $feed->uri;
   *       print $feed->type;
   *       processFeed($feed->feed); // DOMDocument
   *   }
   */
  class FeedDiscovery
  {
      /** @var string URL the feed was loaded from, as passed to init() */
      public $uri;

      /** @var string Content-Type value the feed was served with */
      public $type;

      /** @var DOMDocument the parsed feed document */
      public $feed;

      /** @var DOMElement the RSS <channel> element or the Atom <feed> element */
      public $root;

      /**
       * Post-initialize query helper: look up a link on the feed root.
       *
       * @fixme check for non-Atom links in RSS2 feeds as well
       *
       * @param string $rel  link relation to look for
       * @param string $type optional type, passed through to getAtomLink()
       * @return mixed whatever getAtomLink() returns
       */
      public function getLink($rel, $type=null)
      {
          // Instance dispatch (rather than self::) so that a subclass
          // overriding getAtomLink() is honoured, matching getHubLink().
          return $this->getAtomLink($rel, $type);
      }

      /**
       * Look up a link on the feed root via ActivityUtils::getLink().
       *
       * @param string $rel  link relation to look for
       * @param string $type optional type, passed through unchanged
       * @return mixed whatever ActivityUtils::getLink() returns
       */
      public function getAtomLink($rel, $type=null)
      {
          return ActivityUtils::getLink($this->root, $rel, $type);
      }

      /**
       * Get the referenced PuSH hub link from an Atom feed.
       *
       * @return mixed string or false
       */
      public function getHubLink()
      {
          return $this->getAtomLink('hub');
      }

      /**
       * Fetch a URL and initialize this object from the feed found there.
       *
       * When $htmlOk is true and the response is an HTML page, the page is
       * scanned for a feed autodiscovery link and that link is fetched instead
       * (one level only: the follow-up request is made with $htmlOk = false).
       *
       * @param string $url
       * @param bool $htmlOk pass false here if you don't want to follow web pages.
       * @return string URL of the feed that was loaded (see init())
       * @throws FeedSubBadURLException           the HTTP request failed
       * @throws FeedSubBadHTMLException          the HTML page could not be parsed
       * @throws FeedSubNoFeedException           the HTML page links to no feed
       * @throws FeedSubBadResponseException      the response was not OK
       * @throws FeedSubEmptyException            the response body was empty
       * @throws FeedSubUnrecognizedTypeException the Content-Type is not a feed type
       * @throws FeedSubBadXmlException           the feed body is not a usable RSS/Atom document
       */
      public function discoverFromURL($url, $htmlOk=true)
      {
          try {
              $client = new HTTPClient();
              $response = $client->get($url);
          } catch (HTTP_Request2_Exception $e) {
              common_log(LOG_ERR, __METHOD__ . " Failure for $url - " . $e->getMessage());
              throw new FeedSubBadURLException($e->getMessage());
          }

          if ($htmlOk) {
              // Cast defensively: preg_match() needs a string subject even if
              // the response carried no Content-Type header at all.
              $type = (string) $response->getHeader('Content-Type');
              if (preg_match('!^(text/html|application/xhtml\+xml)!i', $type)) {
                  $target = $this->discoverFromHTML($response->getUrl(), $response->getBody());
                  if (!$target) {
                      throw new FeedSubNoFeedException($url);
                  }
                  return $this->discoverFromURL($target, false);
              }
          }

          return $this->initFromResponse($response);
      }

      /**
       * Same as discoverFromURL(), but never follows HTML pages.
       *
       * @param string $url
       * @return string URL of the feed that was loaded
       */
      public function discoverFromFeedURL($url)
      {
          return $this->discoverFromURL($url, false);
      }

      /**
       * Initialize this object from an already-fetched HTTP response.
       *
       * @param object $response HTTP response; must offer isOk(), getStatus(),
       *                         getUrl(), getBody() and getHeader()
       * @return string URL of the feed that was loaded
       * @throws FeedSubBadResponseException      the response was not OK
       * @throws FeedSubEmptyException            the response body was empty
       * @throws FeedSubUnrecognizedTypeException the Content-Type is not a feed type
       * @throws FeedSubBadXmlException           see init()
       */
      public function initFromResponse($response)
      {
          if (!$response->isOk()) {
              throw new FeedSubBadResponseException($response->getStatus());
          }

          $sourceurl = $response->getUrl();
          $body = $response->getBody();
          if (!$body) {
              throw new FeedSubEmptyException($sourceurl);
          }

          // Cast defensively so a missing header cannot reach preg_match() as null.
          $type = (string) $response->getHeader('Content-Type');
          if (!preg_match('!^(text/xml|application/xml|application/(rss|atom)\+xml)!i', $type)) {
              common_log(LOG_WARNING, "Unrecognized feed type $type for $sourceurl");
              throw new FeedSubUnrecognizedTypeException($type);
          }

          return $this->init($sourceurl, $type, $body);
      }

      /**
       * Parse a feed body and, if it is a usable RSS or Atom document, store it
       * on this object.
       *
       * The document is fully validated before any property is assigned, so a
       * failed call leaves $uri, $type, $feed and $root exactly as they were.
       *
       * @param string $sourceurl URL the body was fetched from
       * @param string $type      Content-Type the body was served with
       * @param string $body      XML text of the feed
       * @return string $sourceurl
       * @throws FeedSubBadXmlException the body is empty, is not loadable XML,
       *         or its document element is neither <rss> with a <channel>
       *         nor <feed>
       */
      public function init($sourceurl, $type, $body)
      {
          // DOMDocument::loadXML() refuses an empty string outright (a
          // ValueError on PHP 8); report it as bad XML like any other failure.
          if ($body === null || $body === false || $body === '') {
              throw new FeedSubBadXmlException($sourceurl);
          }

          $feed = new DOMDocument();
          if (!$feed->loadXML($body)) {
              throw new FeedSubBadXmlException($sourceurl);
          }

          // Looking for the "root" element: RSS channel or Atom feed
          $el = $feed->documentElement;
          $root = null;
          if ($el->tagName === 'rss') {
              $channels = $el->getElementsByTagName('channel');
              if ($channels->length > 0) {
                  $root = $channels->item(0);
              }
          } else if ($el->tagName === 'feed') {
              $root = $el;
          }
          if ($root === null) {
              throw new FeedSubBadXmlException($sourceurl);
          }

          $this->uri = $sourceurl;
          $this->type = $type;
          $this->feed = $feed;
          $this->root = $root;

          return $this->uri;
      }

      /**
       * Scan an HTML page for feed autodiscovery links.
       *
       * Looks at <link> elements that have rel, type and href attributes, whose
       * rel contains the token "alternate" and whose type is Atom or RSS. The
       * first link of each type is remembered; Atom is preferred over RSS.
       * Relative links are resolved against the page's first <base href>, or
       * against $url when there is none.
       *
       * @param string $url source URL, used to resolve relative links
       * @param string $body HTML body text
       * @return mixed string with URL or false if no target found
       * @throws FeedSubBadHTMLException the body is empty or cannot be parsed
       */
      public function discoverFromHTML($url, $body)
      {
          // DOMDocument::loadHTML() cannot take an empty document (a warning
          // and false on older PHP, a ValueError on PHP 8).
          if ($body === null || $body === false || $body === '') {
              throw new FeedSubBadHTMLException();
          }

          // DOMDocument::loadHTML may complain about unrecognized elements and
          // namespaces. Have libxml collect those internally instead of
          // raising PHP warnings, then restore the caller's setting.
          $previous = libxml_use_internal_errors(true);
          $dom = new DOMDocument();
          $ok = $dom->loadHTML($body);
          libxml_clear_errors();
          libxml_use_internal_errors($previous);
          if (!$ok) {
              throw new FeedSubBadHTMLException();
          }

          // Autodiscovery links may be relative to the page's URL or <base href>.
          // Per HTML, only the first <base> carrying an href counts.
          $base = false;
          $nodes = $dom->getElementsByTagName('base');
          for ($i = 0; $i < $nodes->length; $i++) {
              $node = $nodes->item($i);
              if ($node->hasAttribute('href')) {
                  $base = trim($node->getAttribute('href'));
                  break;
              }
          }
          $base = $base ? $this->resolveURI($base, $url) : $url;

          // Ok... now on to the links!
          // Types listed in order of priority -- we'll prefer Atom if available.
          // @fixme merge with the munger link checks
          $feeds = array(
              'application/atom+xml' => false,
              'application/rss+xml' => false,
          );

          $nodes = $dom->getElementsByTagName('link');
          for ($i = 0; $i < $nodes->length; $i++) {
              $node = $nodes->item($i);
              if (!$node->hasAttribute('rel')
                  || !$node->hasAttribute('type')
                  || !$node->hasAttribute('href')) {
                  continue;
              }

              // rel is a whitespace-separated, case-insensitive token list.
              $rels = preg_split('/\s+/', strtolower($node->getAttribute('rel')), -1, PREG_SPLIT_NO_EMPTY);

              // MIME types are case-insensitive and may carry ";param=..." suffixes.
              $typeParts = explode(';', $node->getAttribute('type'), 2);
              $type = strtolower(trim($typeParts[0]));

              $href = trim($node->getAttribute('href'));

              if (in_array('alternate', $rels, true)
                  && array_key_exists($type, $feeds)
                  && empty($feeds[$type])) {
                  // Save the first feed found of each type...
                  $feeds[$type] = $this->resolveURI($href, $base);
              }
          }

          // Return the highest-priority feed found
          foreach ($feeds as $feedUrl) {
              if ($feedUrl) {
                  return $feedUrl;
              }
          }

          return false;
      }

      /**
       * Resolve a possibly relative URL against some absolute base URL
       *
       * @param string $rel relative or absolute URL
       * @param string $base absolute URL
       * @return string absolute URL, or original URL if could not be resolved.
       */
      public function resolveURI($rel, $base)
      {
          require_once "Net/URL2.php";
          try {
              $relUrl = new Net_URL2($rel);
              if ($relUrl->isAbsolute()) {
                  return $rel;
              }
              $baseUrl = new Net_URL2($base);
              $absUrl = $baseUrl->resolve($relUrl);
              return $absUrl->getURL();
          } catch (Exception $e) {
              // Best effort: log the failure and hand back the input unchanged.
              common_log(LOG_WARNING, 'Unable to resolve relative link "' .
                  $rel . '" against base "' . $base . '": ' . $e->getMessage());
              return $rel;
          }
      }
  }