feeddiscovery.php 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296
  1. <?php
  2. /*
  3. * StatusNet - the distributed open-source microblogging tool
  4. * Copyright (C) 2009, StatusNet, Inc.
  5. *
  6. * This program is free software: you can redistribute it and/or modify
  7. * it under the terms of the GNU Affero General Public License as published by
  8. * the Free Software Foundation, either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU Affero General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Affero General Public License
  17. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. /**
  20. * @package FeedSubPlugin
  21. * @maintainer Brion Vibber <brion@status.net>
  22. */
  23. if (!defined('STATUSNET')) {
  24. exit(1);
  25. }
  26. class FeedSubBadURLException extends FeedSubException
  27. {
  28. }
  29. class FeedSubBadResponseException extends FeedSubException
  30. {
  31. }
  32. class FeedSubEmptyException extends FeedSubException
  33. {
  34. }
  35. class FeedSubBadHTMLException extends FeedSubException
  36. {
  37. }
  38. class FeedSubUnrecognizedTypeException extends FeedSubException
  39. {
  40. }
  41. class FeedSubNoFeedException extends FeedSubException
  42. {
  43. }
  44. class FeedSubNoSalmonException extends FeedSubException
  45. {
  46. }
  47. class FeedSubBadXmlException extends FeedSubException
  48. {
  49. }
  50. class FeedSubNoHubException extends FeedSubException
  51. {
  52. }
  53. /**
  54. * Given a web page or feed URL, discover the final location of the feed
  55. * and return its current contents.
  56. *
  57. * @example
  58. * $feed = new FeedDiscovery();
  59. * if ($feed->discoverFromURL($url)) {
  60. * print $feed->uri;
  61. * print $feed->type;
  62. * processFeed($feed->feed); // DOMDocument
  63. * }
  64. */
  65. class FeedDiscovery
  66. {
  67. public $uri;
  68. public $type;
  69. public $feed;
  70. public $root;
  71. /** Post-initialize query helper... */
  72. public function getLink($rel, $type=null)
  73. {
  74. // @fixme check for non-Atom links in RSS2 feeds as well
  75. return self::getAtomLink($rel, $type);
  76. }
  77. public function getAtomLink($rel, $type=null)
  78. {
  79. return ActivityUtils::getLink($this->root, $rel, $type);
  80. }
  81. /**
  82. * Get the referenced WebSub hub link from an Atom feed.
  83. *
  84. * @return mixed string or false
  85. */
  86. public function getHubLink()
  87. {
  88. return $this->getAtomLink('hub');
  89. }
  90. /**
  91. * @param string $url
  92. * @param bool $htmlOk pass false here if you don't want to follow web pages.
  93. * @return string with validated URL
  94. * @throws FeedSubBadURLException
  95. * @throws FeedSubBadHtmlException
  96. * @throws FeedSubNoFeedException
  97. * @throws FeedSubEmptyException
  98. * @throws FeedSubUnrecognizedTypeException
  99. */
  100. function discoverFromURL($url, $htmlOk=true)
  101. {
  102. try {
  103. $client = new HTTPClient();
  104. $response = $client->get($url);
  105. } catch (Exception $e) {
  106. common_log(LOG_ERR, __METHOD__ . " Failure for $url - " . $e->getMessage());
  107. throw new FeedSubBadURLException($e->getMessage());
  108. }
  109. if ($htmlOk) {
  110. $type = $response->getHeader('Content-Type');
  111. $isHtml = preg_match('!^(text/html|application/xhtml\+xml)!i', $type);
  112. if ($isHtml) {
  113. $target = $this->discoverFromHTML($response->getEffectiveUrl(), $response->getBody());
  114. if (!$target) {
  115. throw new FeedSubNoFeedException($url);
  116. }
  117. return $this->discoverFromURL($target, false);
  118. }
  119. }
  120. return $this->initFromResponse($response);
  121. }
  122. function discoverFromFeedURL($url)
  123. {
  124. return $this->discoverFromURL($url, false);
  125. }
  126. function initFromResponse($response)
  127. {
  128. if (!$response->isOk()) {
  129. throw new FeedSubBadResponseException($response->getStatus());
  130. }
  131. $sourceurl = $response->getEffectiveUrl();
  132. $body = $response->getBody();
  133. if (!$body) {
  134. throw new FeedSubEmptyException($sourceurl);
  135. }
  136. $type = $response->getHeader('Content-Type');
  137. if (preg_match('!^(text/xml|application/xml|application/(rss|atom)\+xml)!i', $type)) {
  138. return $this->init($sourceurl, $type, $body);
  139. } else {
  140. common_log(LOG_WARNING, "Unrecognized feed type $type for $sourceurl");
  141. throw new FeedSubUnrecognizedTypeException($type);
  142. }
  143. }
  144. function init($sourceurl, $type, $body)
  145. {
  146. $feed = new DOMDocument();
  147. if ($feed->loadXML($body)) {
  148. $this->uri = $sourceurl;
  149. $this->type = $type;
  150. $this->feed = $feed;
  151. $el = $this->feed->documentElement;
  152. // Looking for the "root" element: RSS channel or Atom feed
  153. if ($el->tagName == 'rss') {
  154. $channels = $el->getElementsByTagName('channel');
  155. if ($channels->length > 0) {
  156. $this->root = $channels->item(0);
  157. } else {
  158. throw new FeedSubBadXmlException($sourceurl);
  159. }
  160. } else if ($el->tagName == 'feed') {
  161. $this->root = $el;
  162. } else {
  163. throw new FeedSubBadXmlException($sourceurl);
  164. }
  165. return $this->uri;
  166. } else {
  167. throw new FeedSubBadXmlException($sourceurl);
  168. }
  169. }
  170. /**
  171. * @param string $url source URL, used to resolve relative links
  172. * @param string $body HTML body text
  173. * @return mixed string with URL or false if no target found
  174. */
  175. function discoverFromHTML($url, $body)
  176. {
  177. // DOMDocument::loadHTML may throw warnings on unrecognized elements,
  178. // and notices on unrecognized namespaces.
  179. $old = error_reporting(error_reporting() & ~(E_WARNING | E_NOTICE));
  180. $dom = new DOMDocument();
  181. $ok = $dom->loadHTML($body);
  182. error_reporting($old);
  183. if (!$ok) {
  184. throw new FeedSubBadHtmlException();
  185. }
  186. // Autodiscovery links may be relative to the page's URL or <base href>
  187. $base = false;
  188. $nodes = $dom->getElementsByTagName('base');
  189. for ($i = 0; $i < $nodes->length; $i++) {
  190. $node = $nodes->item($i);
  191. if ($node->hasAttributes()) {
  192. $href = $node->attributes->getNamedItem('href');
  193. if ($href) {
  194. $base = trim($href->value);
  195. }
  196. }
  197. }
  198. if ($base) {
  199. $base = $this->resolveURI($base, $url);
  200. } else {
  201. $base = $url;
  202. }
  203. // Ok... now on to the links!
  204. // Types listed in order of priority -- we'll prefer Atom if available.
  205. // @fixme merge with the munger link checks
  206. $feeds = array(
  207. 'application/atom+xml' => false,
  208. 'application/rss+xml' => false,
  209. );
  210. $nodes = $dom->getElementsByTagName('link');
  211. for ($i = 0; $i < $nodes->length; $i++) {
  212. $node = $nodes->item($i);
  213. if ($node->hasAttributes()) {
  214. $rel = $node->attributes->getNamedItem('rel');
  215. $type = $node->attributes->getNamedItem('type');
  216. $href = $node->attributes->getNamedItem('href');
  217. if ($rel && $type && $href) {
  218. $rel = array_filter(explode(" ", $rel->value));
  219. $type = trim($type->value);
  220. $href = trim($href->value);
  221. if (in_array('alternate', $rel) && array_key_exists($type, $feeds) && empty($feeds[$type])) {
  222. // Save the first feed found of each type...
  223. $feeds[$type] = $this->resolveURI($href, $base);
  224. }
  225. }
  226. }
  227. }
  228. // Return the highest-priority feed found
  229. foreach ($feeds as $type => $url) {
  230. if ($url) {
  231. return $url;
  232. }
  233. }
  234. return false;
  235. }
  236. /**
  237. * Resolve a possibly relative URL against some absolute base URL
  238. * @param string $rel relative or absolute URL
  239. * @param string $base absolute URL
  240. * @return string absolute URL, or original URL if could not be resolved.
  241. */
  242. function resolveURI($rel, $base)
  243. {
  244. require_once "Net/URL2.php";
  245. try {
  246. $relUrl = new Net_URL2($rel);
  247. if ($relUrl->isAbsolute()) {
  248. return $rel;
  249. }
  250. $baseUrl = new Net_URL2($base);
  251. $absUrl = $baseUrl->resolve($relUrl);
  252. return $absUrl->getURL();
  253. } catch (Exception $e) {
  254. common_log(LOG_WARNING, 'Unable to resolve relative link "' .
  255. $rel . '" against base "' . $base . '": ' . $e->getMessage());
  256. return $rel;
  257. }
  258. }
  259. }