index.php 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451
  1. <?php /* -*- mode: PHP; coding: utf-8; indent-tabs-mode: t; tab-width: 4 -*-
  2. vim: ts=4 noet ai */
  3. /**
  4. This file is part of OpenCrawler.
  5. OpenCrawler: Allows visitors to wander the Web
  6. Copyright © 2016 OpenCrawler Developers
  7. This program is free software: you can redistribute it and/or modify
  8. it under the terms of the GNU Affero General Public License as published by
  9. the Free Software Foundation, either version 3 of the License, or
  10. (at your option) any later version.
  11. This program is distributed in the hope that it will be useful,
  12. but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. GNU Affero General Public License for more details.
  15. You should have received a copy of the GNU Affero General Public License
  16. along with this program. If not, see <https://www.gnu.org/licenses/>.
  17. @license AGPL-3+
  18. @file
  19. */
  namespace esperecyan\url\lib {
      /**
       * Namespace-local stand-in for the global idn_to_ascii().
       *
       * Forwards every argument to \idn_to_ascii() and, when that call yields a
       * falsy result, hands back the first argument unchanged instead.
       *
       * @param mixed ...$args Arguments accepted by \idn_to_ascii(); the first is the domain.
       * @return mixed The converted domain, or the first argument when conversion fails.
       */
      function idn_to_ascii(...$args)
      {
          $converted = \idn_to_ascii(...$args);
          if ($converted) {
              return $converted;
          }
          return $args[0];
      }

      /**
       * Namespace-local stand-in for the global idn_to_utf8().
       *
       * Forwards every argument to \idn_to_utf8() and, when that call yields a
       * falsy result, hands back the first argument unchanged instead.
       *
       * @param mixed ...$args Arguments accepted by \idn_to_utf8(); the first is the domain.
       * @return mixed The converted domain, or the first argument when conversion fails.
       */
      function idn_to_utf8(...$args)
      {
          $converted = \idn_to_utf8(...$args);
          if ($converted) {
              return $converted;
          }
          return $args[0];
      }
  }
  28. namespace IndexPHP\OpenCrawler {
  29. require_once __DIR__ . '/vendor/autoload.php';
  30. use esperecyan\url\URL;
  31. use RobotsTxtParser;
  32. use esperecyan\url\lib\HostProcessing;
  33. use esperecyan\url\lib\Infrastructure;
  /**
   * Exception type used by the crawler's own classes, so that their failures
   * can be told apart from exceptions raised by PHP or by libraries.
   */
  class Exception extends \Exception
  {
  }
  /**
   * Checks a path against robots.txt rules on behalf of a list of user agents.
   */
  class RobotsTxt
  {
      /** @var array User-agent names to try, in the order given to the constructor. */
      private $userAgents = [];

      /**
       * @param mixed ...$userAgents User-agent names to evaluate rules for.
       */
      public function __construct(...$userAgents)
      {
          $this->userAgents = $userAgents;
      }

      /**
       * Tells whether at least one configured user agent may fetch $path.
       *
       * @param mixed $rules robots.txt content handed to RobotsTxtParser.
       * @param mixed $path  Path to check.
       * @return bool true as soon as one user agent is allowed; false when none
       *              is (including when no user agent was configured).
       */
      public function test($rules, $path)
      {
          $parser = new RobotsTxtParser($rules);

          foreach ($this->userAgents as $agent) {
              $parser->setUserAgent($agent);
              if (!$parser->isAllowed($path)) {
                  continue;
              }
              return true;
          }

          return false;
      }
  }
  /**
   * Performs one blocking HTTP(S) request at a time through cURL.
   *
   * All requests made through one instance share cookies via a cURL share
   * handle. The response body is spooled to a temporary file and then passed
   * to the caller's callback together with the reported Content-Type.
   */
  class CurlSimple
  {
      /** @var string Value sent in the User-Agent request header. */
      public $userAgent = 'Mozilla/5.0 (compatible; OpenCrawler/2016.12.01 +https://notabug.org/index-php/OpenCrawler) like Gecko';

      /** Share handle used to share cookie data between requests. */
      private $shared;

      /** @var int Download size (bytes) above which a transfer is aborted. */
      private $max;

      /**
       * @param int $max Download size in bytes above which a transfer is aborted.
       */
      public function __construct($max = 102400)
      {
          $this->shared = curl_share_init();
          curl_share_setopt($this->shared, CURLSHOPT_SHARE, CURL_LOCK_DATA_COOKIE);
          $this->max = $max;
      }

      /**
       * Fetches $location and passes the result to $callback.
       *
       * @param string      $location URL to fetch (HTTP and HTTPS only).
       * @param string|null $referrer Referer header value; omitted when falsy.
       * @param callable    $callback Invoked as $callback($data, $type) once the
       *                              transfer has ended, successfully or not.
       *                              $data is whatever was downloaded, $type is
       *                              curl_getinfo()'s CURLINFO_CONTENT_TYPE value.
       * @throws Exception When the temporary file for the body cannot be created.
       */
      public function performRequest($location, $referrer, callable $callback)
      {
          // Create the spool file first so that a failure cannot leak a cURL handle.
          $fp = tmpfile();
          if (!is_resource($fp)) {
              throw new Exception('Unable to create a temporary file for the response body');
          }

          $handle = curl_init();
          try {
              curl_setopt($handle, CURLOPT_SHARE, $this->shared);
              curl_setopt($handle, CURLOPT_USERAGENT, (string) $this->userAgent);
              curl_setopt($handle, CURLOPT_COOKIEFILE, '');
              curl_setopt($handle, CURLOPT_AUTOREFERER, true);
              curl_setopt($handle, CURLOPT_FOLLOWLOCATION, true);
              curl_setopt($handle, CURLOPT_CONNECTTIMEOUT, 10);
              curl_setopt($handle, CURLOPT_MAXREDIRS, 5);
              curl_setopt($handle, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
              curl_setopt($handle, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
              curl_setopt($handle, CURLOPT_TIMEOUT, 20);
              curl_setopt($handle, CURLOPT_URL, $location);
              if ($referrer) {
                  curl_setopt($handle, CURLOPT_REFERER, $referrer);
              }
              curl_setopt($handle, CURLOPT_BUFFERSIZE, 1024);

              // The size limit has to be a *progress* callback: a non-zero return
              // aborts the transfer. It used to be registered as the write
              // callback, which CURLOPT_FILE below replaced, so it never ran.
              curl_setopt($handle, CURLOPT_NOPROGRESS, false);
              curl_setopt(
                  $handle,
                  CURLOPT_PROGRESSFUNCTION,
                  function ($ch, $expected, $downloaded) {
                      return ($downloaded > $this->max) ? 1 : 0;
                  }
              );

              curl_setopt($handle, CURLOPT_FILE, $fp);

              // The return value is deliberately ignored: an aborted or failed
              // transfer still reports whatever part of the body arrived.
              curl_exec($handle);

              $type = curl_getinfo($handle, CURLINFO_CONTENT_TYPE);
              rewind($fp);
              $data = stream_get_contents($fp);
          } finally {
              fclose($fp);
              curl_close($handle);
          }

          $callback($data, $type);
      }
  }
  /**
   * Queues several HTTP(S) requests and runs them concurrently via curl_multi.
   *
   * Requests are registered with addRequest() and performed by execute().
   * Each response body is spooled to a temporary file; when a transfer ends,
   * its callback receives the body and the reported Content-Type.
   */
  class CurlFactory
  {
      /** @var string Value sent in the User-Agent request header. */
      public $userAgent = 'Mozilla/5.0 (compatible; OpenCrawler/2016.12.01 +https://notabug.org/index-php/OpenCrawler) like Gecko';

      /** Share handle used to share cookie data between requests. */
      private $shared;

      /** @var int Download size (bytes) above which a transfer is aborted. */
      private $max;

      /** @var array Queued cURL handles, keyed by handle id. */
      private $handles = [];

      /** @var callable[] Completion callbacks, keyed by handle id. */
      private $callbacks = [];

      /** @var resource[] Temporary body files, keyed by handle id. */
      private $files = [];

      /**
       * @param int $max Download size in bytes above which a transfer is aborted.
       */
      public function __construct($max = 102400)
      {
          $this->shared = curl_share_init();
          curl_share_setopt($this->shared, CURLSHOPT_SHARE, CURL_LOCK_DATA_COOKIE);
          $this->max = $max;
      }

      /**
       * Queues a request; nothing is sent until execute() is called.
       *
       * @param string      $location URL to fetch (HTTP and HTTPS only).
       * @param string|null $referrer Referer header value; omitted when falsy.
       * @param callable    $callback Invoked as $callback($data, $type) when the
       *                              transfer ends.
       * @throws Exception When the temporary file for the body cannot be created.
       */
      public function addRequest($location, $referrer, callable $callback)
      {
          // Create the spool file first so that a failure leaves no half-registered request.
          $fp = tmpfile();
          if (!is_resource($fp)) {
              throw new Exception('Unable to create a temporary file for the response body');
          }

          $handle = curl_init();
          curl_setopt($handle, CURLOPT_SHARE, $this->shared);
          curl_setopt($handle, CURLOPT_USERAGENT, (string) $this->userAgent);
          curl_setopt($handle, CURLOPT_COOKIEFILE, '');
          curl_setopt($handle, CURLOPT_AUTOREFERER, true);
          curl_setopt($handle, CURLOPT_FOLLOWLOCATION, true);
          curl_setopt($handle, CURLOPT_CONNECTTIMEOUT, 10);
          curl_setopt($handle, CURLOPT_MAXREDIRS, 5);
          curl_setopt($handle, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
          curl_setopt($handle, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
          curl_setopt($handle, CURLOPT_TIMEOUT, 20);
          curl_setopt($handle, CURLOPT_URL, $location);
          if ($referrer) {
              curl_setopt($handle, CURLOPT_REFERER, $referrer);
          }
          curl_setopt($handle, CURLOPT_BUFFERSIZE, 1024);

          // The size limit has to be a *progress* callback: a non-zero return
          // aborts the transfer. It used to be registered as the write callback,
          // which CURLOPT_FILE below replaced, so it never ran.
          curl_setopt($handle, CURLOPT_NOPROGRESS, false);
          curl_setopt(
              $handle,
              CURLOPT_PROGRESSFUNCTION,
              function ($ch, $expected, $downloaded) {
                  return ($downloaded > $this->max) ? 1 : 0;
              }
          );

          curl_setopt($handle, CURLOPT_FILE, $fp);

          $id = self::handleId($handle);
          $this->callbacks[$id] = $callback;
          $this->files[$id] = $fp;
          $this->handles[$id] = $handle;
      }

      /**
       * Runs every queued request to completion, then empties the queue.
       *
       * Callbacks fire as individual transfers end. Requests added while
       * execute() is running are not performed and are discarded with the rest
       * of the queue.
       */
      public function execute()
      {
          if (!$this->handles) {
              return;
          }

          $multi = curl_multi_init();
          foreach ($this->handles as $handle) {
              curl_multi_add_handle($multi, $handle);
          }

          try {
              do {
                  do {
                      $mrc = curl_multi_exec($multi, $active);
                  } while ($mrc === CURLM_CALL_MULTI_PERFORM);

                  $this->dispatchCompleted($multi);

                  $running = $active && $mrc === CURLM_OK;
                  if ($running && curl_multi_select($multi) === -1) {
                      // select() can report -1 without having waited at all.
                      // Pause briefly and drive the transfers again anyway,
                      // otherwise the loop would never make progress.
                      usleep(1000);
                  }
              } while ($running);

              $this->dispatchCompleted($multi);
          } finally {
              foreach ($this->handles as $id => $handle) {
                  curl_multi_remove_handle($multi, $handle);
                  if (isset($this->files[$id]) && is_resource($this->files[$id])) {
                      fclose($this->files[$id]);
                  }
              }
              $this->handles = [];
              $this->callbacks = [];
              $this->files = [];
              curl_multi_close($multi);
          }
      }

      /**
       * Reads every pending completion message and invokes the matching callback.
       *
       * @param mixed $multi The curl_multi handle being driven by execute().
       */
      private function dispatchCompleted($multi)
      {
          // Read until the queue is empty; stopping while one message is still
          // reported as remaining would leave that transfer unhandled.
          while (false !== ($info = curl_multi_info_read($multi))) {
              if ($info['msg'] !== CURLMSG_DONE) {
                  continue;
              }

              $id = self::handleId($info['handle']);
              if (!isset($this->callbacks[$id], $this->files[$id])) {
                  continue;
              }

              $callback = $this->callbacks[$id];
              $fp = $this->files[$id];
              // Forget the callback before calling it so it can never run twice.
              unset($this->callbacks[$id]);

              // cURL left the file position at the end of what it wrote.
              rewind($fp);
              $callback(
                  stream_get_contents($fp),
                  curl_getinfo($info['handle'], CURLINFO_CONTENT_TYPE)
              );
          }
      }

      /**
       * Returns an array key that identifies a cURL handle.
       *
       * Handles are resources on older PHP versions and objects on newer ones;
       * an (int) cast only works for the former.
       *
       * @param mixed $handle cURL easy handle.
       * @return int
       */
      private static function handleId($handle)
      {
          return is_object($handle) ? spl_object_id($handle) : (int) $handle;
      }
  }
  /**
   * Crawls a single page: consults the site's robots.txt, downloads the page
   * and stores every http(s) link found in its <a href> elements.
   *
   * All of the work happens in the constructor.
   */
  class Crawler
  {
      /** @var Database Store that receives the discovered links. */
      private $db;

      /** @var RobotsTxt robots.txt evaluator for the crawler's user agents. */
      private $robots;

      /**
       * @param Database    $db       Store that receives the discovered links.
       * @param string      $location URL of the page to crawl.
       * @param string|null $referrer URL the page was discovered on; also used
       *                              as the base for resolving $location.
       */
      public function __construct(Database $db, $location, $referrer)
      {
          $this->db = $db;
          $this->robots = new RobotsTxt('OpenCrawler', 'Googlebot');

          $robotsUrl = new URL($location, $referrer);
          $path = $robotsUrl->pathname;
          $robotsUrl->pathname = '/robots.txt';
          // robots.txt lives at a fixed location; do not carry the page's
          // query string or fragment over to that request.
          $robotsUrl->search = '';
          $robotsUrl->hash = '';

          $curl = new CurlSimple;
          $curl->performRequest(
              (string) $robotsUrl,
              $referrer,
              function ($rules, $type) use ($curl, $location, $path) {
                  // Only a text/plain answer is treated as a rule set; anything
                  // else (an HTML error page, for instance) does not block the crawl.
                  if ('text/plain' === self::mediaType($type)
                      && !$this->robots->test($rules, $path)) {
                      return;
                  }

                  $curl->performRequest(
                      $location,
                      null,
                      function ($html, $type) use ($location) {
                          $this->collectLinks($html, $type, $location);
                      }
                  );
              }
          );
      }

      /**
       * Parses an HTML response and stores the http(s) targets of its links.
       *
       * @param mixed  $html     Response body.
       * @param mixed  $type     Content-Type reported for the response.
       * @param string $location URL of the page; base for relative links and
       *                         referrer recorded for every stored link.
       */
      private function collectLinks($html, $type, $location)
      {
          if ('text/html' !== self::mediaType($type)) {
              return;
          }
          // An empty body has no links, and DOMDocument::loadHTML() rejects it.
          if (!is_string($html) || '' === $html) {
              return;
          }

          $doc = new \DOMDocument;
          // Real-world markup is rarely valid: collect parser complaints
          // internally instead of emitting them as warnings.
          $previous = libxml_use_internal_errors(true);
          $doc->loadHTML($html);
          libxml_clear_errors();
          libxml_use_internal_errors($previous);

          foreach ($doc->getElementsByTagName('a') as $link) {
              if (!$link->hasAttribute('href')) {
                  continue;
              }
              try {
                  $url = new URL($link->getAttribute('href'), $location);
                  if ($url->protocol !== 'https:' && $url->protocol !== 'http:') {
                      continue;
                  }
                  $url->hash = '';
                  $this->db->add($location, (string) $url);
              } catch (\Exception $e) {
                  // Best effort: a link that cannot be parsed or stored is skipped.
              }
          }
      }

      /**
       * Extracts the lower-cased media type from a Content-Type value.
       *
       * @param mixed $type Content-Type value; may be null or false when unknown.
       * @return string For example "text/html"; empty when $type is empty.
       */
      private static function mediaType($type)
      {
          return strtolower(trim(explode(';', (string) $type)[0]));
      }
  }
  /**
   * SQLite-backed store of discovered URIs.
   *
   * A transaction is opened when the object is created and committed when it
   * is destroyed, so all changes made through one instance are written together.
   */
  class Database
  {
      /** @var \PDO Connection to the SQLite database. */
      private $pdo;

      /**
       * Opens (or creates) the database and makes sure the `uri` table exists.
       *
       * @param string $path Path of the SQLite database file.
       * @throws \PDOException When the database cannot be opened or prepared.
       */
      public function __construct($path)
      {
          $this->pdo = new \PDO("sqlite:$path");
          // Report failures as exceptions on every PHP version; in silent mode a
          // failed prepare()/query() returns false and the next call is fatal.
          $this->pdo->setAttribute(\PDO::ATTR_ERRMODE, \PDO::ERRMODE_EXCEPTION);
          $this->pdo->beginTransaction();
          $this->pdo->exec(<<<'SQL'
CREATE TABLE IF NOT EXISTS `uri` (
hash BLOB PRIMARY KEY,
uri BLOB,
referrer BLOB,
origin BLOB,
count INT DEFAULT 0
);
SQL
          );
      }

      /**
       * Picks the next URI to visit and increments its visit counter.
       *
       * One row per origin is considered; the one with the lowest count (ties
       * broken by hash) is returned.
       *
       * @return array|null Row with `uri`, `referrer` and `count` (the value
       *                    before the increment), or null when the table is empty.
       */
      public function get()
      {
          $query = $this->pdo->query('SELECT uri, referrer, count FROM `uri` GROUP BY origin ORDER BY count ASC, hash ASC LIMIT 1;');
          $row = $query->fetch();
          $query->closeCursor();
          if (!$row) {
              return null;
          }

          $s = $this->pdo->prepare('UPDATE `uri` SET count = ? WHERE uri = ?;');
          $s->execute([$row['count'] + 1, $row['uri']]);
          return $row;
      }

      /**
       * Yields up to 500 rows for inspection, in the order get() would pick them.
       *
       * @return \Generator Yields [count, hex-encoded hash, uri, referrer].
       */
      public function dumpInfo()
      {
          foreach ($this->pdo->query('SELECT hash, uri, referrer, count FROM `uri` GROUP BY origin ORDER BY count ASC, hash ASC LIMIT 500;')
              as $row) {
              yield [$row['count'], bin2hex($row['hash']), $row['uri'], $row['referrer']];
          }
      }

      /**
       * Stores URIs that are not known yet; already known ones are left untouched.
       *
       * @param string|null $referrer URI the new entries were discovered on.
       * @param mixed       ...$uris  URIs to store (strings or objects convertible
       *                              to string).
       * @throws \Exception Whatever the URL parser throws for a URI it cannot parse.
       */
      public function add($referrer, ...$uris)
      {
          $s = $this->pdo->prepare('INSERT OR IGNORE INTO `uri` (hash, uri, referrer, origin) VALUES (?, ?, ?, ?);');
          foreach ($uris as $uri) {
              // Convert once so that the hash, the stored value and the parsed
              // URL are all derived from the very same string.
              $uri = (string) $uri;
              $origin = (new URL($uri))->origin;
              $s->execute([hash('sha256', $uri, true), $uri, $referrer, $origin]);
          }
      }

      /**
       * Commits the transaction opened by the constructor, if it is still open.
       */
      public function __destruct()
      {
          if ($this->pdo->inTransaction()) {
              $this->pdo->commit();
          }
      }
  }
  // ---------------------------------------------------------------------------
  // Request handling:
  //   ?add=<url>  seeds the database with a URL, then continues as usual
  //   ?phpinfo    prints phpinfo()
  //   ?dumpinfo   prints a table of stored URIs
  //   otherwise   picks a stored URI, crawls it and redirects the visitor to it
  // ---------------------------------------------------------------------------
  $db = new Database(__DIR__ . '/db.sqlite');

  if (isset($_GET['add']) && is_string($_GET['add'])) {
      // Visitor-supplied input: store it only when it parses as an http(s) URL,
      // in normalised form and without fragment - the same rules the crawler
      // applies to links it discovers.
      try {
          $seed = new URL($_GET['add']);
          if ($seed->protocol === 'https:' || $seed->protocol === 'http:') {
              $seed->hash = '';
              $db->add('https://www.google.com/', (string) $seed);
          }
      } catch (\Exception $e) {
          // Not a usable URL: ignore it and carry on with the normal request.
      }
  }

  if (isset($_GET['phpinfo'])) {
      // NOTE(review): this exposes the server configuration to every visitor;
      // consider removing it or restricting access.
      phpinfo();
      exit;
  }

  if (isset($_GET['dumpinfo'])) {
      echo '<!doctype html><meta charset="utf-8"/><title>Data</title>';
      echo '<style>table {
width: 100% !important;
max-width: 100%;
border-spacing: 0;
border-collapse: collapse;
table-layout: fixed;
}
td {
font-family: monospace;
white-space: pre;
overflow: hidden;
border: solid 1px #ccc;
padding: 0.2em 0.5em;
box-sizing: border-box
}
tr > :nth-of-type(1){
width: 5% !important;
}
tr > :nth-of-type(2){
width: 10% !important;
}</style>';
      echo '<table><thead><tr><th>Count</th><th>Hash</th><th>URI</th><th>Referrer</th></tr></thead><tbody>';
      foreach ($db->dumpInfo() as $info) {
          echo '<tr>';
          foreach ($info as $x) {
              // ENT_SUBSTITUTE keeps values with invalid UTF-8 visible instead
              // of turning the whole cell into an empty string.
              echo '<td>', htmlspecialchars((string) $x, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 | ENT_DISALLOWED, 'UTF-8'), '</td>';
          }
          echo '</tr>';
      }
      echo '</tbody></table>';
      exit;
  }

  // Pick a record that does not point at itself. The number of attempts is
  // bounded so that a database holding only self-referencing rows cannot keep
  // the request spinning forever.
  $rec = null;
  for ($attempt = 0; $attempt < 32; $attempt++) {
      $candidate = $db->get();
      if (!$candidate) {
          break;
      }
      if ($candidate['uri'] !== $candidate['referrer']) {
          $rec = $candidate;
          break;
      }
  }

  if (!$rec) {
      header('HTTP/1.1 404 Not Found');
      echo 'Not found';
      exit;
  }

  try {
      $c = new Crawler($db, $rec['uri'], $rec['referrer']);
  } catch (\Exception $e) {
      header('HTTP/1.1 503 Service Unavailable');
      header('Content-Type: text/plain; charset=UTF-8');
      header('X-Content-Type-Options: nosniff');
      // NOTE(review): this prints the full exception, stack trace included, to
      // the visitor; consider logging it and showing a generic message instead.
      echo $e, PHP_EOL;
      exit;
  }

  header('HTTP/1.1 303 See Other');
  header("Location: {$rec['uri']}");
  // The value is placed inside double-quoted attributes below, so quotes must
  // be escaped as well (ENT_HTML5 | ENT_DISALLOWED alone leaves them untouched).
  $escaped = htmlspecialchars($rec['uri'], ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 | ENT_DISALLOWED, 'UTF-8');
  380. ?>
  381. <!doctype html>
  382. <meta charset='utf-8'/>
  383. <meta http-equiv="refresh" content="5; url=<?= $escaped ?>"/>
  384. <title>Redirecting...</title>
  385. Redirecting to <a href="<?= $escaped ?>"><?= $escaped ?></a>
  386. <?php
  387. }