123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451 |
- <?php /* -*- mode: PHP; coding: utf-8; indent-tabs-mode: t; tab-width: 4 -*-
- vim: ts=4 noet ai */
- /**
- This file is part of OpenCrawler.
- OpenCrawler: Allows visitors to wander the Web
- Copyright © 2016 OpenCrawler Developers
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Affero General Public License for more details.
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see <https://www.gnu.org/licenses/>.
- @license AGPL-3+
- @file
- */
- namespace esperecyan\url\lib {
/**
 * Namespace-local replacement for the global idn_to_ascii().
 *
 * Every argument is forwarded untouched to the global function. When that
 * call produces a falsy result (for example false), the first argument is
 * returned as-is instead of the falsy value.
 *
 * @param mixed ...$args Arguments forwarded verbatim; $args[0] is the fall-back value.
 * @return mixed Result of the global call, or $args[0] when that result is falsy.
 */
function idn_to_ascii (... $args) {
	$converted = \idn_to_ascii (... $args);
	if ($converted) {
		return $converted;
	}
	return $args[0];
}
/**
 * Namespace-local replacement for the global idn_to_utf8().
 *
 * Every argument is forwarded untouched to the global function. When that
 * call produces a falsy result (for example false), the first argument is
 * returned as-is instead of the falsy value.
 *
 * @param mixed ...$args Arguments forwarded verbatim; $args[0] is the fall-back value.
 * @return mixed Result of the global call, or $args[0] when that result is falsy.
 */
function idn_to_utf8 (... $args) {
	$converted = \idn_to_utf8 (... $args);
	if ($converted) {
		return $converted;
	}
	return $args[0];
}
- }
- namespace IndexPHP\OpenCrawler {
- require_once __DIR__ . '/vendor/autoload.php';
- use esperecyan\url\URL;
- use RobotsTxtParser;
- use esperecyan\url\lib\HostProcessing;
- use esperecyan\url\lib\Infrastructure;
/**
 * Exception type thrown by the classes of this namespace.
 *
 * Adds nothing to the built-in \Exception; it only provides a distinct type.
 */
class Exception extends \Exception {}
/**
 * Evaluates robots.txt rules for a fixed list of user-agent names.
 */
class RobotsTxt
{
	/** @var array User-agent names, in the order given to the constructor. */
	private $userAgents = [];

	/**
	 * @param string ...$userAgents User-agent names to test rules against.
	 */
	public function __construct (... $userAgents)
	{
		$this->userAgents = $userAgents;
	}

	/**
	 * Tells whether at least one configured user agent may fetch $path.
	 *
	 * @param string $rules Raw robots.txt content handed to RobotsTxtParser.
	 * @param string $path  Path to check.
	 * @return bool true as soon as one user agent is allowed; false when none
	 *              is (including when no user agent was configured).
	 */
	public function test ($rules, $path)
	{
		$parser = new RobotsTxtParser ($rules);
		$allowed = false;
		foreach ($this->userAgents as $agent) {
			$parser->setUserAgent ($agent);
			$allowed = (bool) $parser->isAllowed ($path);
			if ($allowed) {
				break;
			}
		}
		return $allowed;
	}
}
/**
 * Blocking HTTP(S) fetcher: one cURL easy handle per request.
 *
 * All requests made through one instance share cookies via a cURL share
 * handle. Response bodies are spooled to a temporary file and handed to a
 * callback together with the Content-Type reported by cURL.
 */
class CurlSimple
{
	/** @var string Value sent as the User-Agent request header. */
	public $userAgent = 'Mozilla/5.0 (compatible; OpenCrawler/2016.12.01 +https://notabug.org/index-php/OpenCrawler) like Gecko';

	/** @var mixed cURL share handle carrying the cookie store. */
	private $shared;
	/** @var int Download size (bytes) above which a transfer is aborted. */
	private $max;

	/**
	 * @param int $max Abort a transfer once more than this many bytes were downloaded.
	 */
	public function __construct ($max = 102400)
	{
		$this->shared = curl_share_init ();
		curl_share_setopt ($this->shared, CURLSHOPT_SHARE, CURL_LOCK_DATA_COOKIE);
		$this->max = $max;
	}

	/**
	 * Fetches $location synchronously and passes the result to $callback.
	 *
	 * The callback is always invoked once, even when the transfer failed or
	 * was aborted; it then receives whatever was written so far (possibly '').
	 *
	 * @param string      $location URL to fetch (HTTP and HTTPS only).
	 * @param string|null $referrer Referer header value; skipped when falsy.
	 * @param callable    $callback Invoked as $callback ($body, $contentType).
	 * @throws Exception When no temporary file could be created.
	 */
	public function performRequest ($location, $referrer, callable $callback)
	{
		// Create the spool file first so a failure cannot leak a cURL handle.
		$fp = tmpfile ();
		if (!is_resource ($fp)) {
			throw new Exception ('Unable to create a temporary file');
		}

		$handle = curl_init ();
		curl_setopt ($handle, CURLOPT_SHARE, $this->shared);
		curl_setopt ($handle, CURLOPT_USERAGENT, (string) $this->userAgent);
		curl_setopt ($handle, CURLOPT_COOKIEFILE, '');
		curl_setopt ($handle, CURLOPT_AUTOREFERER, true);
		curl_setopt ($handle, CURLOPT_FOLLOWLOCATION, true);
		curl_setopt ($handle, CURLOPT_CONNECTTIMEOUT, 10);
		curl_setopt ($handle, CURLOPT_MAXREDIRS, 5);
		curl_setopt ($handle, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
		curl_setopt ($handle, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
		curl_setopt ($handle, CURLOPT_TIMEOUT, 20);

		curl_setopt ($handle, CURLOPT_URL, $location);
		$referrer && curl_setopt ($handle, CURLOPT_REFERER, $referrer);

		curl_setopt ($handle, CURLOPT_BUFFERSIZE, 1024);
		// The size cap must be a *progress* callback: a write callback has a
		// different signature and is replaced anyway once CURLOPT_FILE is set.
		// A non-zero return value makes cURL abort the transfer.
		$limit = $this->max;
		curl_setopt ($handle, CURLOPT_NOPROGRESS, false);
		curl_setopt ($handle, CURLOPT_PROGRESSFUNCTION
			, function ($resource, $expected = 0, $downloaded = 0, $uploadExpected = 0, $uploaded = 0) use ($limit)
			{
				return ($downloaded > $limit) ? 1 : 0;
			});

		curl_setopt ($handle, CURLOPT_FILE, $fp);
		curl_exec ($handle);
		$type = curl_getinfo ($handle, CURLINFO_CONTENT_TYPE);

		// cURL left the file position at the end of the body.
		rewind ($fp);
		$data = stream_get_contents ($fp);
		fclose ($fp);
		curl_close ($handle);

		$callback ($data, $type);
	}
}
/**
 * Batches HTTP(S) requests and runs them concurrently through cURL multi.
 *
 * Requests are queued with addRequest() and performed by execute(). All
 * requests of one instance share cookies via a cURL share handle. Each
 * response body is spooled to a temporary file and handed to the request's
 * callback together with the Content-Type reported by cURL.
 */
class CurlFactory
{
	/** @var string Value sent as the User-Agent request header. */
	public $userAgent = 'Mozilla/5.0 (compatible; OpenCrawler/2016.12.01 +https://notabug.org/index-php/OpenCrawler) like Gecko';

	/** @var mixed cURL share handle carrying the cookie store. */
	private $shared;
	/** @var int Download size (bytes) above which a transfer is aborted. */
	private $max;
	/** @var array Queued easy handles, in insertion order. */
	private $handles = [];
	/** @var array Pending callbacks keyed by handle id. */
	private $callbacks = [];
	/** @var array Spool files keyed by handle id. */
	private $files = [];

	/**
	 * @param int $max Abort a transfer once more than this many bytes were downloaded.
	 */
	public function __construct ($max = 102400)
	{
		$this->shared = curl_share_init ();
		curl_share_setopt ($this->shared, CURLSHOPT_SHARE, CURL_LOCK_DATA_COOKIE);
		$this->max = $max;
	}

	/**
	 * Queues a request; nothing is sent until execute() is called.
	 *
	 * @param string      $location URL to fetch (HTTP and HTTPS only).
	 * @param string|null $referrer Referer header value; skipped when falsy.
	 * @param callable    $callback Invoked as $callback ($body, $contentType)
	 *                              once the transfer is done.
	 * @throws Exception When no temporary file could be created.
	 */
	public function addRequest ($location, $referrer, callable $callback)
	{
		// Create the spool file first so a failure leaves no half-registered request.
		$fp = tmpfile ();
		if (!is_resource ($fp)) {
			throw new Exception ('Unable to create a temporary file');
		}

		$handle = curl_init ();
		curl_setopt ($handle, CURLOPT_SHARE, $this->shared);
		curl_setopt ($handle, CURLOPT_USERAGENT, (string) $this->userAgent);
		curl_setopt ($handle, CURLOPT_COOKIEFILE, '');
		curl_setopt ($handle, CURLOPT_AUTOREFERER, true);
		curl_setopt ($handle, CURLOPT_FOLLOWLOCATION, true);
		curl_setopt ($handle, CURLOPT_CONNECTTIMEOUT, 10);
		curl_setopt ($handle, CURLOPT_MAXREDIRS, 5);
		curl_setopt ($handle, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
		curl_setopt ($handle, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
		curl_setopt ($handle, CURLOPT_TIMEOUT, 20);

		curl_setopt ($handle, CURLOPT_URL, $location);
		$referrer && curl_setopt ($handle, CURLOPT_REFERER, $referrer);

		curl_setopt ($handle, CURLOPT_BUFFERSIZE, 1024);
		// The size cap must be a *progress* callback: a write callback has a
		// different signature and is replaced anyway once CURLOPT_FILE is set.
		// A non-zero return value makes cURL abort the transfer.
		$limit = $this->max;
		curl_setopt ($handle, CURLOPT_NOPROGRESS, false);
		curl_setopt ($handle, CURLOPT_PROGRESSFUNCTION
			, function ($resource, $expected = 0, $downloaded = 0, $uploadExpected = 0, $uploaded = 0) use ($limit)
			{
				return ($downloaded > $limit) ? 1 : 0;
			});

		curl_setopt ($handle, CURLOPT_FILE, $fp);

		$id = self::handleId ($handle);
		$this->callbacks[$id] = $callback;
		$this->files[$id] = $fp;
		$this->handles[] = $handle;
	}

	/**
	 * Runs every queued request to completion and fires the callbacks.
	 *
	 * Callbacks run as soon as their transfer finishes. Afterwards the queue
	 * is emptied and every handle and spool file is released. Produces no
	 * output of its own.
	 */
	public function execute ()
	{
		$multi = curl_multi_init ();
		foreach ($this->handles as $handle) {
			curl_multi_add_handle ($multi, $handle);
		}

		do {
			do {
				$mrc = curl_multi_exec ($multi, $active);
			} while ($mrc === CURLM_CALL_MULTI_PERFORM);

			$this->dispatchFinished ($multi);

			// Wait for socket activity. When select() reports failure (-1),
			// sleep briefly and still call curl_multi_exec() again so the
			// loop keeps making progress instead of spinning.
			if ($active && $mrc === CURLM_OK
				&& curl_multi_select ($multi, 1.0) === -1) {
				usleep (1000);
			}
		} while ($active && $mrc === CURLM_OK);

		$this->dispatchFinished ($multi);

		foreach ($this->handles as $handle) {
			curl_multi_remove_handle ($multi, $handle);
			$id = self::handleId ($handle);
			// Requests that never completed still own an open spool file.
			if (isset ($this->files[$id]) && is_resource ($this->files[$id])) {
				fclose ($this->files[$id]);
			}
			unset ($this->callbacks[$id]);
			unset ($this->files[$id]);
			curl_close ($handle);
		}
		$this->handles = [];

		curl_multi_close ($multi);
	}

	/**
	 * Reads every pending completion message and invokes its callback once.
	 *
	 * @param mixed $multi The cURL multi handle being driven by execute().
	 */
	private function dispatchFinished ($multi)
	{
		// curl_multi_info_read() returns false once the queue is empty.
		while ($info = curl_multi_info_read ($multi)) {
			if ($info['msg'] !== CURLMSG_DONE) {
				continue;
			}
			$id = self::handleId ($info['handle']);
			if (!isset ($this->callbacks[$id], $this->files[$id])) {
				continue;
			}
			$callback = $this->callbacks[$id];
			$fp = $this->files[$id];
			unset ($this->callbacks[$id]);
			unset ($this->files[$id]);

			// cURL left the file position at the end of the body.
			rewind ($fp);
			$data = stream_get_contents ($fp);
			fclose ($fp);

			$callback ($data, curl_getinfo ($info['handle'], CURLINFO_CONTENT_TYPE));
		}
	}

	/**
	 * Stable array key for a cURL handle.
	 *
	 * Handles are resources on older PHP versions and objects on PHP 8.
	 *
	 * @param mixed $handle cURL easy handle.
	 * @return int
	 */
	private static function handleId ($handle)
	{
		return is_object ($handle) ? spl_object_id ($handle) : (int) $handle;
	}
}
/**
 * Crawls one page: checks robots.txt, fetches the page, stores its links.
 *
 * All work happens synchronously inside the constructor.
 */
class Crawler
{
	/** @var Database Destination for discovered links. */
	private $db;
	/** @var RobotsTxt Rule checker for the user agents this crawler honours. */
	private $robots;

	/**
	 * Fetches /robots.txt of the target host, then (unless disallowed) the
	 * page itself, and records every http(s) link found in it.
	 *
	 * robots.txt is only consulted when it is served as text/plain; any
	 * other content type is treated as "no rules".
	 *
	 * @param Database    $db       Link store.
	 * @param string      $location URL of the page to crawl.
	 * @param string|null $referrer Base for resolving $location; also sent as
	 *                              Referer of the robots.txt request.
	 */
	public function __construct (Database $db, $location, $referrer)
	{
		$this->db = $db;
		$this->robots = new RobotsTxt ('OpenCrawler', 'Googlebot');

		$url = new URL ($location, $referrer);
		$path = $url->pathname;
		$url->pathname = '/robots.txt';
		// robots.txt lives at a fixed URL: do not carry over the page's
		// query string or fragment.
		$url->search = '';
		$url->hash = '';

		$curl = new CurlSimple;
		$curl->performRequest ((string) $url, $referrer
			, function ($data, $type) use ($location, $path, $curl)
			{
				if ('text/plain' === self::mediaType ($type)
					&& !$this->robots->test ($data, $path)) {
					return;
				}

				$curl->performRequest ($location, null
					, function ($data, $type) use ($location)
					{
						if ('text/html' === self::mediaType ($type)) {
							$this->storeLinks ($data, $location);
						}
					});
			});
	}

	/**
	 * Extracts the bare media type from a Content-Type value.
	 *
	 * @param mixed $type Content-Type as reported by cURL (may be false/null).
	 * @return string Lower-cased media type without parameters; '' when absent.
	 */
	private static function mediaType ($type)
	{
		// Media types are case-insensitive, so normalise before comparing.
		return strtolower (trim (explode (';', (string) $type)[0]));
	}

	/**
	 * Records every http(s) link of an HTML document in the database.
	 *
	 * @param string $html     Document source.
	 * @param string $location URL of the document; base for relative links
	 *                         and referrer of the stored records.
	 */
	private function storeLinks ($html, $location)
	{
		// DOMDocument::loadHTML() rejects an empty source (ValueError on PHP 8).
		if ('' === (string) $html) {
			return;
		}

		$doc = new \DOMDocument;
		// Real-world markup triggers parser warnings; they are not actionable here.
		@$doc->loadHTML ($html);

		foreach ($doc->getElementsByTagName ('a') as $link) {
			if (!$link->hasAttribute ('href')) {
				continue;
			}
			try {
				$url = new URL ($link->getAttribute ('href'), $location);
				if ($url->protocol !== 'https:' && $url->protocol !== 'http:') {
					continue;
				}
				$url->hash = '';
				$this->db->add ($location, (string) $url);
			} catch (\Exception $e) {
				// Best effort: one unusable link must not stop the rest.
			}
		}
	}
}
/**
 * SQLite-backed store of discovered URIs.
 *
 * The constructor opens a transaction that stays open for the lifetime of
 * the object; the destructor commits it.
 */
class Database
{
	/** @var \PDO Connection to the SQLite file. */
	private $pdo;

	/**
	 * Opens the database, begins a transaction and creates the `uri` table
	 * when it does not exist yet.
	 *
	 * @param string $path Path appended to the "sqlite:" DSN prefix.
	 */
	public function __construct ($path)
	{
		$this->pdo = new \PDO ("sqlite:$path");
		$this->pdo->beginTransaction ();
		$schema = <<<'SQL'
CREATE TABLE IF NOT EXISTS `uri` (
hash BLOB PRIMARY KEY,
uri BLOB,
referrer BLOB,
origin BLOB,
count INT DEFAULT 0
);
SQL;
		$this->pdo->exec ($schema);
	}

	/**
	 * Picks the next record (lowest count, then lowest hash, one candidate
	 * per origin) and stores count + 1 for every row having that uri.
	 *
	 * @return array|null The selected row (uri, referrer, count as read
	 *                    before the update), or null when there is none.
	 */
	public function get ()
	{
		$candidates = $this->pdo->query ('SELECT uri, referrer, count FROM `uri` GROUP BY origin ORDER BY count ASC, hash ASC LIMIT 1;');
		foreach ($candidates as $record) {
			$bump = $this->pdo->prepare ('UPDATE `uri` SET count = ? WHERE uri = ?;');
			$bump->execute ([$record['count'] + 1, $record['uri']]);
			return $record;
		}
		return null;
	}

	/**
	 * Yields up to 500 records in the same order get() uses.
	 *
	 * @return \Generator Each item is [count, hex-encoded hash, uri, referrer].
	 */
	public function dumpInfo ()
	{
		$rows = $this->pdo->query ('SELECT hash, uri, referrer, count FROM `uri` GROUP BY origin ORDER BY count ASC, hash ASC LIMIT 500;');
		foreach ($rows as $record) {
			$hexHash = bin2hex ($record['hash']);
			yield [$record['count'], $hexHash, $record['uri'], $record['referrer']];
		}
	}

	/**
	 * Inserts URIs that are not stored yet; already known hashes are ignored.
	 *
	 * @param mixed $referrer Stored in the referrer column of every new row.
	 * @param mixed ...$uris  URIs to insert; each is keyed by the raw SHA-256
	 *                        digest of its string form.
	 */
	public function add ($referrer, ... $uris)
	{
		$insert = $this->pdo->prepare ('INSERT OR IGNORE INTO `uri` (hash, uri, referrer, origin) VALUES (?, ?, ?, ?);');
		foreach ($uris as $uri) {
			$parsed = new URL ($uri);
			$origin = $parsed->origin;
			$digest = hash ('sha256', $uri, true);
			$insert->execute ([$digest, $uri, $referrer, $origin]);
		}
	}

	/**
	 * Commits the transaction opened by the constructor.
	 */
	public function __destruct ()
	{
		$this->pdo->commit ();
	}
}
// ---------------------------------------------------------------------------
// Request handling: optional seeding (?add=), diagnostics (?phpinfo,
// ?dumpinfo), otherwise crawl the next stored URI and redirect the visitor.
// ---------------------------------------------------------------------------
$db = new Database (__DIR__ . '/db.sqlite');

// Escapes a value for HTML text *and* quoted attribute values. ENT_QUOTES is
// mandatory: without a quote flag htmlspecialchars() leaves " and ' alone,
// which would allow breaking out of the href/content attributes below.
$escape = function ($value) {
	return htmlspecialchars ((string) $value, ENT_QUOTES | ENT_HTML5 | ENT_DISALLOWED, 'UTF-8');
};

if (isset ($_GET['add'])) {
	// Untrusted input: accept only a single, parsable http(s) URL and store
	// its normalised form (fragment removed, like links found by the crawler).
	$seed = null;
	if (is_string ($_GET['add'])) {
		try {
			$candidate = new URL ($_GET['add']);
			if ($candidate->protocol === 'https:' || $candidate->protocol === 'http:') {
				$candidate->hash = '';
				$seed = (string) $candidate;
			}
		} catch (\Exception $e) {
			// Unparsable URL: rejected below.
		}
	}
	if ($seed === null) {
		header ('HTTP/1.1 400 Bad Request');
		header ('Content-Type: text/plain; charset=UTF-8');
		header ('X-Content-Type-Options: nosniff');
		echo 'Invalid URL';
		exit;
	}
	$db->add ('https://www.google.com/', $seed);
}

if (isset ($_GET['phpinfo'])) {
	// NOTE(review): this exposes the full server configuration to any
	// visitor; restrict or remove it before deploying publicly.
	phpinfo ();
	exit;
}

if (isset ($_GET['dumpinfo'])) {
	echo '<!doctype html><meta charset="utf-8"/><title>Data</title>';
	echo '<style>table {
width: 100% !important;
max-width: 100%;
border-spacing: 0;
border-collapse: collapse;
table-layout: fixed;
}
td {
font-family: monospace;
white-space: pre;
overflow: hidden;
border: solid 1px #ccc;
padding: 0.2em 0.5em;
box-sizing: border-box
}
tr > :nth-of-type(1){
width: 5% !important;
}
tr > :nth-of-type(2){
width: 10% !important;
}</style>';
	echo '<table><thead><tr><th>Count</th><th>Hash</th><th>URI</th><th>Referrer</th></tr></thead><tbody>';
	foreach ($db->dumpInfo () as $info) {
		echo '<tr>';
		foreach ($info as $x) {
			echo '<td>', $escape ($x), '</td>';
		}
		echo '</tr>';
	}
	echo '</tbody></table>';
	exit;
}

// Skip records that refer to themselves. The number of attempts is bounded:
// get() always returns a row while the table is non-empty, so a table holding
// only self-referencing rows would otherwise loop forever.
$attempts = 0;
do {
	$rec = $db->get ();
	$selfReferencing = $rec && $rec['uri'] === $rec['referrer'];
} while ($selfReferencing && ++$attempts < 100);

if (!$rec || $selfReferencing) {
	header ('HTTP/1.1 404 Not Found');
	echo 'Not found';
	exit;
}

try {
	$c = new Crawler ($db, $rec['uri'], $rec['referrer']);
} catch (\Exception $e) {
	header ('HTTP/1.1 503 Service Unavailable');
	header ('Content-Type: text/plain; charset=UTF-8');
	header ('X-Content-Type-Options: nosniff');
	// Keep the details (message, file paths, stack trace) in the server log
	// instead of sending them to the visitor.
	error_log ((string) $e);
	echo 'Service temporarily unavailable', PHP_EOL;

	exit;
}

header ('HTTP/1.1 303 See Other');
header ("Location: {$rec['uri']}");
$escaped = $escape ($rec['uri']);
?>
<!doctype html>
<meta charset='utf-8'/>
<meta http-equiv="refresh" content="5; url=<?= $escaped ?>"/>
<title>Redirecting...</title>
Redirecting to <a href="<?= $escaped ?>"><?= $escaped ?></a>
<?php
- }
|