123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119 |
- <?php
- include("config.php");
/**
 * Crawl a page, follow its links depth-first, and append the page's metadata
 * (URL, title, description, keywords) to a plain-text record file.
 *
 * Links found on the page are crawled BEFORE the page's own record is written.
 * At most $GLOBALS['maxpagecrawl'] links are followed per page; when that
 * global is not set, no links are followed.
 *
 * Every URL is visited at most once per process (tracked in a static set).
 *
 * @param string      $url      Address of the page to fetch (anything file_get_contents() can open).
 * @param int|string  $depth    Remaining recursion depth; values <= 0 stop the crawl.
 * @param string|null $filename Record file to append to. Required: the null default exists
 *                              only so that an optional parameter no longer precedes a
 *                              required one (deprecated since PHP 8.0).
 *
 * @return int|null 1 when the page could not be fetched;
 *                  2 when the page has no body, no <title>, or a blank title/description;
 *                  3 when the page was skipped (already seen, depth exhausted, or already
 *                    present in the record file);
 *                  null when a new record was written.
 *
 * @throws InvalidArgumentException When $filename is omitted.
 */
function crawl_page($url, $depth = 5, $filename = null)
{
    static $seen = array();

    if ($filename === null) {
        throw new InvalidArgumentException('crawl_page() requires a $filename to write records to.');
    }

    // Cast and use "<=" so that a numeric-string or negative depth still ends the recursion.
    $depth = (int) $depth;
    if (isset($seen[$url]) || $depth <= 0) {
        return 3;
    }
    $seen[$url] = true;

    // NOTE(review): TLS peer verification is switched off here, so HTTPS responses are not
    // authenticated. Kept as-is because the crawler appears to rely on it for hosts with
    // broken certificates - confirm this is intended.
    $context = stream_context_create(array(
        'ssl' => array('verify_peer' => false, 'verify_peer_name' => false),
    ));
    $source = file_get_contents($url, false, $context);

    // Bail out before parsing: on PHP 8 DOMDocument::loadHTML() throws a ValueError
    // when handed an empty source, which a failed fetch (false) is coerced to.
    if ($source === false) {
        return 1;
    }
    if ($source === '') {
        return 2;
    }

    // Real-world HTML is rarely well-formed; collect parser diagnostics internally
    // instead of silencing them with "@", then restore the previous libxml setting.
    $dom = new DOMDocument('1.0');
    $previousSetting = libxml_use_internal_errors(true);
    $dom->loadHTML($source);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    // Follow links. An unset limit behaves like 0, i.e. no links are followed.
    $limit = isset($GLOBALS['maxpagecrawl']) ? (int) $GLOBALS['maxpagecrawl'] : 0;
    $followed = 0;
    foreach ($dom->getElementsByTagName('a') as $anchor) {
        $target = crawl_page_resolve_href($anchor->getAttribute('href'), $url);
        if ($target === null) {
            continue;
        }
        $followed++;
        if ($followed > $limit) {
            break;
        }
        crawl_page($target, $depth - 1, $filename);
    }

    // Collect metadata; when a meta name occurs more than once the last one wins.
    $description = 'No description...';
    $keywords = 'No keywords...';
    foreach ($dom->getElementsByTagName('meta') as $meta) {
        $name = $meta->getAttribute('name');
        if ($name === 'description') {
            $description = $meta->getAttribute('content');
        } elseif ($name === 'keywords') {
            $keywords = $meta->getAttribute('content');
        }
    }

    $titles = $dom->getElementsByTagName('title');
    if ($titles->length === 0) {
        return 2;
    }
    $title = $titles->item(0)->nodeValue;
    if (trim($title) === '' || trim($description) === '') {
        return 2;
    }

    // Look for this exact URL as a complete "URL: ..." line. A plain substring search
    // would also match longer URLs that merely start with $url, and text inside other fields.
    $existing = is_file($filename) ? (string) file_get_contents($filename) : '';
    if (strpos("\n" . $existing, "\nURL: " . $url . "\n") !== false) {
        echo "$url exists in " . $filename . PHP_EOL;
        return 3;
    }

    // The progress output is HTML (<br />), and every value comes from a remote page,
    // so escape it. The record file is plain text and stores the raw values.
    $flags = ENT_QUOTES | ENT_SUBSTITUTE;
    echo "URL: " . htmlspecialchars($url, $flags, 'UTF-8') . "<br />\n"
        . "Title: " . htmlspecialchars($title, $flags, 'UTF-8') . "<br />\n"
        . "Description: " . htmlspecialchars($description, $flags, 'UTF-8') . "<br />\n"
        . "Keywords: " . htmlspecialchars($keywords, $flags, 'UTF-8') . "<br /><br />\n";

    file_put_contents(
        $filename,
        "URL: " . $url . "\n"
        . "Title: " . $title . "\n"
        . "Description: " . $description . "\n"
        . "Keywords: " . $keywords . "\n\n",
        FILE_APPEND
    );

    return null;
}

/**
 * Turn an anchor's href into an absolute http(s) URL, relative to the page it was found on.
 *
 * Dot segments ("./", "../") are not normalised; they are passed through as written.
 *
 * @param string $href Raw value of the anchor's href attribute.
 * @param string $base URL of the page that contains the anchor.
 *
 * @return string|null Absolute URL, or null when the link should not be crawled
 *                     (empty, fragment-only, non-http scheme, or unresolvable base).
 */
function crawl_page_resolve_href($href, $base)
{
    // Fragments never reach the server; dropping them also avoids re-crawling the same page.
    $href = trim($href);
    $hashAt = strpos($href, '#');
    if ($hashAt !== false) {
        $href = substr($href, 0, $hashAt);
    }
    if ($href === '') {
        return null;
    }

    // Anything with an explicit scheme is already absolute; only http(s) is crawlable
    // (this rejects mailto:, javascript:, tel:, ...).
    if (preg_match('#^[a-z][a-z0-9+.-]*:#i', $href)) {
        return preg_match('#^https?://#i', $href) ? $href : null;
    }

    $parts = parse_url($base);
    if ($parts === false || !isset($parts['scheme'], $parts['host'])) {
        return null;
    }

    // Protocol-relative link: inherit only the scheme.
    if (strpos($href, '//') === 0) {
        return $parts['scheme'] . ':' . $href;
    }

    $origin = $parts['scheme'] . '://';
    if (isset($parts['user']) && isset($parts['pass'])) {
        $origin .= $parts['user'] . ':' . $parts['pass'] . '@';
    }
    $origin .= $parts['host'];
    if (isset($parts['port'])) {
        $origin .= ':' . $parts['port'];
    }

    // Root-relative link: replaces the whole path.
    if ($href[0] === '/') {
        return $origin . $href;
    }

    $path = isset($parts['path']) ? $parts['path'] : '/';

    // Query-only link: same document, different query string.
    if ($href[0] === '?') {
        return $origin . $path . $href;
    }

    // Document-relative link: resolve against the directory of the base path.
    $lastSlash = strrpos($path, '/');
    $directory = $lastSlash === false ? '/' : substr($path, 0, $lastSlash + 1);

    return $origin . $directory . $href;
}
// Command-line entry point.
//   php <script> <url>            -> crawl <url>, records go to $GLOBALS['database']
//   php <script> <url> <outfile>  -> crawl <url>, records go to <outfile>
// $crawl_depth is not defined in this file; presumably it comes from config.php - TODO confirm.
// Any other argument count, or a URL that fails validation, results in no crawl.
if (isset($argc) && ($argc == 2 || $argc == 3)) {
    $urlline = $argv[1];

    if ($argc == 3) {
        $fileout = $argv[2];
        echo "Two arguments\r\n";
    } else {
        echo "One argument\r\n";
    }

    // The URL is validated and crawled in trimmed form, but echoed exactly as given.
    if (filter_var(trim($urlline), FILTER_VALIDATE_URL) !== false) {
        if ($argc == 3) {
            echo "Crawling $urlline and saving to $fileout" . PHP_EOL;
            crawl_page(trim($urlline), $crawl_depth, $fileout);
        } else {
            echo "Crawling $urlline" . PHP_EOL;
            crawl_page(trim($urlline), $crawl_depth, $GLOBALS['database']);
        }
    }
}
|