crawl.php 3.4 KB

<?php
include("config.php");
function crawl_page($url, $depth, $filename)
{
    static $seen = array();
    if (isset($seen[$url]) || $depth === 0) {
        return 3;
    }
    $seen[$url] = true;
    // Fetch the page source (SSL certificate checks disabled) and feed it
    // through DOMDocument::loadHTML().
    $source = file_get_contents($url, false, stream_context_create(array('ssl' => array('verify_peer' => false, 'verify_peer_name' => false))));
    // If the page could not be fetched, skip it.
    if ($source === FALSE) {
        return 1;
    }
    $dom = new DOMDocument('1.0');
    @$dom->loadHTML($source);
    $crawlcount = 0;
    $anchors = $dom->getElementsByTagName('a');
    foreach ($anchors as $element) {
        $href = $element->getAttribute('href');
        if (strpos($href, 'http') !== 0) {
            $path = '/' . ltrim($href, '/');
            $parts = parse_url($url);
            $href = $parts['scheme'] . '://';
            if (isset($parts['user']) && isset($parts['pass'])) {
                $href .= $parts['user'] . ':' . $parts['pass'] . '@';
            }
            $href .= $parts['host'];
            if (isset($parts['port'])) {
                $href .= ':' . $parts['port'];
            }
            if (isset($parts['path'])) {
                $href .= dirname($parts['path']) . $path;
            } else {
                $href .= $path;
            }
        }
        $crawlcount++;
        if ($crawlcount > $GLOBALS['maxpagecrawl']) { break; }
        crawl_page($href, $depth - 1, $filename);
    }
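
    // Pull the description and keywords from the page's <meta> tags, falling
    // back to placeholders when they are missing; pages without a usable
    // <title> are skipped.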
    $metas = $dom->getElementsByTagName('meta');
    for ($ii = 0; $ii < $metas->length; $ii++) {
        $meta = $metas->item($ii);
        if ($meta->getAttribute('name') == 'description') {
            $description = $meta->getAttribute('content');
        }
        if ($meta->getAttribute('name') == 'keywords') {
            $keywords = $meta->getAttribute('content');
        }
    }
    if (!isset($description)) { $description = "No description..."; }
    if (!isset($keywords)) { $keywords = "No keywords..."; }
    $title = $dom->getElementsByTagName('title');
    if ($title->length) {
        $title = $title->item(0)->nodeValue;
        if (trim($title) == "" || trim($description) == "") {
            return 2;
        }
    } else {
        return 2;
    }
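
    // Only record the page if its URL is not already present in the output
    // file; otherwise report the duplicate and stop.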
    $existing = is_file($filename) ? file_get_contents($filename) : '';
    if (strpos($existing, "URL: $url") === false) {
        echo "URL: " . $url . "<br />\n"
            . "Title: " . $title . "<br />\n"
            . "Description: " . $description . "<br />\n"
            . "Keywords: " . $keywords . "<br /><br />\n";
        file_put_contents($filename, "URL: " . $url . "\n"
            . "Title: " . $title . "\n"
            . "Description: " . $description . "\n"
            . "Keywords: " . $keywords . "\n\n", FILE_APPEND);
    } else {
        echo "$url exists in " . $filename . PHP_EOL;
        return 3;
    }
}
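
// Command-line entry point: the first argument is the URL to crawl and the
// optional second argument overrides the default output file ($database).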
if (isset($argc)) {
    if ($argc === 2) {
        $urlline = $argv[1];
        echo "One argument" . PHP_EOL;
        if (filter_var(trim($urlline), FILTER_VALIDATE_URL) !== FALSE) {
            echo "Crawling $urlline" . PHP_EOL;
            crawl_page(trim($urlline), $crawl_depth, $GLOBALS['database']);
        }
    }
    if ($argc === 3) {
        $urlline = $argv[1];
        $fileout = $argv[2];
        echo "Two arguments" . PHP_EOL;
        if (filter_var(trim($urlline), FILTER_VALIDATE_URL) !== FALSE) {
            echo "Crawling $urlline and saving to $fileout" . PHP_EOL;
            crawl_page(trim($urlline), $crawl_depth, $fileout);
        }
    }
}
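
// Example usage (assuming config.php defines $crawl_depth, $maxpagecrawl and
// $database; "results.txt" is just an illustrative output file name):
//   php crawl.php https://example.com
//   php crawl.php https://example.com results.txt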