scrape.php

#!/usr/bin/php
<?php

require 'vendor/autoload.php';

use Goutte\Client;
use Symfony\Component\DomCrawler\Crawler;
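
/*
 * Scrape the GNU social issue trackers listed in SOURCES into flat files:
 * fetched pages are cached under ./cache/<type>/<section>/<n>, each parsed
 * issue is written to ./issues/<host>/<n> with a small front matter header,
 * and every commenter gets an empty marker file under ./user/.
 * Only the 'notabug' source type is handled so far; the others are
 * placeholders (the phabricator case is stubbed out below).
 */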
const SOURCES = [
    'https://notabug.org/diogo/gnu-social' => 'notabug',
    // 'https://bugz.foocorp.net/project/view/3/' => 'phabricator',
    'https://bugz.foocorp.net/maniphest/query/clvSMla6p5o3' => 'phabricator',
    'https://git.gnu.io/gnu/gnu-social' => 'gitlab',
    'https://github.com/chimo?tab=repositories&q=gs' => 'github-repo_list'
];
function getCachedCrawler(Client $client, string $url, string $type, string $section, int $num): ?Crawler {
    $file = "./cache/{$type}/{$section}/{$num}";
    if (file_exists($file)) {
        // Cached file: parse the saved HTML without hitting the network.
        echo "Retrieving cached URI {$url}\n";
        return new Crawler(file_get_contents($file), $url);
    }
    // HTTP request
    echo "Requesting {$url}\n";
    $content = @file_get_contents($url);
    if ($content === false) {
        return null;
    }
    $dir = dirname($file);
    if (!file_exists($dir)) {
        mkdir($dir, 0777, true); // recursive
    }
    file_put_contents($file, $content);
    // Fetch through the client so getUri() reflects any redirect.
    return $client->request('GET', $url);
}
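
// Illustrative call against the notabug tracker (URL shape assumed from
// SOURCES above, not verified here):
//   $crawler = getCachedCrawler($client, 'https://notabug.org/diogo/gnu-social/issues/1',
//                               'notabug', 'issues', 1);
// A null return means the page could not be fetched; the paging loop in
// getAllIssues() below treats that as "past the last issue" and stops.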
function getAllIssues(Client $client, array $sources): array {
    $pages = [];
    $users = [];
    $issues = [];
    foreach ($sources as $source => $type) {
        switch ($type) {
            case 'notabug':
                // foreach (['issues', 'pulls'] as $section) {
                $section = 'issues';
                $host = parse_url($source, PHP_URL_HOST);
                for ($i = 1; ; ++$i) {
                    if ($i == 24) {
                        // For some reason 24 always returns 500
                        continue;
                    }
                    $url = "{$source}/{$section}/{$i}";
                    $path = "./issues/{$host}/{$i}";
                    if (file_exists($path)) {
                        echo "Skipping {$url}\n";
                        continue;
                    }
                    $crawler = getCachedCrawler($client, $url, $type, $section, $i);
                    if (is_null($crawler)) {
                        // First unfetchable page: assume we walked past the last issue.
                        break;
                    }
                    $pages[$url] = "{$type}-{$section}";
                    if ($crawler->getUri() !== $url) {
                        // Redirect from issue to pull, ignore
                        continue;
                    }
                    $title = $crawler->filter('#issue-title')->text();
                    $status = strtolower($crawler->filter('div.label')->text());
                    $time = $crawler->filter('span.time-since')->attr('title');
                    $author = $host . $crawler->filter('span.time-desc > a')->attr('href');
                    $comments = [];
                    $crawl_comments = $crawler->filter('ui.comments div.comment');
                    $crawl_comments->each(function (Crawler $comment) use ($host, &$users, &$comments) {
                        $header = $comment->filter('div.header > span');
                        $user = $host . $header->first()->attr('href');
                        $users[] = $user;
                        $time = $header->filter('a:last-child > span')->attr('title');
                        $content = $comment->filter('div.raw-content')->text();
                        $comments[] = ['user' => $user, 'time' => $time, 'content' => $content];
                    });
                    $issues[] = [
                        'title' => $title,
                        'author' => $author,
                        'number' => "{$host}/{$i}",
                        'time' => $time,
                        'status' => $status,
                        'comments' => $comments
                    ];
                }
                break;
            // case 'phabricator':
            //     $crawler = getCachedCrawler($client, $source, $type, 'issues');
            //     $pages[$source] = "{$type}-issues";
            //     break;
            default:
                break;
        }
    }
    // Dump all users to the `user` folder
    foreach ($users as $user) {
        $path = './user/' . $user;
        if (!file_exists($path)) {
            $dir = dirname($path);
            if (!file_exists($dir)) {
                mkdir($dir, 0777, true); // recursive
            }
            touch($path);
        }
    }
    return $issues;
}
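
// Entry point: scrape every source, then dump each issue to
// ./issues/<host>/<n> in the front matter format built below.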
$client = new Client();
$issues = getAllIssues($client, SOURCES);
foreach ($issues as $issue) {
    $path = './issues/' . $issue['number'];
    if (!file_exists(dirname($path))) {
        mkdir(dirname($path), 0777, true); // recursive
    }
    $content =
"---
title: {$issue['title']}
author: {$issue['author']}
time: {$issue['time']}
status: {$issue['status']}
---
";
    foreach ($issue['comments'] as $comment) {
        $content .= "
author: {$comment['user']}
time: {$comment['time']}
content: -----
{$comment['content']}
-----
";
    }
    file_put_contents($path, $content);
}
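
// A generated issue file ends up looking roughly like this (values are
// illustrative, not real scraped data):
//
//   ---
//   title: Example issue title
//   author: notabug.org/example-user
//   time: 2017-01-01 00:00:00 UTC
//   status: open
//   ---
//
//   author: notabug.org/example-user
//   time: 2017-01-02 00:00:00 UTC
//   content: -----
//   Example comment body.
//   -----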