- #!/usr/bin/php
- <?php
- require 'vendor/autoload.php';
- use \Goutte\Client;
- use Symfony\Component\DomCrawler\Crawler;
// Map of upstream tracker base URL => scraper type used to parse it.
// Only 'notabug' is implemented below; other entries are recorded but
// fall through to the empty default case. Commented-out entries are
// sources that were tried and disabled.
const sources = [
    'https://notabug.org/diogo/gnu-social' => 'notabug',
    // 'https://bugz.foocorp.net/project/view/3/' => 'phabricator',
    'https://bugz.foocorp.net/maniphest/query/clvSMla6p5o3' => 'phabricator',
    'https://git.gnu.io/gnu/gnu-social' => 'gitlab',
    'https://github.com/chimo?tab=repositories&q=gs' => 'github-repo_list'
];

// Collected issue records; reassigned from getAllIssues() below.
$issues = [];
/**
 * Fetch a URL through $client, caching the raw response body on disk.
 *
 * Cache layout: ./cache/{type}/{section}/{num}. On a miss the page is
 * downloaded with file_get_contents() and stored; on a hit the stored body
 * is reused. Either way the body is replayed through the BrowserKit client
 * so the returned Crawler is built for $url.
 *
 * @param Client $client  browser client used to build the Crawler
 * @param string $url     remote page to (re)crawl
 * @param string $type    source type, first cache path component
 * @param string $section cache sub-folder (e.g. "issues")
 * @param int    $num     issue/page number, used as the cache file name
 * @return Crawler|null   null when the page (or cache file) could not be read
 */
function getCachedCrawler(Client &$client, string $url, string $type, string $section, int $num): ?Crawler {
    $file = "./cache/{$type}/{$section}/{$num}";
    if (!file_exists($file)) {
        // Cache miss: fetch over HTTP. Warnings are deliberately suppressed;
        // a failed fetch is reported to the caller as null.
        echo "Requesting {$url}\n";
        $content = @file_get_contents($url);
        if ($content === false) {
            return null;
        }
        $dir = dirname($file);
        if (!file_exists($dir)) {
            mkdir($dir, 0777, true); // recursive
        }
        file_put_contents($file, $content);
    } else {
        // Cache hit: reuse the stored body instead of re-fetching.
        echo "Retrieving cached URI {$url}\n";
        $content = file_get_contents($file);
        if ($content === false) {
            // Unreadable cache file: treat it like a failed fetch rather
            // than handing `false` to the client as a response body.
            return null;
        }
    }
    // Replay the body through BrowserKit so we get a Crawler for $url.
    return $client->request('GET', $url, [], [], [], $content);
}
/**
 * Crawl every configured source and collect its issues.
 *
 * For each 'notabug' source, issue pages are fetched sequentially starting
 * at 1 until the first unfetchable number, each parsed into a record of
 * title/author/time/status plus its comments. Every comment author seen is
 * dumped as an empty marker file under ./user/. Other source types are
 * currently not implemented (empty default case).
 *
 * @param Client $client  shared browser client
 * @param array  $sources map of base URL => source type
 * @param string $type    unused; the per-source type from $sources is used
 *                        instead (kept for backward compatibility)
 * @return array list of issue records
 */
function getAllIssues(Client &$client, array $sources, string $type): array {
    $pages = [];
    $users = [];
    $issues = [];
    foreach ($sources as $source => $sourceType) {
        switch ($sourceType) {
            case 'notabug':
                // Only the "issues" section is crawled for now; "pulls" was
                // planned (see commented foreach) but never enabled.
                $section = 'issues';
                // foreach (['issues', 'pulls'] as $section) {
                for ($i = 1; ; ++$i) {
                    if ($i == 24) {
                        // For some reason 24 always returns 500
                        continue;
                    }
                    $host = parse_url($source, PHP_URL_HOST);
                    $url = "{$source}/{$section}/{$i}";
                    $path = "./issues/{$host}/{$i}";
                    if (file_exists($path)) {
                        // Already exported by a previous run of the script.
                        echo "Skipping {$url}\n";
                        continue;
                    }
                    $crawler = getCachedCrawler($client, $url, $sourceType, $section, $i);
                    if (!is_null($crawler)) {
                        $pages[$url] = "{$sourceType}-{$section}";
                    } else {
                        // First unfetchable number: assume we ran past the
                        // last issue and stop this source.
                        break;
                    }
                    if ($crawler->getUri() !== $url) {
                        // Redirect from issue to pull, ignore
                        continue;
                    }
                    $title = $crawler->filter('#issue-title')->text();
                    $status = strtolower($crawler->filter('div.label')->text());
                    $time = $crawler->filter('span.time-since')->attr('title');
                    $author = $host . $crawler->filter('span.time-desc > a')->attr('href');
                    $comments = [];
                    // NOTE(review): 'ui.comments' matches a literal <ui> tag;
                    // if the page uses <div class="ui comments"> this should
                    // be 'div.ui.comments' — confirm against the live markup.
                    $crawl_comments = $crawler->filter('ui.comments div.comment');
                    $crawl_comments->each(function ($comment, $i) use ($source, $host, $url, &$users, &$comments) {
                        $header = $comment->filter('div.header > span');
                        $user = $host . $header->first()->attr('href');
                        $users[] = $user;
                        $time = $header->filter('a:last-child > span')->attr('title');
                        $content = $comment->filter('div.raw-content')->text();
                        $comments[] = ['user' => $user, 'time' => $time, 'content' => $content];
                    });
                    $issues[] = [
                        'title' => $title,
                        'author' => $author,
                        'number' => "{$host}/{$i}",
                        'time' => $time,
                        'status' => $status,
                        'comments' => $comments
                    ];
                }
                break;
            // case 'phabricator':
            //     $crawler = getCachedCrawler($client, $source, $type, 'issues');
            //     $pages[$source] = "{$type}-issues";
            //     break;
            default:
        }
    }
    // Dump all users to the `user` folder
    foreach ($users as $user) {
        $path = './user/' . $user;
        if (!file_exists($path)) {
            $dir = dirname($path);
            if (!is_dir($dir)) {
                mkdir($dir, 0777, true); // recursive; skip when it already exists
            }
            touch($path);
        }
    }
    return $issues;
}
// Entry point: crawl everything, then export each issue as a
// front-matter-style text file under ./issues/{host}/{number}.
$client = new Client();
$issues = getAllIssues($client, sources, '');
foreach ($issues as $issue) {
    $path = './issues/' . $issue['number'];
    $dir = dirname($path);
    if (!file_exists($dir)) {
        mkdir($dir, 0777, true); // recursive
    }
    // Issue header, YAML-front-matter style.
    $content =
"---
title: {$issue['title']}
author: {$issue['author']}
time: {$issue['time']}
status: {$issue['status']}
---
";
    // Append each comment, its body fenced between ----- markers.
    foreach ($issue['comments'] as $comment) {
        $content .= "
author: {$comment['user']}
time: {$comment['time']}
content: -----
{$comment['content']}
-----
";
    }
    file_put_contents($path, $content);
}