123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132 |
- #! /usr/bin/perl
- # Copyright (C) 2006 Alex Schroeder <alex@emacswiki.org>
- #
- # This program is free software; you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or
- # (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program; if not, write to the
- # Free Software Foundation, Inc.
- # 59 Temple Place, Suite 330
- # Boston, MA 02111-1307 USA
- use Time::ParseDate;
- use Term::ProgressBar;
- use Encode;
- use Unicode::Normalize;
- my $PageDir = 'page';
- my $LogFile = 'access.log';
- my $ReportFile = 'age-vs-popularity.csv';
- my $Now = time;
- my $Verbose = 1;
- # $UrlFilter must match the requested URL, and $1 must be the pagename
- my $FreeLinkPattern = "([-,.()' _0-9A-Za-z\x80-\xff]+)";
- my $UrlFilter = "^/(?:cw|en|de|fr)[/?]$FreeLinkPattern\$";
- warn "URL filter: $UrlFilter\n";
- # namespaces
- # my $InterSitePattern = '[A-Z\x80-\xff]+[A-Za-z\x80-\xff]+';
- sub UrlDecode {
- my $str = shift;
- $str =~ s/%([0-9a-f][0-9a-f])/chr(hex($1))/ge;
- return $str;
- }
- sub ParseLogLine {
- my $line = shift;
- my %result;
- $line =~ m/"(\S+)\s+(\S+)\s+HTTP\/[10.]+"\s+(\d+)/ or die "Cannot parse:\n$_";
- my $type = $1;
- my $url = UrlDecode($2);
- my $code = $3;
- return unless $type eq 'GET';
- return unless $code == 200; # Forget 304 Not Modified
- return $1 if $url =~ m!$UrlFilter!;
- # namespaces
- # return $url if $url =~ m!^/odd/$InterSitePattern/$FreeLinkPattern$!;
- return;
- }
- sub ParseData {
- my $data = shift;
- my %result;
- while ($data =~ /(\S+?): (.*?)(?=\n[^ \t]|\Z)/sg) {
- my ($key, $value) = ($1, $2);
- $value =~ s/\n\t/\n/g;
- $result{$key} = $value;
- }
- return %result;
- }
- my %Age = ();
- my %Hits = ();
- sub ParseLog {
- my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
- $atime,$mtime,$ctime,$blksize,$blocks)
- = stat($LogFile);
- my $progress = Term::ProgressBar->new({name => 'Log',
- count => $size,
- ETA => linear, });
- $progress->max_update_rate(1);
- my $next_update = 0;
- my $count = 0;
- open(F, $LogFile) or die "Cannot read $Logfile: $!";
- while ($_ = <F>) {
- $count += length;
- my $page = ParseLogLine($_);
- next unless $page;
- $Hits{$page}++;
- $next_update = $progress->update($count) if $count++ >= $next_update;
- }
- close(F);
- $progress->update($size) if $size >= $next_update;
- }
- sub ParsePages {
- # include dotfiles!
- my @files = glob("$PageDir/*/*.pg $PageDir/*/.*.pg");
- my $progress = Term::ProgressBar->new({name => 'Pages',
- count => $#files,
- ETA => linear, });
- $progress->max_update_rate(1);
- my $next_update = 0;
- my $count = 0;
- foreach my $file (@files) {
- next unless $file =~ m|/.*/(.+)\.pg$|;
- my $page = encode_utf8(NFC(decode_utf8($1))); # normalize on HFS+ filesystems
- local $/ = undef; # Read complete files
- open(F, $file) or die "Cannot read $page file: $!";
- my $data = <F>;
- close(F);
- my %result = ParseData($data);
- my $days = ($Now - $result{ts}) / (24 * 60 * 60);
- $Age{$page} = $days;
- $next_update = $progress->update($count) if $count++ >= $next_update;
- }
- $progress->update($#files) if $#files >= $next_update;
- }
- sub WriteReport {
- open(F, "> $ReportFile") or die "Cannot write $ReportFile: $!";
- print F "Days,Hits,Name\n";
- for my $page (keys %Age) {
- print F "$Age{$page},$Hits{$page},$page\n";
- }
- close(F);
- }
- ParseLog();
- ParsePages();
- WriteReport();
|