age-vs-popularity 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  1. #! /usr/bin/perl
  2. # Copyright (C) 2006 Alex Schroeder <alex@emacswiki.org>
  3. #
  4. # This program is free software; you can redistribute it and/or modify
  5. # it under the terms of the GNU General Public License as published by
  6. # the Free Software Foundation; either version 2 of the License, or
  7. # (at your option) any later version.
  8. #
  9. # This program is distributed in the hope that it will be useful,
  10. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. # GNU General Public License for more details.
  13. #
  14. # You should have received a copy of the GNU General Public License
  15. # along with this program; if not, write to the
  16. # Free Software Foundation, Inc.
  17. # 59 Temple Place, Suite 330
  18. # Boston, MA 02111-1307 USA
  19. use Time::ParseDate;
  20. use Term::ProgressBar;
  21. use Encode;
  22. use Unicode::Normalize;
  # Input and output file names; relative names, so they are resolved
  # against the directory the script is started from.
  my ($PageDir, $LogFile, $ReportFile)
      = ('page', 'access.log', 'age-vs-popularity.csv');
  # Reference point in time against which page ages are measured.
  my $Now = time();
  my $Verbose = 1;
  # Characters accepted in a page name; the group is what ends up in $1.
  my $FreeLinkPattern = "([-,.()' _0-9A-Za-z\x80-\xff]+)";
  # $UrlFilter must match the requested URL, and $1 must be the pagename
  my $UrlFilter = "^/(?:cw|en|de|fr)[/?]$FreeLinkPattern\$";
  warn "URL filter: $UrlFilter\n";
  # namespaces
  # my $InterSitePattern = '[A-Z\x80-\xff]+[A-Za-z\x80-\xff]+';
  # Decode the %XX escapes of a URL into the raw bytes they stand for.
  # Takes the encoded string and returns a decoded copy. Anything that is
  # not a percent sign followed by two hex digits is left untouched.
  sub UrlDecode {
    my $str = shift;
    # Hex digits are accepted in either case: %C3%A4 and %c3%a4 are the same
    # escape and must decode to the same bytes.
    $str =~ s/%([0-9a-fA-F]{2})/chr(hex($1))/ge;
    return $str;
  }
  # Extract the requested page name from one access log line.
  # Returns the page name (the $1 capture of $UrlFilter) when the line is a
  # GET request answered with status 200 whose decoded URL matches
  # $UrlFilter; returns nothing otherwise. Dies if the line does not contain
  # a quoted request followed by a status code.
  sub ParseLogLine {
    my $line = shift;
    # Request method, URL and status code: "GET /en/Foo HTTP/1.1" 200
    $line =~ m/"(\S+)\s+(\S+)\s+HTTP\/[10.]+"\s+(\d+)/
      or die "Cannot parse:\n$line";
    # Copy the captures before calling anything else that runs a regex.
    my ($type, $raw_url, $code) = ($1, $2, $3);
    my $url = UrlDecode($raw_url);
    return unless $type eq 'GET';
    return unless $code == 200; # Forget 304 Not Modified
    return $1 if $url =~ m!$UrlFilter!;
    # namespaces
    # return $url if $url =~ m!^/odd/$InterSitePattern/$FreeLinkPattern$!;
    return;
  }
  # Turn the raw text of a page file into its fields.
  # The text is a sequence of "key: value" records; a value keeps going
  # across line breaks as long as the following line starts with a space or
  # a tab. Returns the fields as a flat key/value list; a key that occurs
  # more than once keeps its last value.
  sub ParseData {
    my ($text) = @_;
    # In list context the /g match hands back all keys and values in order.
    my @fields = $text =~ /(\S+?): (.*?)(?=\n[^ \t]|\Z)/sg;
    my %parsed;
    while (my ($name, $content) = splice(@fields, 0, 2)) {
      # Continuation lines carry a leading tab; drop it from the value.
      $content =~ s/\n\t/\n/g;
      $parsed{$name} = $content;
    }
    return %parsed;
  }
  # Results shared by the subs below, both keyed by page name:
  # %Age holds the age in days (set in ParsePages), %Hits the request
  # count (incremented in ParseLog).
  my (%Age, %Hits);
  # Count successful page requests in $LogFile, accumulating them in %Hits
  # under the page name returned by ParseLogLine.
  # A progress bar tracks how many bytes of the log have been read.
  # Takes no arguments; dies if the log file cannot be opened.
  sub ParseLog {
    open(my $fh, '<', $LogFile) or die "Cannot read $LogFile: $!";
    # Only the size (stat field 7) is needed, as the progress bar's target.
    my $size = (stat($fh))[7];
    my $progress = Term::ProgressBar->new({name  => 'Log',
                                           count => $size,
                                           ETA   => 'linear', });
    $progress->max_update_rate(1);
    my $next_update = 0;
    my $bytes_read = 0;
    # The line is kept in $_ so that code called from here that looks at $_
    # still sees the current line.
    while (<$fh>) {
      $bytes_read += length;
      # Advance the bar for every line read, not only for lines that count
      # as a hit; update() returns the position of the next due redraw.
      $next_update = $progress->update($bytes_read)
        if $bytes_read >= $next_update;
      my $page = ParseLogLine($_);
      next unless $page;
      $Hits{$page}++;
    }
    close($fh);
    $progress->update($size) if $size >= $next_update;
  }
  # Read every page file below $PageDir and store the page's age in days
  # (time elapsed between its "ts" field and $Now) in %Age, keyed by the
  # NFC-normalized page name taken from the file name.
  # Takes no arguments; dies if a page file cannot be opened.
  sub ParsePages {
    # include dotfiles!
    my @files = glob("$PageDir/*/*.pg $PageDir/*/.*.pg");
    # The bar counts files, so its target is the number of files and not
    # the index of the last one.
    my $total = scalar @files;
    my $progress = Term::ProgressBar->new({name  => 'Pages',
                                           count => $total,
                                           ETA   => 'linear', });
    $progress->max_update_rate(1);
    my $next_update = 0;
    my $count = 0;
    foreach my $file (@files) {
      next unless $file =~ m|/.*/(.+)\.pg$|;
      my $page = encode_utf8(NFC(decode_utf8($1))); # normalize on HFS+ filesystems
      open(my $fh, '<', $file) or die "Cannot read $page file: $!";
      # Read complete files; $/ is only undefined for this one read.
      my $data = do { local $/; <$fh> };
      close($fh);
      my %result = ParseData($data);
      my $days = ($Now - $result{ts}) / (24 * 60 * 60);
      $Age{$page} = $days;
      $next_update = $progress->update($count) if $count++ >= $next_update;
    }
    $progress->update($total) if $total >= $next_update;
  }
  # Write the report to $ReportFile as CSV: a header line, then one row per
  # page in %Age with its age in days, its hit count and its name.
  # Rows are sorted by page name so repeated runs give comparable files.
  # Takes no arguments; dies if the file cannot be opened or written.
  sub WriteReport {
    open(my $out, '>', $ReportFile) or die "Cannot write $ReportFile: $!";
    print $out "Days,Hits,Name\n";
    for my $page (sort keys %Age) {
      # A page that never shows up in the log has no %Hits entry; report 0
      # rather than an empty column.
      my $hits = $Hits{$page} || 0;
      # The name goes last because it is the only column that may itself
      # contain commas.
      print $out "$Age{$page},$hits,$page\n";
    }
    # Buffered write errors only surface when the handle is closed.
    close($out) or die "Cannot write $ReportFile: $!";
  }
  # Run the three stages in order: count hits from the log, compute page
  # ages from the page files, then write the report.
  foreach my $stage (\&ParseLog, \&ParsePages, \&WriteReport) {
    $stage->();
  }