arxiv_pdf_renamer.pl 2.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192
  1. #!/usr/bin/perl
  2. # Author: Trizen
  3. # Date: 13 February 2024
  4. # https://github.com/trizen
  5. # Rename PDFs downloaded from arxiv.org to the titles of their papers.
  6. # usage: perl script.pl [PDF files]
  7. use 5.036;
  8. use WWW::Mechanize;
  9. use File::Basename qw(dirname basename);
  10. use File::Spec::Functions qw(catfile);
  # Shared HTTP client used to download arXiv pages.
  my $mech = WWW::Mechanize->new(

      # WWW::Mechanize enables autocheck by default, which makes get() die on
      # any failed request.  Turn it off so that one unreachable page does not
      # abort the whole batch and the caller can inspect $resp->is_success.
      autocheck     => 0,
      show_progress => 0,
      stack_depth   => 10,
      agent         => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
  );
  # Rename each PDF given on the command line to the title of its arXiv page.
  #
  # For every file:
  #   1. find the arXiv URL, either embedded in the PDF as a URI(...) entry or
  #      derived from a file name such as "2311.07048.pdf" / "2311.07048v1.pdf";
  #   2. fetch that page and take its title;
  #   3. turn the title into a safe file name and rename the PDF in place
  #      (same directory), never overwriting an existing file.
  #
  # Problems with a single file are reported and the loop moves on to the next.
  PDF_FILE:
  foreach my $pdf_file (@ARGV) {

      my $fh;
      if (!open($fh, '<:raw', $pdf_file)) {
          warn "Can't open file <<$pdf_file>>: $!\n";
          next PDF_FILE;
      }

      # Slurp the raw bytes.  A failed read (e.g. the path is a directory)
      # yields undef, so fall back to the empty string to keep the pattern
      # match below free of "uninitialized value" warnings.
      my $pdf_content = do { local $/; <$fh> } // '';
      close($fh);

      my $url;
      if ($pdf_content =~ m{\bURI\s*\((https?://arxiv\.org/.*?)\)}) {
          $url = $1;
          $url =~ s{^http://}{https://};
      }
      elsif (basename($pdf_file) =~ /^([0-9]+\.[0-9]+(?:v[0-9]+)?)\.pdf\z/i) {

          # The file name itself looks like an arXiv identifier, optionally
          # followed by a version suffix ("v1").
          $url = 'https://arxiv.org/abs/' . lc($1);
      }

      my $title;
      if (defined($url)) {

          # get() dies on HTTP errors when autocheck is enabled (the
          # WWW::Mechanize default); trap that so one bad download does not
          # abort the remaining files.
          my $resp = eval { $mech->get($url) };

          if (!defined($resp)) {
              warn "Failed to fetch <<$url>>: $@";
              next PDF_FILE;
          }

          if (!$resp->is_success) {
              warn "Failed to fetch <<$url>>: " . $resp->status_line . "\n";
              next PDF_FILE;
          }

          $title = $resp->title;
      }

      if (!defined($title)) {
          say "Not an arxiv PDF: $pdf_file";
          next PDF_FILE;
      }

      # Build a file name from the page title.
      $title =~ s{\[.*?\]\s*}{};             # drop the first "[...]" group (presumably the arXiv id tag)
      $title =~ s/: / - /g;
      $title =~ tr{:"*/?\\|}{;'+%!%%};       # replace characters that are unsafe in file names
      $title =~ tr/<>${}//d;
      $title = join(q{ }, split(q{ }, $title));    # trim and collapse whitespace
      $title = substr($title, 0, 250);             # make sure the filename is not too long
      $title =~ s/\s+\z//;                         # truncation may have left a trailing space

      if ($title eq '') {
          warn "Empty title for <<$pdf_file>>... Skipping...\n";
          next PDF_FILE;
      }

      $title .= ".pdf";

      my $basename = basename($pdf_file);
      say "Renaming: $basename -> $title";

      my $dest = catfile(dirname($pdf_file), $title);

      if (-e $dest) {
          warn "File <<$dest>> already exists... Skipping...\n";
          next PDF_FILE;
      }

      rename($pdf_file, $dest) or warn "Failed to rename: $!\n";
  }
  63. __END__
  64. # Example:
  65. $ perl arxiv_pdf_renamer.pl *.pdf
  66. ** GET https://arxiv.org/abs/math/0504119v1 ==> 200 OK (1s)
  67. Renaming: 0504119.pdf -> The Carmichael numbers up to 10^17.pdf
  68. ** GET https://arxiv.org/abs/2311.07048v1 ==> 200 OK
  69. Renaming: 2311.07048.pdf -> Gauss-Euler Primality Test.pdf