gd_similar_images.pl 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206
  1. #!/usr/bin/perl
  2. # Author: Daniel "Trizen" Șuteu
  3. # License: GPLv3
  4. # Date: 26 August 2015
  5. # Edit: 24 October 2023
  6. # Website: https://github.com/trizen
  7. # Find images that look similar.
  8. # Blog post:
  9. # https://trizenx.blogspot.com/2015/08/finding-similar-images.html
  10. use 5.022;
  11. use strict;
  12. use warnings;
  13. use experimental 'bitwise';
  14. use GD qw();
  15. use List::Util qw(sum);
  16. use File::Find qw(find);
  17. use Getopt::Long qw(GetOptions);
  18. GD::Image->trueColor(1);
  19. my $width = 32;
  20. my $height = 32;
  21. my $percentage = 90;
  22. my $keep_only = undef;
  23. my $img_formats = '';
  24. my $resize_to = $width . 'x' . $height;
  25. my @img_formats = qw(
  26. jpeg
  27. jpg
  28. png
  29. );
  30. sub help {
  31. my ($code) = @_;
  32. local $" = ",";
  33. print <<"EOT";
  34. usage: $0 [options] [dir]
  35. options:
  36. -p --percentage=i : minimum similarity percentage (default: $percentage)
  37. -r --resize-to=s : resize images to this resolution (default: $resize_to)
  38. -f --formats=s,s : specify more image formats (default: @img_formats)
  39. -k --keep=s : keep only the 'smallest' or 'largest' image from each group
  40. WARNING: option '-k' permanently removes your images!
  41. example:
  42. perl $0 -p 75 -r '8x8' ~/Pictures
  43. EOT
  44. exit($code);
  45. }
  46. GetOptions(
  47. 'p|percentage=i' => \$percentage,
  48. 'r|resize-to=s' => \$resize_to,
  49. 'f|formats=s' => \$img_formats,
  50. 'k|keep=s' => \$keep_only,
  51. 'h|help' => sub { help(0) },
  52. )
  53. or die("Error in command line arguments");
  54. ($width, $height) = split(/\h*x\h*/i, $resize_to);
  55. my $size = $width * $height;
  56. push @img_formats, map { quotemeta } split(/\s*,\s*/, $img_formats);
  57. my $img_formats_re = do {
  58. local $" = '|';
  59. qr/\.(@img_formats)\z/i;
  60. };
  61. #<<<
  62. sub alike_percentage {
  63. ((($_[0] ^. $_[1]) =~ tr/\0//) / $size)**2 * 100;
  64. }
  65. #>>>
  66. sub fingerprint {
  67. my ($image) = @_;
  68. my $img = GD::Image->new($image) // return;
  69. {
  70. my $resized = GD::Image->new($width, $height);
  71. $resized->copyResampled($img, 0, 0, 0, 0, $width, $height, $img->getBounds());
  72. $img = $resized;
  73. }
  74. my @averages;
  75. foreach my $y (0 .. $height - 1) {
  76. foreach my $x (0 .. $width - 1) {
  77. push @averages, sum($img->rgb($img->getPixel($x, $y)))/3;
  78. }
  79. }
  80. my $avg = sum(@averages) / @averages;
  81. join('', map { ($_ < $avg) ? 1 : 0 } @averages);
  82. }
  83. sub find_similar_images(&@) {
  84. my $callback = shift;
  85. my @files;
  86. find {
  87. no_chdir => 1,
  88. wanted => sub {
  89. (/$img_formats_re/o && -f) || return;
  90. push @files,
  91. {
  92. fingerprint => fingerprint($_) // return,
  93. filename => $_,
  94. };
  95. }
  96. } => @_;
  97. #
  98. ## Populate the %alike hash
  99. #
  100. my %alike;
  101. foreach my $i (0 .. $#files - 1) {
  102. for (my $j = $i + 1 ; $j <= $#files ; $j++) {
  103. my $p = alike_percentage($files[$i]{fingerprint}, $files[$j]{fingerprint});
  104. if ($p >= $percentage) {
  105. $alike{$files[$i]{filename}}{$files[$j]{filename}} = $p;
  106. $alike{$files[$j]{filename}}{$files[$i]{filename}} = $p;
  107. }
  108. }
  109. }
  110. #
  111. ## Group the files
  112. #
  113. my @alike;
  114. foreach my $root (
  115. map { $_->[0] }
  116. sort { ($a->[1] <=> $b->[1]) || ($b->[2] <=> $a->[2]) }
  117. map {
  118. my $keys = keys(%{$alike{$_}});
  119. my $avg = sum(values(%{$alike{$_}})) / $keys;
  120. [$_, $keys, $avg]
  121. }
  122. keys %alike
  123. ) {
  124. my @group = keys(%{$alike{$root}});
  125. if (@group) {
  126. my $avg = 0;
  127. $avg += delete($alike{$_}{$root}) for @group;
  128. push @alike, {score => $avg / @group, files => [$root, @group]};
  129. }
  130. }
  131. #
  132. ## Callback each group
  133. #
  134. my %seen;
  135. foreach my $group (sort { $b->{score} <=> $a->{score} } @alike) {
  136. (@{$group->{files}} == grep { $seen{$_}++ } @{$group->{files}}) and next;
  137. $callback->($group->{score}, $group->{files});
  138. }
  139. return 1;
  140. }
  141. @ARGV || help(1);
  142. find_similar_images {
  143. my ($score, $files) = @_;
  144. printf("=> Similarity: %.0f%%\n", $score);
  145. say join("\n", sort @{$files});
  146. say "-" x 80;
  147. if (defined($keep_only)) {
  148. my @existent_files = grep { -f $_ } @$files;
  149. scalar(@existent_files) > 1 or return;
  150. my @sorted_by_size = map { $_->[0] } sort { $a->[1] <=> $b->[1] } map { [$_, -s $_] } @existent_files;
  151. if ($keep_only =~ /large/i) {
  152. pop(@sorted_by_size);
  153. }
  154. elsif ($keep_only =~ /small/i) {
  155. shift(@sorted_by_size);
  156. }
  157. else {
  158. die "error: unknown value <<$keep_only>> for option `-k`!\n";
  159. }
  160. foreach my $file (@sorted_by_size) {
  161. say "Removing: $file";
  162. unlink($file) or warn "Failed to remove: $!";
  163. }
  164. }
  165. } @ARGV;