imager_similar_images.pl 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213
  1. #!/usr/bin/perl
  2. # Author: Daniel "Trizen" Șuteu
  3. # License: GPLv3
  4. # Date: 26 August 2015
  5. # Edit: 24 October 2023
  6. # Website: https://github.com/trizen
  7. # Find images that look similar.
  8. # Blog post:
  9. # https://trizenx.blogspot.com/2015/08/finding-similar-images.html
  10. use 5.022;
  11. use strict;
  12. use warnings;
  13. use experimental qw(bitwise);
  14. use Imager qw();
  15. use List::Util qw(sum);
  16. use File::Find qw(find);
  17. use Getopt::Long qw(GetOptions);
  18. my $width = 32;
  19. my $height = 'auto';
  20. my $percentage = 90;
  21. my $keep_only = undef;
  22. my $img_formats = '';
  23. my @img_formats = qw(
  24. jpeg
  25. jpg
  26. png
  27. );
  28. sub help {
  29. my ($code) = @_;
  30. local $" = ",";
  31. print <<"EOT";
  32. usage: $0 [options] [dir]
  33. options:
  34. -p --percentage=i : minimum similarity percentage (default: $percentage)
  35. -w --width=i : resize images to this width (default: $width)
  36. -h --height=i : resize images to this height (default: $height)
  37. -f --formats=s,s : specify more image formats (default: @img_formats)
  38. -k --keep=s : keep only the 'smallest' or 'largest' image from each group
  39. WARNING: option '-k' permanently removes your images!
  40. example:
  41. perl $0 -p 75 -r '8x8' ~/Pictures
  42. EOT
  43. exit($code);
  44. }
  45. GetOptions(
  46. 'p|percentage=i' => \$percentage,
  47. 'w|width=s' => \$width,
  48. 'h|height=s' => \$height,
  49. 'f|formats=s' => \$img_formats,
  50. 'k|keep=s' => \$keep_only,
  51. )
  52. or die("Error in command line arguments");
  53. push @img_formats, map { quotemeta } split(/\s*,\s*/, $img_formats);
  54. my $img_formats_re = do {
  55. local $" = '|';
  56. qr/\.(@img_formats)\z/i;
  57. };
  58. #<<<
  59. sub alike_percentage {
  60. ((($_[0] ^. $_[1]) =~ tr/\0//) / $_[2])**2 * 100;
  61. }
  62. #>>>
  63. sub fingerprint {
  64. my ($image) = @_;
  65. my $img = Imager->new(file => $image) or do {
  66. warn "Failed to load <<$image>>: ", Imager->errstr();
  67. return;
  68. };
  69. if ($height ne 'auto') {
  70. $img = $img->scale(ypixels => $height, qtype => 'preview');
  71. }
  72. else {
  73. $img = $img->scale(xpixels => $width, qtype => 'preview');
  74. }
  75. my ($curr_width, $curr_height) = ($img->getwidth, $img->getheight);
  76. my @averages;
  77. foreach my $y (0 .. $curr_height - 1) {
  78. my @line = $img->getscanline(y => $y);
  79. foreach my $pixel (@line) {
  80. my ($R, $G, $B) = $pixel->rgba;
  81. push @averages, sum($R, $G, $B) / 3;
  82. }
  83. }
  84. my $avg = sum(@averages) / @averages;
  85. [join('', map { ($_ < $avg) ? 1 : 0 } @averages), $curr_width, $curr_height];
  86. }
  87. sub find_similar_images(&@) {
  88. my $callback = shift;
  89. my @files;
  90. find {
  91. no_chdir => 1,
  92. wanted => sub {
  93. (/$img_formats_re/o && -f) || return;
  94. push @files,
  95. {
  96. fingerprint => fingerprint($_) // return,
  97. filename => $_,
  98. };
  99. }
  100. } => @_;
  101. #
  102. ## Populate the %alike hash
  103. #
  104. my %alike;
  105. foreach my $i (0 .. $#files - 1) {
  106. for (my $j = $i + 1 ; $j <= $#files ; $j++) {
  107. my $p = alike_percentage(
  108. $files[$i]{fingerprint}->[0],
  109. $files[$j]{fingerprint}->[0],
  110. sqrt($files[$i]{fingerprint}->[1] * $files[$j]{fingerprint}->[1]) * sqrt($files[$i]{fingerprint}->[2] * $files[$j]{fingerprint}->[2])
  111. );
  112. if ($p >= $percentage) {
  113. $alike{$files[$i]{filename}}{$files[$j]{filename}} = $p;
  114. $alike{$files[$j]{filename}}{$files[$i]{filename}} = $p;
  115. }
  116. }
  117. }
  118. #
  119. ## Group the files
  120. #
  121. my @alike;
  122. foreach my $root (
  123. map { $_->[0] }
  124. sort { ($a->[1] <=> $b->[1]) || ($b->[2] <=> $a->[2]) }
  125. map {
  126. my $keys = keys(%{$alike{$_}});
  127. my $avg = sum(values(%{$alike{$_}})) / $keys;
  128. [$_, $keys, $avg]
  129. }
  130. keys %alike
  131. ) {
  132. my @group = keys(%{$alike{$root}});
  133. if (@group) {
  134. my $avg = 0;
  135. $avg += delete($alike{$_}{$root}) for @group;
  136. push @alike, {score => $avg / @group, files => [$root, @group]};
  137. }
  138. }
  139. #
  140. ## Callback each group
  141. #
  142. my %seen;
  143. foreach my $group (sort { $b->{score} <=> $a->{score} } @alike) {
  144. (@{$group->{files}} == grep { $seen{$_}++ } @{$group->{files}}) and next;
  145. $callback->($group->{score}, $group->{files});
  146. }
  147. return 1;
  148. }
  149. @ARGV || help(1);
  150. find_similar_images {
  151. my ($score, $files) = @_;
  152. printf("=> Similarity: %.0f%%\n", $score);
  153. say join("\n", sort @{$files});
  154. say "-" x 80;
  155. if (defined($keep_only)) {
  156. my @existent_files = grep { -f $_ } @$files;
  157. scalar(@existent_files) > 1 or return;
  158. my @sorted_by_size = map { $_->[0] } sort { $a->[1] <=> $b->[1] } map { [$_, -s $_] } @existent_files;
  159. if ($keep_only =~ /large/i) {
  160. pop(@sorted_by_size);
  161. }
  162. elsif ($keep_only =~ /small/i) {
  163. shift(@sorted_by_size);
  164. }
  165. else {
  166. die "error: unknown value <<$keep_only>> for option `-k`!\n";
  167. }
  168. foreach my $file (@sorted_by_size) {
  169. say "Removing: $file";
  170. unlink($file) or warn "Failed to remove: $!";
  171. }
  172. }
  173. } @ARGV;