magick_similar_images.pl 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211
  1. #!/usr/bin/perl
  2. # Author: Daniel "Trizen" Șuteu
  3. # License: GPLv3
  4. # Date: 26 August 2015
  5. # Edit: 25 October 2023
  6. # Website: https://github.com/trizen
  7. # Find images that look similar.
  8. # Blog post:
  9. # https://trizenx.blogspot.com/2015/08/finding-similar-images.html
  10. use 5.022;
  11. use strict;
  12. use warnings;
  13. use experimental 'bitwise';
  14. use Image::Magick qw();
  15. use List::Util qw(sum);
  16. use File::Find qw(find);
  17. use Getopt::Long qw(GetOptions);
  18. my $width = 32;
  19. my $height = 32;
  20. my $percentage = 90;
  21. my $keep_only = undef;
  22. my $img_formats = '';
  23. my $resize_to = $width . 'x' . $height;
  24. my @img_formats = qw(
  25. jpeg
  26. jpg
  27. png
  28. );
  29. sub help {
  30. my ($code) = @_;
  31. local $" = ",";
  32. print <<"EOT";
  33. usage: $0 [options] [dir]
  34. options:
  35. -p --percentage=i : minimum similarity percentage (default: $percentage)
  36. -r --resize-to=s : resize images to this resolution (default: $resize_to)
  37. -f --formats=s,s : specify more image formats (default: @img_formats)
  38. -k --keep=s : keep only the 'smallest' or 'largest' image from each group
  39. WARNING: option '-k' permanently removes your images!
  40. example:
  41. perl $0 -p 75 -r '8x8' ~/Pictures
  42. EOT
  43. exit($code);
  44. }
  45. GetOptions(
  46. 'p|percentage=i' => \$percentage,
  47. 'r|resize-to=s' => \$resize_to,
  48. 'f|formats=s' => \$img_formats,
  49. 'k|keep=s' => \$keep_only,
  50. 'h|help' => sub { help(0) },
  51. )
  52. or die("Error in command line arguments");
  53. ($width, $height) = split(/\h*x\h*/i, $resize_to);
  54. my $size = $width * $height;
  55. push @img_formats, map { quotemeta } split(/\s*,\s*/, $img_formats);
  56. my $img_formats_re = do {
  57. local $" = '|';
  58. qr/\.(@img_formats)\z/i;
  59. };
  60. #<<<
  61. sub alike_percentage {
  62. ((($_[0] ^. $_[1]) =~ tr/\0//) / $size)**2 * 100;
  63. }
  64. #>>>
  65. sub fingerprint {
  66. my ($image) = @_;
  67. my $img = Image::Magick->new;
  68. $img->Read(filename => $image) && return;
  69. $img->AdaptiveResize(width => $width, height => $height) && return; # balanced
  70. ## $img->Resize(width => $width, height => $height) && return; # better, but slower
  71. ## $img->Resample(width => $width, height => $height) && return; # faster, but worse
  72. my @pixels = $img->GetPixels(
  73. map => 'RGB',
  74. x => 0,
  75. y => 0,
  76. width => $width,
  77. height => $height,
  78. normalize => 1,
  79. );
  80. my @averages;
  81. while (@pixels) {
  82. push @averages, sum(splice(@pixels, 0, 3))/3;
  83. }
  84. my $avg = sum(@averages) / @averages;
  85. join('', map { ($_ < $avg) ? 1 : 0 } @averages);
  86. }
  87. sub find_similar_images(&@) {
  88. my $callback = shift;
  89. my @files;
  90. find {
  91. no_chdir => 1,
  92. wanted => sub {
  93. (/$img_formats_re/o && -f) || return;
  94. push @files,
  95. {
  96. fingerprint => fingerprint($_) // return,
  97. filename => $_,
  98. };
  99. }
  100. } => @_;
  101. #
  102. ## Populate the %alike hash
  103. #
  104. my %alike;
  105. foreach my $i (0 .. $#files - 1) {
  106. for (my $j = $i + 1 ; $j <= $#files ; $j++) {
  107. my $p = alike_percentage($files[$i]{fingerprint}, $files[$j]{fingerprint});
  108. if ($p >= $percentage) {
  109. $alike{$files[$i]{filename}}{$files[$j]{filename}} = $p;
  110. $alike{$files[$j]{filename}}{$files[$i]{filename}} = $p;
  111. }
  112. }
  113. }
  114. #
  115. ## Group the files
  116. #
  117. my @alike;
  118. foreach my $root (
  119. map { $_->[0] }
  120. sort { ($a->[1] <=> $b->[1]) || ($b->[2] <=> $a->[2]) }
  121. map {
  122. my $keys = keys(%{$alike{$_}});
  123. my $avg = sum(values(%{$alike{$_}})) / $keys;
  124. [$_, $keys, $avg]
  125. }
  126. keys %alike
  127. ) {
  128. my @group = keys(%{$alike{$root}});
  129. if (@group) {
  130. my $avg = 0;
  131. $avg += delete($alike{$_}{$root}) for @group;
  132. push @alike, {score => $avg / @group, files => [$root, @group]};
  133. }
  134. }
  135. #
  136. ## Callback each group
  137. #
  138. my %seen;
  139. foreach my $group (sort { $b->{score} <=> $a->{score} } @alike) {
  140. (@{$group->{files}} == grep { $seen{$_}++ } @{$group->{files}}) and next;
  141. $callback->($group->{score}, $group->{files});
  142. }
  143. return 1;
  144. }
  145. @ARGV || help(1);
  146. find_similar_images {
  147. my ($score, $files) = @_;
  148. printf("=> Similarity: %.0f%%\n", $score);
  149. say join("\n", sort @{$files});
  150. say "-" x 80;
  151. if (defined($keep_only)) {
  152. my @existent_files = grep { -f $_ } @$files;
  153. scalar(@existent_files) > 1 or return;
  154. my @sorted_by_size = map { $_->[0] } sort { $a->[1] <=> $b->[1] } map { [$_, -s $_] } @existent_files;
  155. if ($keep_only =~ /large/i) {
  156. pop(@sorted_by_size);
  157. }
  158. elsif ($keep_only =~ /small/i) {
  159. shift(@sorted_by_size);
  160. }
  161. else {
  162. die "error: unknown value <<$keep_only>> for option `-k`!\n";
  163. }
  164. foreach my $file (@sorted_by_size) {
  165. say "Removing: $file";
  166. unlink($file) or warn "Failed to remove: $!";
  167. }
  168. }
  169. } @ARGV;