123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328 |
- #!/usr/bin/perl
- # Author: Daniel "Trizen" Șuteu
- # License: GPLv3
- # Date: 22 March 2015
- # Website: https://github.com/trizen
- # Find similar audio files by comparing their waveforms.
- # Review:
- # https://trizenx.blogspot.com/2015/03/similar-audio-files.html
- #
- ## The waveform is processed block by block:
- # _________________________________________
- # |_____|_____|_____|_____|_____|_____|_____|
- # |_____|_____|_____|_____|_____|_____|_____|
- # |_____|_____|_____|_____|_____|_____|_____|
- # |_____|_____|_____|_____|_____|_____|_____|
- #
- # Each block has a distinct number of white pixels, which are collected
- # inside an array and constitute the unique fingerprint of the waveform.
- #
- # Now, each block value is compared with the corresponding value
- # of another fingerprint. If the difference from all blocks is within
- # the allowed deviation, then the audio files are marked as similar.
- #
- # In the end, the similar files are reported to the standard output.
- # Requirements:
- # - ffmpeg: https://ffmpeg.org/
- # - wav2png: https://github.com/beschulz/wav2png
- use utf8;
- use 5.010;
- use strict;
- use autodie;
- use warnings;
- require GD;
- GD::Image->trueColor(1);
- require GDBM_File;
- use List::Util qw(sum);
- use Getopt::Long qw(GetOptions);
- use File::Find qw(find);
- use File::Temp qw(tempdir);
- use File::Path qw(make_path);
- use File::Spec::Functions qw(catfile catdir);
- require Digest::MD5;
- my $ctx = Digest::MD5->new;
- my $pkgname = 'wave-cmp';
- my $version = 0.01;
- my $deviation = 5;
- my ($width, $height) = (1800, 300);
- my ($div_x, $div_y) = (10, 2);
- sub help {
- my ($code) = @_;
- print <<"EOT";
- usage: $0 [options] [dirs|files]
- => Waveform generation
- -w --width=i : width of the waveform (default: $width)
- -h --height=i : height of the waveform (default: $height)
- => Waveform processing
- -x --x-div=i : divisions along the X-axis (default: $div_x)
- -y --y-div=i : divisions along the Y-axis (default: $div_y)
- -d --deviation=i : tolerance deviation value (default: $deviation)
- --help : print this message and exit
- --version : print the version number and exit
- example:
- $0 --deviation=6 ~/Music
- EOT
- exit($code);
- }
- sub version {
- print "$pkgname $version\n";
- exit 0;
- }
- GetOptions(
- 'w|width=i' => \$width,
- 'h|height=i' => \$height,
- 'x|x-div=i' => \$div_x,
- 'y|y-div=i' => \$div_y,
- 'd|deviation=i' => \$deviation,
- 'help' => sub { help(0) },
- 'v|version' => \&version,
- )
- or die("Error in command line arguments");
- my $sq_x = int($width / $div_x);
- my $sq_y = int($height / $div_y);
- my $limit_x = $width - $sq_x;
- my $limit_y = int($height / 2) - $sq_y; # analyze only the first half
- # Source: https://en.wikipedia.org/wiki/Audio_file_format#List_of_formats
- my @audio_formats = qw(
- 3gp
- act
- aiff
- aac
- amr
- au
- awb
- dct
- dss
- flac
- gsm
- m4a
- m4p
- mp3
- mpc
- ogg oga
- opus
- ra rm
- raw
- sln
- tta
- vox
- wav
- wma
- wv
- webm
- );
- my $audio_formats_re = do {
- local $" = '|';
- qr/\.(?:@audio_formats)\z/i;
- };
- my $home_dir =
- $ENV{HOME}
- || $ENV{LOGDIR}
- || (getpwuid($<))[7]
- || `echo -n ~`;
- my $xdg_config_home = catdir($home_dir, '.config');
- my $cache_dir = catdir($xdg_config_home, $pkgname);
- my $cache_db = catfile($cache_dir, 'fp.db');
- if (not -d $cache_dir) {
- make_path($cache_dir);
- }
- my $tmpdir = tempdir(CLEANUP => 1);
- tie my %db, 'GDBM_File', $cache_db, &GDBM_File::GDBM_WRCREAT, 0640;
- #
- #-- execute the ffmpeg and wave2png commands and return the waveform PNG data
- #
- sub generate_waveform {
- my ($file, $output) = @_;
- #<<<
- # Using sox (currently broken)
- # return scalar `sox \Q$file\E -q --norm -V0 --multi-threaded -t wav --encoding signed-integer - | wav2png -w $width -h $height -f ffffffff -b 00000000 -o /dev/stdout /dev/stdin`;
- #>>>
- my $tmpfile = catfile($tmpdir, $file . '.wav');
- system("ffmpeg", "-loglevel", "quiet", "-i", $file, $tmpfile);
- $? == 0 or return;
- my $waveform = `wav2png -w $width -h $height -f 000000ff -b ffffff00 -o /dev/stdout \Q$tmpfile\E`;
- unlink($tmpfile);
- return $waveform;
- }
- #
- #-- return the md5 hex digest of the content of a file
- #
- sub md5_file {
- my ($file) = @_;
- open my $fh, '<:raw', $file;
- $ctx->addfile($fh);
- $ctx->hexdigest;
- }
- #
- #-- take image data as input and return a fingerprint array ref
- #
- sub generate_fingerprint {
- my ($image_data) = @_;
- $image_data eq '' and return;
- state %rgb_cache; # cache the RGB values of pixels
- my @fingerprint;
- my $image = GD::Image->new($image_data) // return;
- for (my $i = 0 ; $i <= $limit_x ; $i += $sq_x) {
- for (my $j = 0 ; $j <= $limit_y ; $j += $sq_y) {
- my $fill = 0;
- foreach my $x ($i .. $i + $sq_x - 1) {
- foreach my $y ($j .. $j + $sq_y - 1) {
- my $index = $image->getPixel($x, $y);
- my $rgb = $rgb_cache{$index} //= [$image->rgb($index)];
- $fill++ if $rgb->[0] == 255; # check only the value of red
- }
- }
- push @fingerprint, $fill;
- }
- }
- return \@fingerprint;
- }
- #
- #-- fetch or generate the fingerprint for a given audio file
- #
- sub fingerprint {
- my ($audio_file) = @_;
- state $local_cache = {};
- return $local_cache->{$audio_file}
- if exists $local_cache->{$audio_file};
- my $md5 = md5_file($audio_file);
- my $key = "$width/$height/$div_x/$div_y/$md5";
- if (not exists $db{$key}) {
- my $image_data = generate_waveform($audio_file) // return;
- my $fingerprint = generate_fingerprint($image_data) // return;
- $db{$key} = join(':', @{$fingerprint});
- return ($local_cache->{$audio_file} = $fingerprint);
- }
- $local_cache->{$audio_file} //= [split /:/, $db{$key}];
- }
- #
- #-- compare two fingerprints and return true if they are alike
- #
- sub alike_fingerprints {
- my ($a1, $a2) = @_;
- foreach my $i (0 .. $#{$a1}) {
- my $value = abs($a1->[$i] - $a2->[$i]) / ($sq_x * $sq_y) * 100;
- return if $value > $deviation;
- }
- return 1;
- }
- #
- #-- compare two audio files and return true if they are alike
- #
- sub alike_files {
- my ($file1, $file2) = @_;
- my $fp1 = fingerprint($file1) // return;
- my $fp2 = fingerprint($file2) // return;
- alike_fingerprints($fp1, $fp2);
- }
- #
- #-- find and call $code with a group of similar audio files
- #
- sub find_similar_audio_files {
- my $code = shift;
- my @files;
- find {
- no_chdir => 1,
- wanted => sub {
- /$audio_formats_re/ || return;
- lstat;
- (-f _) && (not -l _) && push @files, $_;
- }
- } => @_;
- my %groups;
- my %seen;
- my $limit = $#files;
- foreach my $i (0 .. $limit) {
- foreach my $j ($i + 1 .. $limit) {
- next if $seen{$files[$j]};
- if (alike_files($files[$i], $files[$j])) {
- $groups{$i} //= [$files[$i]];
- $seen{$files[$j]}++;
- push @{$groups{$i}}, $files[$j];
- }
- }
- if (exists $groups{$i}) {
- $code->(delete $groups{$i});
- }
- }
- }
- #
- #-- print a group of files followed by an horizontal line
- #
- sub print_group {
- my ($group) = @_;
- foreach my $file (sort { (lc($a) cmp lc($b)) || ($a cmp $b) } @{$group}) {
- say $file;
- }
- say "-" x 80;
- }
- @ARGV || help(2);
- find_similar_audio_files(\&print_group, @ARGV);
|