sim_end_words.pl 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293
  1. #!/usr/bin/perl
  2. # Daniel "Trizen" Șuteu
  3. # License: GPLv3
  4. # Date: 17 April 2012
  5. # https://github.com/trizen
  6. # Group and list words from a wordlist that have similar ending chars
  7. use strict;
  8. use warnings;
  9. use open IO => ':utf8', ':std';
  10. use Getopt::Long qw(GetOptions);
  11. my $min = 4;
  12. my $max = 15;
  13. my $min_words = 2;
  14. my $max_words = 'inf';
  15. my $unique = 0;
  16. GetOptions(
  17. 'end-min|end_min=i' => \$min,
  18. 'end-max|end_max=i' => \$max,
  19. 'group-min|group_min=i' => \$min_words,
  20. 'group-max|group_max=i' => \$max_words,
  21. 'unique!' => \$unique,
  22. )
  23. or die "Error in command-line arguments!";
  24. @ARGV or die <<"HELP";
  25. usage: $0 [options] wordlists
  26. options:
  27. --end-min=i : minimum number of similar characters (default: $min)
  28. --end-max=i : maximum number of similar characters (default: $max)
  29. --group-min=i : minimum number of words per group (default: $min_words)
  30. --group-max=i : maximum number of words per group (default: $max_words)
  31. --unique! : don't use the same word in different groups (default: $unique)
  32. HELP
  33. --$min; # starting with zero
  34. foreach my $file (grep { -f } @ARGV) {
  35. my %table;
  36. open my $fh, '<', $file or do { warn "$0: can't open file $file: $!"; next };
  37. while (defined(my $line = <$fh>)) {
  38. chomp $line;
  39. next if (my $length = length($line)) <= $min;
  40. --$length; # same as $#chars
  41. my @chars = split //, $line;
  42. for (my $i = $length - $min ; $i >= 0 ; --$i) {
  43. push @{$table{join q{}, @chars[$i .. $length]}}, $line;
  44. }
  45. }
  46. close $fh;
  47. my %data;
  48. my %seen;
  49. {
  50. local $, = "\n";
  51. local $\ = "\n";
  52. foreach my $key (
  53. map { $_->[1] }
  54. sort { $b->[0] <=> $a->[0] }
  55. map { [scalar @{$table{$_}} => $_] } keys %table
  56. ) {
  57. next if length($key) > $max;
  58. @{$table{$key}} = do {
  59. my %s;
  60. grep { !$s{$_}++ } @{$table{$key}};
  61. };
  62. my $items = @{$table{$key}};
  63. next if $items < $min_words;
  64. next if $items > $max_words;
  65. if ($unique) {
  66. @{$table{$key}} = grep { not exists $seen{$_} } @{$table{$key}};
  67. @{$table{$key}} or next;
  68. @seen{@{$table{$key}}} = ();
  69. }
  70. #print "\e[1;46m$key\e[0m";
  71. print "\t\t\t==$key==";
  72. print @{$table{$key}};
  73. }
  74. }
  75. }