regex-dna.pl 1.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950
  1. # The Computer Language Benchmarks Game
  2. # http://shootout.alioth.debian.org/
  3. # contributed by Danny Sauer
  4. # completely rewritten and
  5. # cleaned up for speed and fun by Mirco Wahab
  6. # improved STDIN read, regex clean up by Jake Berner
  7. use strict;
  8. use warnings;
  9. my $l_file = -s STDIN;
  10. my $content; read STDIN, $content, $l_file;
  11. # this is significantly faster than using <> in this case
  12. my $dispose = qr/(^>.*)?\n/m; # slight performance gain here
  13. $content =~ s/$dispose//g;
  14. my $l_code = length $content;
  15. my @seq = ( 'agggtaaa|tttaccct',
  16. '[cgt]gggtaaa|tttaccc[acg]',
  17. 'a[act]ggtaaa|tttacc[agt]t',
  18. 'ag[act]gtaaa|tttac[agt]ct',
  19. 'agg[act]taaa|ttta[agt]cct',
  20. 'aggg[acg]aaa|ttt[cgt]ccct',
  21. 'agggt[cgt]aa|tt[acg]accct',
  22. 'agggta[cgt]a|t[acg]taccct',
  23. 'agggtaa[cgt]|[acg]ttaccct' );
  24. my @cnt = (0) x @seq;
  25. for my $k (0..$#seq) {
  26. ++$cnt[$k] while $content=~/$seq[$k]/gi;
  27. printf "$seq[$k] $cnt[$k]\n"
  28. }
  29. my %iub = ( B => '(c|g|t)', D => '(a|g|t)',
  30. H => '(a|c|t)', K => '(g|t)', M => '(a|c)',
  31. N => '(a|c|g|t)', R => '(a|g)', S => '(c|g)',
  32. V => '(a|c|g)', W => '(a|t)', Y => '(c|t)' );
  33. # using $& and no submatch marginally improves the
  34. # speed here, but mentioning $& causes perl to
  35. # define that value for the @seq patterns too, which
  36. # slows those down considerably. No change.
  37. my $findiub = '(['.(join '', keys %iub).'])';
  38. $content =~ s/$findiub/$iub{$1}/g;
  39. printf "\n%d\n%d\n%d\n", $l_file, $l_code, length $content;