123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305 |
- #!/usr/bin/perl
- # Author: Trizen
- # Date: 07 September 2023
- # Edit: 11 April 2024
- # https://github.com/trizen
- # Compress/decompress files using LZ77 compression + DEFLATE integers encoding + Burrows-Wheeler Transform (BWT) + Huffman coding.
- # References:
- # Data Compression (Summer 2023) - Lecture 13 - BZip2
- # https://youtube.com/watch?v=cvoZbBZ3M2A
- #
- # Data Compression (Summer 2023) - Lecture 11 - DEFLATE (gzip)
- # https://youtube.com/watch?v=SJPvNi4HrWQ
- use 5.036;
- use Getopt::Std qw(getopts);
- use File::Basename qw(basename);
- use Compression::Util qw(:all);
- use constant {
- PKGNAME => 'LZBWD',
- VERSION => '0.01',
- FORMAT => 'lzbwd',
- COMPRESSED_BYTE => chr(1),
- UNCOMPRESSED_BYTE => chr(0),
- CHUNK_SIZE => 1 << 16, # higher value = better compression
- RANDOM_DATA_THRESHOLD => 1, # in ratio
- MAX_INT => oct('0b' . ('1' x 32)),
- };
- # Container signature
- use constant SIGNATURE => uc(FORMAT) . chr(1);
- # [distance value, offset bits]
- my @DISTANCE_SYMBOLS = (map { [$_, 0] } 0 .. 4);
- until ($DISTANCE_SYMBOLS[-1][0] > MAX_INT) {
- push @DISTANCE_SYMBOLS, [int($DISTANCE_SYMBOLS[-1][0] * (4 / 3)), $DISTANCE_SYMBOLS[-1][1] + 1];
- push @DISTANCE_SYMBOLS, [int($DISTANCE_SYMBOLS[-1][0] * (3 / 2)), $DISTANCE_SYMBOLS[-1][1]];
- }
- sub usage {
- my ($code) = @_;
- print <<"EOH";
- usage: $0 [options] [input file] [output file]
- options:
- -e : extract
- -i <filename> : input filename
- -o <filename> : output filename
- -r : rewrite output
- -v : version number
- -h : this message
- examples:
- $0 document.txt
- $0 document.txt archive.${\FORMAT}
- $0 archive.${\FORMAT} document.txt
- $0 -e -i archive.${\FORMAT} -o document.txt
- EOH
- exit($code // 0);
- }
- sub version {
- printf("%s %s\n", PKGNAME, VERSION);
- exit;
- }
- sub valid_archive {
- my ($fh) = @_;
- if (read($fh, (my $sig), length(SIGNATURE), 0) == length(SIGNATURE)) {
- $sig eq SIGNATURE || return;
- }
- return 1;
- }
- sub main {
- my %opt;
- getopts('ei:o:vhr', \%opt);
- $opt{h} && usage(0);
- $opt{v} && version();
- my ($input, $output) = @ARGV;
- $input //= $opt{i} // usage(2);
- $output //= $opt{o};
- my $ext = qr{\.${\FORMAT}\z}io;
- if ($opt{e} || $input =~ $ext) {
- if (not defined $output) {
- ($output = basename($input)) =~ s{$ext}{}
- || die "$0: no output file specified!\n";
- }
- if (not $opt{r} and -e $output) {
- print "'$output' already exists! -- Replace? [y/N] ";
- <STDIN> =~ /^y/i || exit 17;
- }
- decompress_file($input, $output)
- || die "$0: error: decompression failed!\n";
- }
- elsif ($input !~ $ext || (defined($output) && $output =~ $ext)) {
- $output //= basename($input) . '.' . FORMAT;
- compress_file($input, $output)
- || die "$0: error: compression failed!\n";
- }
- else {
- warn "$0: don't know what to do...\n";
- usage(1);
- }
- }
- sub encode_integers ($integers) {
- my @symbols;
- my $offset_bits = '';
- foreach my $dist (@$integers) {
- foreach my $i (0 .. $#DISTANCE_SYMBOLS) {
- if ($DISTANCE_SYMBOLS[$i][0] > $dist) {
- push @symbols, $i - 1;
- if ($DISTANCE_SYMBOLS[$i - 1][1] > 0) {
- $offset_bits .= sprintf('%0*b', $DISTANCE_SYMBOLS[$i - 1][1], $dist - $DISTANCE_SYMBOLS[$i - 1][0]);
- }
- last;
- }
- }
- }
- return (pack('C*', @symbols), pack('B*', $offset_bits));
- }
- sub decode_integers ($symbols, $fh) {
- my $bits_len = 0;
- foreach my $i (@$symbols) {
- $bits_len += $DISTANCE_SYMBOLS[$i][1];
- }
- my $bits = read_bits($fh, $bits_len);
- my @distances;
- foreach my $i (@$symbols) {
- push @distances, $DISTANCE_SYMBOLS[$i][0] + oct('0b' . substr($bits, 0, $DISTANCE_SYMBOLS[$i][1], ''));
- }
- return \@distances;
- }
- # Compress file
- sub compress_file ($input, $output) {
- open my $fh, '<:raw', $input
- or die "Can't open file <<$input>> for reading: $!";
- my $header = SIGNATURE;
- # Open the output file for writing
- open my $out_fh, '>:raw', $output
- or die "Can't open file <<$output>> for write: $!";
- # Print the header
- print $out_fh $header;
- my $lengths_str = '';
- my $matches_str = '';
- my $uncompressed_str = '';
- my @sizes;
- my @distances_chunk;
- open my $uc_fh, '>:raw', \$uncompressed_str;
- open my $len_fh, '>:raw', \$lengths_str;
- open my $match_fh, '>:raw', \$matches_str;
- my $create_bz2_block = sub {
- scalar(@sizes) > 0 or return;
- print $out_fh COMPRESSED_BYTE;
- print $out_fh delta_encode(\@sizes);
- my ($symbols, $offset_bits) = encode_integers(\@distances_chunk);
- print $out_fh bwt_compress($uncompressed_str);
- print $out_fh bwt_compress($lengths_str);
- print $out_fh bwt_compress($matches_str);
- print $out_fh bwt_compress($symbols);
- print $out_fh bwt_compress($offset_bits);
- @sizes = ();
- @distances_chunk = ();
- open $uc_fh, '>:raw', \$uncompressed_str;
- open $len_fh, '>:raw', \$lengths_str;
- };
- # Compress data
- while (read($fh, (my $chunk), CHUNK_SIZE)) {
- my ($literals, $lengths, $matches, $distances) = lz77_encode($chunk);
- my $est_ratio = length($chunk) / (4 * scalar(@$literals));
- say "Est. ratio: ", $est_ratio, " (", scalar(@$literals), " uncompressed bytes)";
- if ($est_ratio > RANDOM_DATA_THRESHOLD) {
- push(@sizes, scalar(@$literals), scalar(@$lengths), scalar(@$matches), scalar(@$distances));
- print $uc_fh pack('C*', @$literals);
- print $len_fh pack('C*', @$lengths);
- print $match_fh pack('C*', @$matches);
- push @distances_chunk, @$distances;
- }
- else {
- say "Random data detected...";
- $create_bz2_block->();
- print $out_fh UNCOMPRESSED_BYTE;
- print $out_fh create_huffman_entry(string2symbols($chunk));
- }
- if (length($uncompressed_str) >= CHUNK_SIZE) {
- $create_bz2_block->();
- }
- }
- $create_bz2_block->();
- close $out_fh;
- }
- # Decompress file
- sub decompress_file ($input, $output) {
- # Open and validate the input file
- open my $fh, '<:raw', $input
- or die "Can't open file <<$input>> for reading: $!";
- valid_archive($fh) || die "$0: file `$input' is not a \U${\FORMAT}\E v${\VERSION} archive!\n";
- # Open the output file
- open my $out_fh, '>:raw', $output
- or die "Can't open file <<$output>> for writing: $!";
- while (!eof($fh)) {
- my $compression_byte = getc($fh) // die "decompression error";
- if ($compression_byte eq UNCOMPRESSED_BYTE) {
- say "Decoding random data...";
- print $out_fh pack('C*', @{decode_huffman_entry($fh)});
- next;
- }
- elsif ($compression_byte ne COMPRESSED_BYTE) {
- die "decompression error";
- }
- my @sizes = @{delta_decode($fh)};
- my @uncompressed = unpack('C*', bwt_decompress($fh));
- my @lengths = unpack('C*', bwt_decompress($fh));
- my @matches = unpack('C*', bwt_decompress($fh));
- my @symbols = unpack('C*', bwt_decompress($fh));
- my $offset_bits = bwt_decompress($fh);
- open my $offbits_fh, '<:raw', \$offset_bits;
- my @distances = @{decode_integers(\@symbols, $offbits_fh)};
- while (@uncompressed) {
- my $literals_size = shift(@sizes) // die "decompression error";
- my $lengths_size = shift(@sizes) // die "decompression error";
- my $matches_size = shift(@sizes) // die "decompression error";
- my $distances_size = shift(@sizes) // die "decompression error";
- my @uncompressed_chunk = splice(@uncompressed, 0, $literals_size);
- my @lengths_chunk = splice(@lengths, 0, $lengths_size);
- my @matches_chunk = splice(@matches, 0, $matches_size);
- my @distances_chunk = splice(@distances, 0, $distances_size);
- scalar(@uncompressed_chunk) == $literals_size or die "decompression error";
- scalar(@lengths_chunk) == $lengths_size or die "decompression error";
- scalar(@matches_chunk) == $matches_size or die "decompression error";
- scalar(@distances_chunk) == $distances_size or die "decompression error";
- print $out_fh lz77_decode(\@uncompressed_chunk, \@lengths_chunk, \@matches_chunk, \@distances_chunk,);
- }
- }
- close $fh;
- close $out_fh;
- }
- main();
- exit(0);
|