123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340 |
- #!/usr/bin/perl
- # Author: Trizen
- # Date: 11 May 2024
- # Edit: 02 June 2024
- # https://github.com/trizen
- # Compress/decompress files using LZ77 compression (LZSS variant with hash tables), using a byte-aligned encoding, similar to LZ4.
- # References:
- # https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md
- # https://github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md
- use 5.036;
- use Getopt::Std qw(getopts);
- use File::Basename qw(basename);
- use constant {
- PKGNAME => 'LZB2',
- VERSION => '0.01',
- FORMAT => 'lzb2',
- MIN_MATCH_LEN => 4, # minimum match length
- MAX_MATCH_LEN => ~0, # maximum match length
- MAX_MATCH_DIST => (1 << 16) - 1, # maximum match distance
- MAX_CHAIN_LEN => 48, # higher value = better compression
- CHUNK_SIZE => 1 << 18,
- };
- # Container signature
- use constant SIGNATURE => uc(FORMAT) . chr(1);
- sub usage {
- my ($code) = @_;
- print <<"EOH";
- usage: $0 [options] [input file] [output file]
- options:
- -e : extract
- -i <filename> : input filename
- -o <filename> : output filename
- -r : rewrite output
- -v : version number
- -h : this message
- examples:
- $0 document.txt
- $0 document.txt archive.${\FORMAT}
- $0 archive.${\FORMAT} document.txt
- $0 -e -i archive.${\FORMAT} -o document.txt
- EOH
- exit($code // 0);
- }
- sub version {
- printf("%s %s\n", PKGNAME, VERSION);
- exit;
- }
- sub valid_archive {
- my ($fh) = @_;
- if (read($fh, (my $sig), length(SIGNATURE), 0) == length(SIGNATURE)) {
- $sig eq SIGNATURE || return;
- }
- return 1;
- }
- sub main {
- my %opt;
- getopts('ei:o:vhr', \%opt);
- $opt{h} && usage(0);
- $opt{v} && version();
- my ($input, $output) = @ARGV;
- $input //= $opt{i} // usage(2);
- $output //= $opt{o};
- my $ext = qr{\.${\FORMAT}\z}io;
- if ($opt{e} || $input =~ $ext) {
- if (not defined $output) {
- ($output = basename($input)) =~ s{$ext}{}
- || die "$0: no output file specified!\n";
- }
- if (not $opt{r} and -e $output) {
- print "'$output' already exists! -- Replace? [y/N] ";
- <STDIN> =~ /^y/i || exit 17;
- }
- decompress_file($input, $output)
- || die "$0: error: decompression failed!\n";
- }
- elsif ($input !~ $ext || (defined($output) && $output =~ $ext)) {
- $output //= basename($input) . '.' . FORMAT;
- compress_file($input, $output)
- || die "$0: error: compression failed!\n";
- }
- else {
- warn "$0: don't know what to do...\n";
- usage(1);
- }
- }
- sub lzss_encode($str) {
- my $la = 0;
- my @symbols = unpack('C*', $str);
- my $end = $#symbols;
- my $min_len = MIN_MATCH_LEN; # minimum match length
- my $max_len = MAX_MATCH_LEN; # maximum match length
- my $max_dist = MAX_MATCH_DIST; # maximum match distance
- my $max_chain_len = MAX_CHAIN_LEN; # how many recent positions to keep track of
- my (@literals, @distances, @lengths, %table);
- while ($la <= $end) {
- my $best_n = 1;
- my $best_p = $la;
- my $lookahead = substr($str, $la, $min_len);
- if (exists($table{$lookahead})) {
- foreach my $p (@{$table{$lookahead}}) {
- last if ($la - $p > $max_dist);
- my $n = $min_len;
- while ($n <= $max_len and $la + $n <= $end and $symbols[$la + $n - 1] == $symbols[$p + $n - 1]) {
- ++$n;
- }
- if ($n > $best_n) {
- $best_p = $p;
- $best_n = $n;
- }
- }
- my $matched = substr($str, $la, $best_n);
- foreach my $i (0 .. length($matched) - $min_len) {
- my $key = substr($matched, $i, $min_len);
- unshift @{$table{$key}}, $la + $i;
- if (scalar(@{$table{$key}}) > $max_chain_len) {
- pop @{$table{$key}};
- }
- }
- }
- if ($best_n == 1) {
- $table{$lookahead} = [$la];
- }
- if ($best_n > $min_len) {
- push @lengths, $best_n - 1;
- push @distances, $la - $best_p;
- push @literals, undef;
- $la += $best_n - 1;
- }
- else {
- push @lengths, (0) x $best_n;
- push @distances, (0) x $best_n;
- push @literals, @symbols[$best_p .. $best_p + $best_n - 1];
- $la += $best_n;
- }
- }
- return (\@literals, \@distances, \@lengths);
- }
- sub compression($chunk, $out_fh) {
- my ($literals, $distances, $lengths) = lzss_encode($chunk);
- my $literals_end = $#{$literals};
- for (my $i = 0 ; $i <= $literals_end ; ++$i) {
- my @uncompressed;
- while ($i <= $literals_end and defined($literals->[$i])) {
- push @uncompressed, $literals->[$i];
- ++$i;
- }
- my $literals_string = pack('C*', @uncompressed);
- my $literals_length = scalar(@uncompressed);
- my $dist = $distances->[$i] // 0;
- my $match_len = $lengths->[$i] // 0;
- my $len_byte = 0;
- $len_byte |= ($literals_length >= 7 ? 7 : $literals_length) << 5;
- $len_byte |= ($match_len >= 31 ? 31 : $match_len);
- $literals_length -= 7;
- $match_len -= 31;
- print $out_fh chr($len_byte);
- while ($literals_length >= 0) {
- print $out_fh chr($literals_length >= 255 ? 255 : $literals_length);
- $literals_length -= 255;
- }
- print $out_fh $literals_string;
- while ($match_len >= 0) {
- print $out_fh chr($match_len >= 255 ? 255 : $match_len);
- $match_len -= 255;
- }
- if ($dist >= 1 << 16) {
- die "Too large distance: $dist";
- }
- print $out_fh pack('B*', sprintf('%016b', $dist));
- }
- }
- sub decompression($fh, $out_fh) {
- my $search_window = '';
- while (!eof($fh)) {
- my $len_byte = ord(getc($fh));
- my $literals_length = $len_byte >> 5;
- my $match_len = $len_byte & 0b11111;
- if ($literals_length == 7) {
- while (1) {
- my $byte_len = ord(getc($fh));
- $literals_length += $byte_len;
- last if $byte_len != 255;
- }
- }
- my $literals = '';
- if ($literals_length > 0) {
- read($fh, $literals, $literals_length);
- }
- if ($match_len == 31) {
- while (1) {
- my $byte_len = ord(getc($fh));
- $match_len += $byte_len;
- last if $byte_len != 255;
- }
- }
- my $offset = oct('0b' . unpack('B*', getc($fh) . getc($fh)));
- $search_window .= $literals;
- if ($offset == 1) {
- $search_window .= substr($search_window, -1) x $match_len;
- }
- elsif ($offset >= $match_len) { # non-overlapping matches
- $search_window .= substr($search_window, length($search_window) - $offset, $match_len);
- }
- else { # overlapping matches
- foreach my $i (1 .. $match_len) {
- $search_window .= substr($search_window, length($search_window) - $offset, 1);
- }
- }
- print $out_fh substr($search_window, -($match_len + $literals_length));
- $search_window = substr($search_window, -MAX_MATCH_DIST) if (length($search_window) > 2 * MAX_MATCH_DIST);
- }
- }
- # Compress file
- sub compress_file ($input, $output) {
- open my $fh, '<:raw', $input
- or die "Can't open file <<$input>> for reading: $!";
- my $header = SIGNATURE;
- # Open the output file for writing
- open my $out_fh, '>:raw', $output
- or die "Can't open file <<$output>> for write: $!";
- # Print the header
- print $out_fh $header;
- # Compress data
- while (read($fh, (my $chunk), CHUNK_SIZE)) {
- compression($chunk, $out_fh);
- }
- # Close the file
- close $out_fh;
- }
- # Decompress file
- sub decompress_file ($input, $output) {
- # Open and validate the input file
- open my $fh, '<:raw', $input
- or die "Can't open file <<$input>> for reading: $!";
- valid_archive($fh) || die "$0: file `$input' is not a \U${\FORMAT}\E v${\VERSION} archive!\n";
- # Open the output file
- open my $out_fh, '>:raw', $output
- or die "Can't open file <<$output>> for writing: $!";
- while (!eof($fh)) {
- decompression($fh, $out_fh);
- }
- # Close the file
- close $fh;
- close $out_fh;
- }
- main();
- exit(0);
|