lzbw_file_compression.pl 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248
  1. #!/usr/bin/perl
  2. # Author: Trizen
  3. # Date: 05 September 2023
  4. # Edit: 11 April 2024
  5. # https://github.com/trizen
  6. # Compress/decompress files using LZ77 compression + fixed-width integers encoding + Burrows-Wheeler Transform (BWT) + Huffman coding.
  7. # References:
  8. # Data Compression (Summer 2023) - Lecture 13 - BZip2
  9. # https://youtube.com/watch?v=cvoZbBZ3M2A
  10. use 5.036;
  11. use Getopt::Std qw(getopts);
  12. use File::Basename qw(basename);
  13. use Compression::Util qw(:all);
  14. use constant {
  15. PKGNAME => 'LZBW',
  16. VERSION => '0.01',
  17. FORMAT => 'lzbw',
  18. COMPRESSED_BYTE => chr(1),
  19. UNCOMPRESSED_BYTE => chr(0),
  20. CHUNK_SIZE => 1 << 16, # higher value = better compression
  21. RANDOM_DATA_THRESHOLD => 1, # in ratio
  22. };
  23. # Container signature
  24. use constant SIGNATURE => uc(FORMAT) . chr(1);
  25. sub usage {
  26. my ($code) = @_;
  27. print <<"EOH";
  28. usage: $0 [options] [input file] [output file]
  29. options:
  30. -e : extract
  31. -i <filename> : input filename
  32. -o <filename> : output filename
  33. -r : rewrite output
  34. -v : version number
  35. -h : this message
  36. examples:
  37. $0 document.txt
  38. $0 document.txt archive.${\FORMAT}
  39. $0 archive.${\FORMAT} document.txt
  40. $0 -e -i archive.${\FORMAT} -o document.txt
  41. EOH
  42. exit($code // 0);
  43. }
  44. sub version {
  45. printf("%s %s\n", PKGNAME, VERSION);
  46. exit;
  47. }
  48. sub valid_archive {
  49. my ($fh) = @_;
  50. if (read($fh, (my $sig), length(SIGNATURE), 0) == length(SIGNATURE)) {
  51. $sig eq SIGNATURE || return;
  52. }
  53. return 1;
  54. }
  55. sub main {
  56. my %opt;
  57. getopts('ei:o:vhr', \%opt);
  58. $opt{h} && usage(0);
  59. $opt{v} && version();
  60. my ($input, $output) = @ARGV;
  61. $input //= $opt{i} // usage(2);
  62. $output //= $opt{o};
  63. my $ext = qr{\.${\FORMAT}\z}io;
  64. if ($opt{e} || $input =~ $ext) {
  65. if (not defined $output) {
  66. ($output = basename($input)) =~ s{$ext}{}
  67. || die "$0: no output file specified!\n";
  68. }
  69. if (not $opt{r} and -e $output) {
  70. print "'$output' already exists! -- Replace? [y/N] ";
  71. <STDIN> =~ /^y/i || exit 17;
  72. }
  73. decompress_file($input, $output)
  74. || die "$0: error: decompression failed!\n";
  75. }
  76. elsif ($input !~ $ext || (defined($output) && $output =~ $ext)) {
  77. $output //= basename($input) . '.' . FORMAT;
  78. compress_file($input, $output)
  79. || die "$0: error: compression failed!\n";
  80. }
  81. else {
  82. warn "$0: don't know what to do...\n";
  83. usage(1);
  84. }
  85. }
  86. # Compress file
  87. sub compress_file ($input, $output) {
  88. open my $fh, '<:raw', $input
  89. or die "Can't open file <<$input>> for reading: $!";
  90. my $header = SIGNATURE;
  91. # Open the output file for writing
  92. open my $out_fh, '>:raw', $output
  93. or die "Can't open file <<$output>> for write: $!";
  94. # Print the header
  95. print $out_fh $header;
  96. my $lengths_str = '';
  97. my $matches_str = '';
  98. my $uncompressed_str = '';
  99. my @sizes;
  100. my @distances_block;
  101. open my $uc_fh, '>:raw', \$uncompressed_str;
  102. open my $len_fh, '>:raw', \$lengths_str;
  103. open my $match_fh, '>:raw', \$matches_str;
  104. my $create_bz2_block = sub {
  105. scalar(@sizes) > 0 or return;
  106. print $out_fh COMPRESSED_BYTE;
  107. print $out_fh delta_encode(\@sizes);
  108. print $out_fh bwt_compress($uncompressed_str);
  109. print $out_fh bwt_compress($lengths_str);
  110. print $out_fh bwt_compress($matches_str);
  111. print $out_fh bwt_compress(abc_encode(\@distances_block));
  112. @sizes = ();
  113. @distances_block = ();
  114. open $uc_fh, '>:raw', \$uncompressed_str;
  115. open $len_fh, '>:raw', \$lengths_str;
  116. };
  117. # Compress data
  118. while (read($fh, (my $chunk), CHUNK_SIZE)) {
  119. my ($literals, $lengths, $matches, $distances) = lz77_encode($chunk);
  120. my $est_ratio = length($chunk) / (4 * scalar(@$literals));
  121. say "Est. ratio: ", $est_ratio, " (", scalar(@$literals), " uncompressed bytes)";
  122. if ($est_ratio > RANDOM_DATA_THRESHOLD) {
  123. push(@sizes, scalar(@$literals), scalar(@$lengths), scalar(@$matches), scalar(@$distances));
  124. print $uc_fh pack('C*', @$literals);
  125. print $len_fh pack('C*', @$lengths);
  126. print $match_fh pack('C*', @$matches);
  127. push @distances_block, @$distances;
  128. }
  129. else {
  130. say "Random data detected...";
  131. $create_bz2_block->();
  132. print $out_fh UNCOMPRESSED_BYTE;
  133. print $out_fh create_huffman_entry(string2symbols($chunk));
  134. }
  135. if (length($uncompressed_str) >= CHUNK_SIZE) {
  136. $create_bz2_block->();
  137. }
  138. }
  139. $create_bz2_block->();
  140. close $out_fh;
  141. }
  142. # Decompress file
  143. sub decompress_file ($input, $output) {
  144. # Open and validate the input file
  145. open my $fh, '<:raw', $input
  146. or die "Can't open file <<$input>> for reading: $!";
  147. valid_archive($fh) || die "$0: file `$input' is not a \U${\FORMAT}\E v${\VERSION} archive!\n";
  148. # Open the output file
  149. open my $out_fh, '>:raw', $output
  150. or die "Can't open file <<$output>> for writing: $!";
  151. while (!eof($fh)) {
  152. my $compression_byte = getc($fh) // die "decompression error";
  153. if ($compression_byte eq UNCOMPRESSED_BYTE) {
  154. say "Decoding random data...";
  155. print $out_fh pack('C*', @{decode_huffman_entry($fh)});
  156. next;
  157. }
  158. elsif ($compression_byte ne COMPRESSED_BYTE) {
  159. die "decompression error";
  160. }
  161. my @sizes = @{delta_decode($fh)};
  162. my @uncompressed = unpack('C*', bwt_decompress($fh));
  163. my @lengths = unpack('C*', bwt_decompress($fh));
  164. my @matches = unpack('C*', bwt_decompress($fh));
  165. my @distances = @{abc_decode(bwt_decompress($fh))};
  166. while (@uncompressed) {
  167. my $literals_size = shift(@sizes) // die "decompression error";
  168. my $lengths_size = shift(@sizes) // die "decompression error";
  169. my $matches_size = shift(@sizes) // die "decompression error";
  170. my $distances_size = shift(@sizes) // die "decompression error";
  171. my @uncompressed_chunk = splice(@uncompressed, 0, $literals_size);
  172. my @lengths_chunk = splice(@lengths, 0, $lengths_size);
  173. my @matches_chunk = splice(@matches, 0, $matches_size);
  174. my @distances_chunk = splice(@distances, 0, $distances_size);
  175. scalar(@uncompressed_chunk) == $literals_size or die "decompression error";
  176. scalar(@lengths_chunk) == $lengths_size or die "decompression error";
  177. scalar(@matches_chunk) == $matches_size or die "decompression error";
  178. scalar(@distances_chunk) == $distances_size or die "decompression error";
  179. print $out_fh lz77_decode(\@uncompressed_chunk, \@lengths_chunk, \@matches_chunk, \@distances_chunk,);
  180. }
  181. }
  182. close $fh;
  183. close $out_fh;
  184. }
  185. main();
  186. exit(0);