lzh_file_compression.pl 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411
  1. #!/usr/bin/perl
  2. # Author: Trizen
  3. # Date: 15 December 2022
  4. # Edit: 19 March 2024
  5. # https://github.com/trizen
  6. # Compress/decompress files using LZ77 compression + Huffman coding.
  7. use 5.020;
  8. use strict;
  9. use warnings;
  10. use experimental qw(signatures);
  11. use Getopt::Std qw(getopts);
  12. use File::Basename qw(basename);
  13. use List::Util qw(max);
  14. use constant {
  15. PKGNAME => 'LZH',
  16. VERSION => '0.02',
  17. FORMAT => 'lzh',
  18. CHUNK_SIZE => 1 << 16,
  19. };
  20. # Container signature
  21. use constant SIGNATURE => uc(FORMAT) . chr(2);
  22. sub usage {
  23. my ($code) = @_;
  24. print <<"EOH";
  25. usage: $0 [options] [input file] [output file]
  26. options:
  27. -e : extract
  28. -i <filename> : input filename
  29. -o <filename> : output filename
  30. -r : rewrite output
  31. -v : version number
  32. -h : this message
  33. examples:
  34. $0 document.txt
  35. $0 document.txt archive.${\FORMAT}
  36. $0 archive.${\FORMAT} document.txt
  37. $0 -e -i archive.${\FORMAT} -o document.txt
  38. EOH
  39. exit($code // 0);
  40. }
  41. sub version {
  42. printf("%s %s\n", PKGNAME, VERSION);
  43. exit;
  44. }
  45. sub valid_archive {
  46. my ($fh) = @_;
  47. if (read($fh, (my $sig), length(SIGNATURE), 0) == length(SIGNATURE)) {
  48. $sig eq SIGNATURE || return;
  49. }
  50. return 1;
  51. }
  52. sub main {
  53. my %opt;
  54. getopts('ei:o:vhr', \%opt);
  55. $opt{h} && usage(0);
  56. $opt{v} && version();
  57. my ($input, $output) = @ARGV;
  58. $input //= $opt{i} // usage(2);
  59. $output //= $opt{o};
  60. my $ext = qr{\.${\FORMAT}\z}io;
  61. if ($opt{e} || $input =~ $ext) {
  62. if (not defined $output) {
  63. ($output = basename($input)) =~ s{$ext}{}
  64. || die "$0: no output file specified!\n";
  65. }
  66. if (not $opt{r} and -e $output) {
  67. print "'$output' already exists! -- Replace? [y/N] ";
  68. <STDIN> =~ /^y/i || exit 17;
  69. }
  70. decompress_file($input, $output)
  71. || die "$0: error: decompression failed!\n";
  72. }
  73. elsif ($input !~ $ext || (defined($output) && $output =~ $ext)) {
  74. $output //= basename($input) . '.' . FORMAT;
  75. compress_file($input, $output)
  76. || die "$0: error: compression failed!\n";
  77. }
  78. else {
  79. warn "$0: don't know what to do...\n";
  80. usage(1);
  81. }
  82. }
  83. sub read_bit ($fh, $bitstring) {
  84. if (($$bitstring // '') eq '') {
  85. $$bitstring = unpack('b*', getc($fh) // return undef);
  86. }
  87. chop($$bitstring);
  88. }
  89. sub read_bits ($fh, $bits_len) {
  90. my $data = '';
  91. read($fh, $data, $bits_len >> 3);
  92. $data = unpack('B*', $data);
  93. while (length($data) < $bits_len) {
  94. $data .= unpack('B*', getc($fh) // return undef);
  95. }
  96. if (length($data) > $bits_len) {
  97. $data = substr($data, 0, $bits_len);
  98. }
  99. return $data;
  100. }
  101. sub delta_encode ($integers, $double = 0) {
  102. my @deltas;
  103. my $prev = 0;
  104. unshift(@$integers, scalar(@$integers));
  105. while (@$integers) {
  106. my $curr = shift(@$integers);
  107. push @deltas, $curr - $prev;
  108. $prev = $curr;
  109. }
  110. my $bitstring = '';
  111. foreach my $d (@deltas) {
  112. if ($d == 0) {
  113. $bitstring .= '0';
  114. }
  115. elsif ($double) {
  116. my $t = sprintf('%b', abs($d) + 1);
  117. my $l = sprintf('%b', length($t));
  118. $bitstring .= '1' . (($d < 0) ? '0' : '1') . ('1' x (length($l) - 1)) . '0' . substr($l, 1) . substr($t, 1);
  119. }
  120. else {
  121. my $t = sprintf('%b', abs($d));
  122. $bitstring .= '1' . (($d < 0) ? '0' : '1') . ('1' x (length($t) - 1)) . '0' . substr($t, 1);
  123. }
  124. }
  125. pack('B*', $bitstring);
  126. }
  127. sub delta_decode ($fh, $double = 0) {
  128. my @deltas;
  129. my $buffer = '';
  130. my $len = 0;
  131. for (my $k = 0 ; $k <= $len ; ++$k) {
  132. my $bit = read_bit($fh, \$buffer);
  133. if ($bit eq '0') {
  134. push @deltas, 0;
  135. }
  136. elsif ($double) {
  137. my $bit = read_bit($fh, \$buffer);
  138. my $bl = 0;
  139. ++$bl while (read_bit($fh, \$buffer) eq '1');
  140. my $bl2 = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $bl));
  141. my $int = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. ($bl2 - 1)));
  142. push @deltas, ($bit eq '1' ? 1 : -1) * ($int - 1);
  143. }
  144. else {
  145. my $bit = read_bit($fh, \$buffer);
  146. my $n = 0;
  147. ++$n while (read_bit($fh, \$buffer) eq '1');
  148. my $d = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $n));
  149. push @deltas, ($bit eq '1' ? $d : -$d);
  150. }
  151. if ($k == 0) {
  152. $len = pop(@deltas);
  153. }
  154. }
  155. my @acc;
  156. my $prev = $len;
  157. foreach my $d (@deltas) {
  158. $prev += $d;
  159. push @acc, $prev;
  160. }
  161. return \@acc;
  162. }
  163. sub lz77_compression ($str, $uncompressed, $indices, $lengths) {
  164. my $la = 0;
  165. my $prefix = '';
  166. my @chars = split(//, $str);
  167. my $end = $#chars;
  168. while ($la <= $end) {
  169. my $n = 1;
  170. my $p = 0;
  171. my $tmp;
  172. my $token = $chars[$la];
  173. while ( $n < 255
  174. and $la + $n <= $end
  175. and ($tmp = index($prefix, $token, $p)) >= 0) {
  176. $p = $tmp;
  177. $token .= $chars[$la + $n];
  178. ++$n;
  179. }
  180. --$n;
  181. push @$indices, $p;
  182. push @$lengths, $n;
  183. push @$uncompressed, $chars[$la + $n];
  184. $la += $n + 1;
  185. $prefix .= $token;
  186. }
  187. return;
  188. }
  189. sub lz77_decompression ($uncompressed, $indices, $lengths) {
  190. my $ret = '';
  191. my $chunk = '';
  192. foreach my $i (0 .. $#{$uncompressed}) {
  193. $chunk .= substr($chunk, $indices->[$i], $lengths->[$i]) . chr($uncompressed->[$i]);
  194. if (length($chunk) >= CHUNK_SIZE) {
  195. $ret .= $chunk;
  196. $chunk = '';
  197. }
  198. }
  199. if ($chunk ne '') {
  200. $ret .= $chunk;
  201. }
  202. $ret;
  203. }
  204. # produce encode and decode dictionary from a tree
  205. sub walk ($node, $code, $h, $rev_h) {
  206. my $c = $node->[0] // return ($h, $rev_h);
  207. if (ref $c) { walk($c->[$_], $code . $_, $h, $rev_h) for ('0', '1') }
  208. else { $h->{$c} = $code; $rev_h->{$code} = $c }
  209. return ($h, $rev_h);
  210. }
  211. # make a tree, and return resulting dictionaries
  212. sub mktree_from_freq ($freq) {
  213. my @nodes = map { [$_, $freq->{$_}] } sort { $a <=> $b } keys %$freq;
  214. do { # poor man's priority queue
  215. @nodes = sort { $a->[1] <=> $b->[1] } @nodes;
  216. my ($x, $y) = splice(@nodes, 0, 2);
  217. if (defined($x)) {
  218. if (defined($y)) {
  219. push @nodes, [[$x, $y], $x->[1] + $y->[1]];
  220. }
  221. else {
  222. push @nodes, [[$x], $x->[1]];
  223. }
  224. }
  225. } while (@nodes > 1);
  226. walk($nodes[0], '', {}, {});
  227. }
  228. sub huffman_encode ($bytes, $dict) {
  229. join('', @{$dict}{@$bytes});
  230. }
  231. sub huffman_decode ($bits, $hash) {
  232. local $" = '|';
  233. [split(' ', $bits =~ s/(@{[sort { length($a) <=> length($b) } keys %{$hash}]})/$hash->{$1} /gr)]; # very fast
  234. }
  235. sub create_huffman_entry ($bytes, $out_fh) {
  236. my %freq;
  237. ++$freq{$_} for @$bytes;
  238. my ($h, $rev_h) = mktree_from_freq(\%freq);
  239. my $enc = huffman_encode($bytes, $h);
  240. my $max_symbol = max(keys %freq) // 0;
  241. say "Max symbol: $max_symbol";
  242. my @freqs;
  243. foreach my $i (0 .. $max_symbol) {
  244. push @freqs, $freq{$i} // 0;
  245. }
  246. print $out_fh delta_encode(\@freqs);
  247. print $out_fh pack("N", length($enc));
  248. print $out_fh pack("B*", $enc);
  249. }
  250. sub decode_huffman_entry ($fh) {
  251. my @freqs = @{delta_decode($fh)};
  252. my %freq;
  253. foreach my $i (0 .. $#freqs) {
  254. if ($freqs[$i]) {
  255. $freq{$i} = $freqs[$i];
  256. }
  257. }
  258. my (undef, $rev_dict) = mktree_from_freq(\%freq);
  259. my $enc_len = unpack('N', join('', map { getc($fh) // die "error" } 1 .. 4));
  260. if ($enc_len > 0) {
  261. return huffman_decode(read_bits($fh, $enc_len), $rev_dict);
  262. }
  263. return [];
  264. }
  265. # Compress file
  266. sub compress_file ($input, $output) {
  267. open my $fh, '<:raw', $input
  268. or die "Can't open file <<$input>> for reading: $!";
  269. my $header = SIGNATURE;
  270. # Open the output file for writing
  271. open my $out_fh, '>:raw', $output
  272. or die "Can't open file <<$output>> for write: $!";
  273. # Print the header
  274. print $out_fh $header;
  275. my (@uncompressed, @indices, @lengths);
  276. # Compress data
  277. while (read($fh, (my $chunk), CHUNK_SIZE)) {
  278. lz77_compression($chunk, \@uncompressed, \@indices, \@lengths);
  279. }
  280. @indices = unpack('C*', pack('S*', @indices));
  281. @uncompressed = unpack('C*', join('', @uncompressed));
  282. create_huffman_entry(\@uncompressed, $out_fh);
  283. create_huffman_entry(\@indices, $out_fh);
  284. create_huffman_entry(\@lengths, $out_fh);
  285. # Close the file
  286. close $out_fh;
  287. }
  288. # Decompress file
  289. sub decompress_file ($input, $output) {
  290. # Open and validate the input file
  291. open my $fh, '<:raw', $input
  292. or die "Can't open file <<$input>> for reading: $!";
  293. valid_archive($fh) || die "$0: file `$input' is not a \U${\FORMAT}\E v${\VERSION} archive!\n";
  294. # Open the output file
  295. open my $out_fh, '>:raw', $output
  296. or die "Can't open file <<$output>> for writing: $!";
  297. my $uncompressed = decode_huffman_entry($fh);
  298. my @indices = unpack('S*', pack('C*', @{decode_huffman_entry($fh)}));
  299. my $lengths = decode_huffman_entry($fh);
  300. print $out_fh lz77_decompression($uncompressed, \@indices, $lengths);
  301. # Close the file
  302. close $fh;
  303. close $out_fh;
  304. }
  305. main();
  306. exit(0);