lzih_file_compression.sf 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372
  1. #!/usr/bin/ruby
  2. # Author: Trizen
  3. # Date: 15 December 2022
  4. # Edit: 10 September 2023
  5. # https://github.com/trizen
  6. # Compress/decompress files using LZ77 compression + fixed-width integers encoding + Huffman coding.
  7. define(
  8. CHUNK_SIZE = (1 << 16),
  9. COMPRESSED_BYTE = 1.chr,
  10. UNCOMPRESSED_BYTE = 0.chr,
  11. FORMAT = 'lzih',
  12. SIGNATURE = ('LZIH' + 4.chr),
  13. )
  14. func read_bits (FileHandle fh, Number bits_len) {
  15. var str = ''
  16. fh.read(\str, bits_len>>3)
  17. str = unpack('B*', str)
  18. while (str.len < bits_len) {
  19. str += unpack('B*', fh.getc \\ return nil)
  20. }
  21. if (str.len > bits_len) {
  22. str.substr!(0, bits_len)
  23. }
  24. return str
  25. }
  26. func read_bit (FileHandle fh, Ref bitstring) {
  27. if ((*bitstring \\ '').is_empty) {
  28. *bitstring = unpack('b*', fh.getc \\ die "error")
  29. }
  30. var bit = (*bitstring).substr(-1)
  31. *bitstring = (*bitstring).substr(0, -1)
  32. return bit
  33. }
  34. func delta_encode (Array integers) {
  35. var deltas = [0, integers.len, integers...].diffs
  36. var bitstring = FileHandle.new_buf(:raw)
  37. for d in (deltas) {
  38. if (d == 0) {
  39. bitstring << '0'
  40. }
  41. else {
  42. var t = d.abs.as_bin
  43. bitstring << join('', '1', ((d < 0) ? '0' : '1'), ('1' * (t.len-1)), '0', t.substr(1))
  44. }
  45. }
  46. pack('B*', bitstring.parent)
  47. }
  48. func delta_decode (FileHandle fh) {
  49. var deltas = []
  50. var buffer = ''
  51. var len = 0
  52. for (var k = 0 ; k <= len ; ++k) {
  53. var bit = read_bit(fh, \buffer)
  54. if (bit == '0') {
  55. deltas << 0
  56. }
  57. else {
  58. var bit = read_bit(fh, \buffer)
  59. var n = (^Inf -> first { read_bit(fh, \buffer) != '1' })
  60. var d = Num('1' + n.of { read_bit(fh, \buffer) }.join, 2)
  61. deltas << (bit == '1' ? d : -d)
  62. }
  63. if (k == 0) {
  64. len = deltas.pop
  65. }
  66. }
  67. var acc = [len, deltas...].acc
  68. acc.shift
  69. return acc
  70. }
  71. func encode_integers (Array integers) {
  72. var counts = []
  73. var count = 0
  74. var bits_per_symbol = 2
  75. var processed_len = 0
  76. for k in (integers) {
  77. while (k >= bits_per_symbol) {
  78. if (count > 0) {
  79. counts << [bits_per_symbol.len(2)-1, count]
  80. processed_len += count
  81. }
  82. count = 0
  83. bits_per_symbol *= 2
  84. }
  85. ++count
  86. }
  87. counts << [[bits_per_symbol.len(2)-1, integers.len - processed_len]].grep { .tail > 0 }...
  88. var header = delta_encode([counts.map{.head}..., counts.map{.tail}...]);
  89. var bits = []
  90. for b,len in (counts) {
  91. bits << integers.splice(0, len).map{ sprintf('%0*s', b, .as_bin) }...
  92. }
  93. header + pack('B*', bits.join)
  94. }
  95. func decode_integers (FileHandle str_fh) {
  96. var ints = delta_decode(str_fh)
  97. var half = (ints.len >> 1)
  98. var counts = []
  99. for i in (0 .. (half - 1)) {
  100. counts << [ints[$i], ints[half + i]]
  101. }
  102. var bits_len = 0
  103. for blen,len in (counts) {
  104. bits_len += (blen * len)
  105. }
  106. var chunks = []
  107. var bits = read_bits(str_fh, bits_len)
  108. var bits_fh = bits.open_r(:raw)
  109. for b,len in (counts) {
  110. bits_fh.read(\var chunk, len*b)
  111. chunks << chunk.slices(b)...
  112. }
  113. chunks.map { Num(_, 2) }
  114. }
  115. func walk(Hash n, String s, Hash h) {
  116. if (n.has(:a)) {
  117. h{n{:a}} = s
  118. return nil
  119. }
  120. walk(n{:0}, s+'0', h) if n.has(:0)
  121. walk(n{:1}, s+'1', h) if n.has(:1)
  122. }
  123. func mktree_from_freq(Hash freqs) {
  124. var nodes = freqs.sort_by {|k| k.to_i }.map_2d { |b,f|
  125. Hash(a => b, freq => f)
  126. }
  127. loop {
  128. nodes.sort_by!{|h| h{:freq} }
  129. var(x, y) = (nodes.shift, nodes.shift)
  130. if (defined(x)) {
  131. if (defined(y)) {
  132. nodes << Hash(:0 => x, :1 => y, :freq => x{:freq}+y{:freq})
  133. }
  134. else {
  135. nodes << Hash(:0 => x, :freq => x{:freq})
  136. }
  137. }
  138. nodes.len > 1 || break
  139. }
  140. var n = nodes.first
  141. walk(n, '', n{:tree} = Hash())
  142. return n
  143. }
  144. func huffman_encode(Array bytes, Hash t) {
  145. join('', t{bytes...})
  146. }
  147. func huffman_decode (String bits, Hash tree) {
  148. bits.gsub(Regex('(' + tree.keys.sort_by { .len }.join('|') + ')'), {|s| tree{s} })
  149. }
  150. func create_huffman_entry (Array bytes, FileHandle out_fh) {
  151. var freq = bytes.freq
  152. var h = mktree_from_freq(freq){:tree}
  153. var enc = huffman_encode(bytes, h)
  154. var max_symbol = (bytes.max \\ 0)
  155. var freqs = []
  156. for i in (0 .. max_symbol) {
  157. freqs << (freq{i} \\ 0)
  158. }
  159. out_fh << delta_encode(freqs)
  160. out_fh << pack("N", enc.len)
  161. out_fh << pack("B*", enc)
  162. }
  163. func decode_huffman_entry (FileHandle fh) {
  164. var freqs = delta_decode(fh)
  165. var freq = Hash()
  166. freqs.each_kv {|k,v|
  167. if (v != 0) {
  168. freq{k} = v
  169. }
  170. }
  171. var rev_dict = mktree_from_freq(freq){:tree}.flip
  172. for k in (rev_dict.keys) {
  173. rev_dict{k} = "#{rev_dict{k}} "
  174. }
  175. var enc_len = unpack('N', 4.of { fh.getc }.join).to_i
  176. if (enc_len > 0) {
  177. return huffman_decode(read_bits(fh, enc_len), rev_dict).nums
  178. }
  179. return []
  180. }
  181. func lz77_compression (String str, Array uncompressed, Array indices, Array lengths) {
  182. var la = 0
  183. var prefix = ''
  184. var chars = str.chars
  185. var end = chars.end
  186. while (la <= end) {
  187. var n = 1
  188. var p = prefix.len
  189. var tmp = nil
  190. var token = chars[la]
  191. while ((n < 255) && (la+n <= end) && ((tmp = prefix.rindex(token, p)) >= 0)) {
  192. p = tmp
  193. token += chars[la + n]
  194. ++n
  195. }
  196. --n
  197. indices << la-p
  198. lengths << n
  199. uncompressed << chars[la+n].ord
  200. la += n+1
  201. prefix += token
  202. }
  203. return nil
  204. }
  205. func lz77_decompression (Array uncompressed, Array indices, Array lengths) {
  206. var (fh, str) = FileHandle.new_buf(:raw)
  207. var offset = 0
  208. [uncompressed, indices, lengths].zip {|c,i,l|
  209. fh << (str.substr(offset - i, l), c.chr)
  210. offset += (l + 1)
  211. }
  212. return str
  213. }
  214. # Compress file
  215. func compress_file(File input, File output) {
  216. var in_fh = input.open('<:raw') || die "Can't open file <<#{input}>> for reading"
  217. var out_fh = output.open('>:raw') || die "Can't open file <<#{output}>> for writing"
  218. var header = SIGNATURE
  219. # Print the header
  220. out_fh.print(header)
  221. # Compress data
  222. while (in_fh.read(\var chunk, CHUNK_SIZE)) {
  223. var (uncompressed, indices, lengths) = ([], [], [])
  224. lz77_compression(chunk, uncompressed, indices, lengths)
  225. var est_ratio = (chunk.len / (4 * uncompressed.len))
  226. say (uncompressed.len, " -> ", est_ratio)
  227. if (est_ratio > 0.85) {
  228. out_fh.print(COMPRESSED_BYTE)
  229. create_huffman_entry(uncompressed, out_fh)
  230. create_huffman_entry(lengths, out_fh)
  231. out_fh.print(encode_integers(indices))
  232. }
  233. else {
  234. out_fh.print(UNCOMPRESSED_BYTE)
  235. create_huffman_entry(chunk.bytes, out_fh)
  236. }
  237. }
  238. # Close the files
  239. in_fh.close
  240. out_fh.close
  241. }
  242. # Decompress file
  243. func decompress_file(File input, File output) {
  244. var in_fh = input.open('<:raw') || die "Can't open file <<#{input}>> for reading"
  245. if (SIGNATURE.len.of { in_fh.getc }.join != SIGNATURE) {
  246. die "Not a LZIH archive!\n"
  247. }
  248. var out_fh = output.open('>:raw') || die "Can't open file <<#{output}>> for writing"
  249. while (!in_fh.eof) {
  250. var compression_byte = in_fh.getc
  251. if (compression_byte == COMPRESSED_BYTE) {
  252. var uncompressed = decode_huffman_entry(in_fh)
  253. var lengths = decode_huffman_entry(in_fh)
  254. var indices = decode_integers(in_fh)
  255. out_fh.print(lz77_decompression(uncompressed, indices, lengths))
  256. }
  257. elsif (compression_byte == UNCOMPRESSED_BYTE) {
  258. out_fh.print(decode_huffman_entry(in_fh).pack('C*'))
  259. }
  260. else {
  261. die "Invalid compression..."
  262. }
  263. }
  264. in_fh.close
  265. out_fh.close
  266. }
  267. ARGV.getopt!('d', \var decode)
  268. var file = File(ARGV.shift) || do {
  269. say "usage: #{File(__MAIN__).basename} [-d] [input file]"
  270. Sys.exit(2)
  271. }
  272. if (decode || file.match(Regex('\.' + FORMAT + '\.enc\z'))) {
  273. decompress_file(file, File("output.#{FORMAT}.dec"))
  274. }
  275. else {
  276. compress_file(file, File("output.#{FORMAT}.enc"))
  277. }