huffman_prime_encoding.sf 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269
  1. #!/usr/bin/ruby
  2. # Compress/decompress the prime numbers in a given range, using Huffman coding.
  3. # Arbitrary large ranges are supported.
  4. # See also:
  5. # Efficient storage of all primes up to some n
  6. # https://www.mersenneforum.org/showthread.php?t=24417
  7. # One half of largest prime gap up to 10^n.
  8. # https://oeis.org/A213949
  9. define(
  10. SIGNATURE = ('PHFM' + 1.chr),
  11. )
  12. func read_bit (FileHandle fh, Ref bitstring) {
  13. if ((*bitstring \\ '').is_empty) {
  14. *bitstring = unpack('b*', fh.getc \\ die "error")
  15. }
  16. var bit = (*bitstring).substr(-1)
  17. *bitstring = (*bitstring).substr(0, -1)
  18. return bit
  19. }
  20. func read_bits (FileHandle fh, Number bits_len) {
  21. var str = ''
  22. fh.read(\str, bits_len>>3)
  23. str = unpack('B*', str)
  24. while (str.len < bits_len) {
  25. str += unpack('B*', fh.getc \\ die "error")
  26. }
  27. if (str.len > bits_len) {
  28. str.substr!(0, bits_len)
  29. }
  30. return str
  31. }
  32. func delta_encode (Array integers, Bool double = false) {
  33. var deltas = [0, integers.len, integers...].diffs
  34. var bitstring = FileHandle.new_buf(:raw)
  35. for d in (deltas) {
  36. if (d == 0) {
  37. bitstring << '0'
  38. }
  39. elsif (double) {
  40. var t = d.abs.inc.as_bin
  41. var l = t.len.as_bin
  42. bitstring << join('', '1', ((d < 0) ? '0' : '1'), ('1' * (l.len-1)), '0', l.substr(1), t.substr(1))
  43. }
  44. else {
  45. var t = d.abs.as_bin
  46. bitstring << join('', '1', ((d < 0) ? '0' : '1'), ('1' * (t.len-1)), '0', t.substr(1))
  47. }
  48. }
  49. pack('B*', bitstring.parent)
  50. }
  51. func delta_decode (FileHandle fh, Bool double = false) {
  52. var deltas = []
  53. var buffer = ''
  54. var len = 0
  55. for (var k = 0 ; k <= len ; ++k) {
  56. var bit = read_bit(fh, \buffer)
  57. if (bit == '0') {
  58. deltas << 0
  59. }
  60. elsif (double) {
  61. var bit = read_bit(fh, \buffer)
  62. var bl = (^Inf -> first { read_bit(fh, \buffer) != '1' })
  63. var bl2 = Num('1' + bl.of { read_bit(fh, \buffer) }.join, 2)
  64. var int = Num('1' + (bl2-1).of { read_bit(fh, \buffer) }.join, 2)-1
  65. deltas << (bit == '1' ? int : -int)
  66. }
  67. else {
  68. var bit = read_bit(fh, \buffer)
  69. var n = (^Inf -> first { read_bit(fh, \buffer) != '1' })
  70. var d = Num('1' + n.of { read_bit(fh, \buffer) }.join, 2)
  71. deltas << (bit == '1' ? d : -d)
  72. }
  73. if (k == 0) {
  74. len = deltas.pop
  75. }
  76. }
  77. var acc = [len, deltas...].acc
  78. acc.shift
  79. return acc
  80. }
  81. func walk(Hash n, String s, Hash h) {
  82. if (n.has(:a)) {
  83. h{n{:a}} = s
  84. return nil
  85. }
  86. walk(n{:0}, s+'0', h) if n.has(:0)
  87. walk(n{:1}, s+'1', h) if n.has(:1)
  88. }
  89. func mktree_from_freq(Hash freqs) {
  90. var nodes = freqs.sort_by {|k| k.to_i }.map_2d { |b,f|
  91. Hash(a => b, freq => f)
  92. }
  93. loop {
  94. nodes.sort_by!{|h| h{:freq} }
  95. var(x, y) = (nodes.shift, nodes.shift)
  96. if (defined(x)) {
  97. if (defined(y)) {
  98. nodes << Hash(:0 => x, :1 => y, :freq => x{:freq}+y{:freq})
  99. }
  100. else {
  101. nodes << Hash(:0 => x, :freq => x{:freq})
  102. }
  103. }
  104. nodes.len > 1 || break
  105. }
  106. var n = nodes.first
  107. walk(n, '', n{:tree} = Hash())
  108. return n
  109. }
  110. func huffman_encode(Array bytes, Hash t) {
  111. join('', t{bytes...})
  112. }
  113. func huffman_decode (String bits, Hash tree) {
  114. bits.gsub(Regex('(' + tree.keys.sort_by { .len }.join('|') + ')'), {|s| tree{s} })
  115. }
  116. func create_huffman_entry (Array bytes, FileHandle out_fh) {
  117. var freq = bytes.freq
  118. var h = mktree_from_freq(freq){:tree}
  119. var enc = huffman_encode(bytes, h)
  120. var max_symbol = (bytes.max \\ 0)
  121. say "Max symbol: #{max_symbol}";
  122. var freqs = []
  123. for i in (0 .. max_symbol) {
  124. freqs << (freq{i} \\ 0)
  125. }
  126. out_fh << delta_encode(freqs)
  127. out_fh << pack("N", enc.len)
  128. out_fh << pack("B*", enc)
  129. }
  130. func decode_huffman_entry (FileHandle fh) {
  131. var freqs = delta_decode(fh)
  132. var freq = Hash()
  133. freqs.each_kv {|k,v|
  134. if (v != 0) {
  135. freq{k} = v
  136. }
  137. }
  138. var rev_dict = mktree_from_freq(freq){:tree}.flip
  139. for k in (rev_dict.keys) {
  140. rev_dict{k} = "#{rev_dict{k}} "
  141. }
  142. var enc_len = unpack('N', 4.of { fh.getc }.join).to_i
  143. say "Encoded length: #{enc_len}"
  144. if (enc_len > 0) {
  145. return huffman_decode(read_bits(fh, enc_len), rev_dict).nums
  146. }
  147. return []
  148. }
  149. # Compress file
  150. func huffman_compress_primes(from, to, output) {
  151. var out_fh = output.open('>:raw') || die "Can't open file <<#{output}>> for writing"
  152. var header = SIGNATURE
  153. out_fh << header
  154. out_fh << delta_encode([from])
  155. if (from == 2) {
  156. from = 3
  157. }
  158. var diffs = primes(from, to).diffs.map{ _ >> 1 }
  159. create_huffman_entry(diffs, out_fh)
  160. # Close the files
  161. out_fh.close
  162. }
  163. # Decompress file
  164. func huffman_decompress_primes(File input, File output) {
  165. var in_fh = input.open('<:raw') || die "Can't open file <<#{input}>> for reading"
  166. if (SIGNATURE.len.of { in_fh.getc }.join != SIGNATURE) {
  167. die "Not a HFM archive!\n"
  168. }
  169. var out_fh = output.open('>:raw') || die "Can't open file <<#{output}>> for writing"
  170. var from = delta_decode(in_fh)[0]
  171. var diffs = decode_huffman_entry(in_fh)
  172. say "Initial prime is: #{from}"
  173. say "Number of primes: #{diffs.len + 1 + (from == 2 ? 1 : 0)}"
  174. if (from == 2) {
  175. out_fh.say(2)
  176. from = 3
  177. }
  178. out_fh.say(diffs.map{_ << 1}.unshift(from).acc.join("\n"))
  179. in_fh.close
  180. out_fh.close
  181. }
  182. ARGV.getopt!('d=s', \var decode)
  183. if (decode) {
  184. huffman_decompress_primes(File(decode), File("output.phfm.dec"))
  185. Sys.exit
  186. }
  187. var from = Number(ARGV.shift) || do {
  188. say "usage: #{File(__MAIN__).basename} [-d file] | [from] [upto]"
  189. Sys.exit(2)
  190. }
  191. var upto = Number(ARGV.shift)
  192. if (!upto) {
  193. upto = from
  194. from = 2
  195. }
  196. from = next_prime(from-1)
  197. upto = prev_prime(upto+1)
  198. say "Encoding primes in the range: [#{from}, #{upto}]"
  199. huffman_compress_primes(from, upto, File("output.phfm.enc"))