arithmetic_coding_integer.sf 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152
  1. #!/usr/bin/ruby
  2. # Author: Trizen
  3. # Date: 04 August 2023
  4. # https://github.com/trizen
  5. # Arithmetic coding, implemented using big integers.
  6. # See also:
  7. # https://en.wikipedia.org/wiki/Arithmetic_coding#Arithmetic_coding_as_a_generalized_change_of_radix
  8. func cumulative_freq (freq) {
  9. var cf = Hash()
  10. var total = 0
  11. freq.keys.sort_by { Num(_) }.each {|c|
  12. cf{c} = total
  13. total += freq{c}
  14. }
  15. return cf
  16. }
  17. func ac_encode (symbols) {
  18. # The frequency characters
  19. var freq = symbols.freq
  20. # Create the cumulative frequency table
  21. var cf = cumulative_freq(freq)
  22. # Limit and base
  23. var base = symbols.len
  24. # Lower bound
  25. var L = 0
  26. # Product of all frequencies
  27. var pf = 1
  28. # Each term is multiplied by the product of the
  29. # frequencies of all previously occurring symbols
  30. for c in (symbols) {
  31. L *= base
  32. L += pf*cf{c}
  33. pf *= freq{c}
  34. }
  35. # Upper bound
  36. var U = (L + pf)
  37. # Compute the power for left shift
  38. var pow = pf.ilog2
  39. # Set enc to (U-1) divided by 2^pow
  40. var enc = ((U - 1) >> pow)
  41. # Remove any divisibility by 2
  42. if (enc>0 && enc.is_even) {
  43. var v = enc.valuation(2)
  44. pow += v
  45. enc >>= v
  46. }
  47. var bin = enc.base(2)
  48. return (bin, pow, freq)
  49. }
  50. func ac_decode (bits, pow, freq) {
  51. # Decode the bits into an integer
  52. var enc = Num(bits, 2)<<pow
  53. # Base value == length of decoded input
  54. var base = freq.values.sum
  55. if (base == 0) {
  56. return []
  57. }
  58. elsif (base == 1) {
  59. return freq.keys
  60. }
  61. # Create the cumulative frequency table
  62. var cf = cumulative_freq(freq)
  63. # Create the dictionary
  64. var dict = Hash()
  65. cf.each {|k,v|
  66. dict{v} = k
  67. }
  68. # Fill the gaps in the dictionary
  69. var lchar = nil
  70. for i in (^base) {
  71. if (dict.has(i)) {
  72. lchar = dict{i}
  73. }
  74. elsif (defined(lchar)) {
  75. dict{i} = lchar
  76. }
  77. }
  78. var dec = []
  79. # Decode the input number
  80. for (var pow = base**(base - 1); pow > 0; pow.idiv!(base)) {
  81. var div = idiv(enc, pow)
  82. var c = dict{div}
  83. var fv = freq{c}
  84. var cv = cf{c}
  85. enc -= pow*cv
  86. enc.idiv!(fv)
  87. dec << c
  88. }
  89. return dec
  90. }
  91. #
  92. ## Run some tests
  93. #
  94. for str in ([
  95. '', 'a', 'this is a message for you to encode and to decode correctly!',
  96. ['a'..'z', 0..9, 'A'..'Z', 0..9].map{.join}.join,
  97. %w(DABDDB DABDDBBDDBA ABBDDD ABRACADABRA CoMpReSSeD Sidef Trizen google TOBEORNOTTOBEORTOBEORNOT)...,
  98. 'In a positional numeral system the radix, or base, is numerically equal to a number of different symbols '+
  99. 'used to express the number. For example, in the decimal system the number of symbols is 10, namely 0, 1, 2, '+
  100. '3, 4, 5, 6, 7, 8, and 9. The radix is used to express any finite integer in a presumed multiplier in polynomial '+
  101. 'form. For example, the number 457 is actually 4×102 + 5×101 + 7×100, where base 10 is presumed but not shown explicitly.'
  102. ]) {
  103. var bytes = str.encode(:utf8).bytes
  104. var (enc, pow, freq) = ac_encode(bytes)
  105. var dec_bytes = ac_decode(enc, pow, freq)
  106. var dec = dec_bytes.decode(:utf8)
  107. say "Encoded: #{enc}"
  108. say "Decoded: #{dec}"
  109. if (str != dec) {
  110. die "\tHowever that is incorrect!"
  111. }
  112. say ("-" * 80)
  113. }