huffman.hpp 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263
  1. #ifndef SIMPLE_COMPRESS_HUFFMAN_HPP
  2. #define SIMPLE_COMPRESS_HUFFMAN_HPP
  3. #include <iterator> // std::iterator_traits
  4. #include <cstddef> // std::size_t
  5. #include <array> // std::array
  6. #include <vector> // std::vector
  7. #include <utility> // std::pair std::move
  8. #include <limits> // std::numeric_limits
  9. #include <type_traits> // std::enable_if_t std::underlying_type_t std::make_unsigned_t std::is_unsigned_v std::conditional_t std::is_integral_v
  10. #include <cassert> // assert
  11. #include <algorithm> // std::transform std::swap
  12. #include "simple/support/type_traits.hpp" // support::is_template_instance_v
  13. #include "hash_table.hpp" // hash_table
  14. #include "bits.hpp" // bit_count bit_offset get_bits bits read_bits
  15. namespace simple::compress
  16. {
  17. // oof, so painful we still don't have this in std, especially with all the junk that went through over the years
  18. template <typename T, std::size_t Capacity>
  19. class static_vector
  20. {
  21. std::array<T, Capacity> array;
  22. typename std::array<T, Capacity>::iterator next;
  23. public:
  24. constexpr static_vector() : array(), next(array.begin()) {};
  25. constexpr auto begin() { return array.begin(); }
  26. constexpr auto begin() const { return array.begin(); }
  27. constexpr auto end() { return next; }
  28. constexpr auto end() const { return next; }
  29. constexpr bool empty() const
  30. {
  31. return next == array.begin();
  32. }
  33. constexpr bool full() const
  34. {
  35. return next == array.end();
  36. }
  37. constexpr void push_back(T element)
  38. {
  39. *next++ = std::move(element);
  40. }
  41. constexpr void pop_back() { --next; }
  42. constexpr auto& back() { return *next; }
  43. constexpr auto& back() const { return *next; }
  44. };
  45. template <typename SmallKey, typename Value, typename Enabled = void>
  46. class small_table;
  47. template <typename SmallKey, typename Value>
  48. class small_table<SmallKey, Value, std::enable_if_t<bit_count(SmallKey{}) <= 13 && std::is_unsigned_v<SmallKey>>>
  49. {
  50. std::array<Value, 1 << bit_count(SmallKey{})> table{};
  51. static constexpr auto get_bit_value(const SmallKey& key)
  52. {
  53. constexpr auto bc = bit_count(SmallKey{});
  54. constexpr auto bo = bit_offset(SmallKey{});
  55. auto value = get_bits(key);
  56. constexpr auto tc = std::numeric_limits<decltype(value)>::digits;
  57. value >>= tc - (bo + bc);
  58. return value;
  59. }
  60. template <typename Self>
  61. static constexpr auto& get(Self& self, const SmallKey& key)
  62. {
  63. auto offset = get_bit_value(key);
  64. assert(offset < self.table.size());
  65. return self.table[offset];
  66. }
  67. public:
  68. using key_type = SmallKey;
  69. using value_type = Value;
  70. constexpr Value& operator[](const SmallKey& key)
  71. { return get(*this, key); }
  72. constexpr const Value& operator[](const SmallKey& key) const
  73. { return get(*this, key); }
  74. template <typename F>
  75. constexpr void for_each(F&& f) const
  76. {
  77. constexpr auto bc = bit_count(SmallKey{});
  78. constexpr auto bo = bit_offset(SmallKey{});
  79. using key_value = decltype(get_bits(SmallKey{}));
  80. constexpr auto tc = std::numeric_limits<key_value>::digits;
  81. for(std::size_t i = 0; i != table.size(); ++i)
  82. {
  83. auto key = static_cast<key_value>(i);
  84. key <<= tc - bc - bo;
  85. f(std::pair{static_cast<SmallKey>(key), table[i]});
  86. }
  87. }
  88. template <typename F>
  89. // TODO: oof, need proper iterators, also this is used to read codes, and that's slow, prolly better walk the tree bit by bit
  90. constexpr void find_if(F&& f) const
  91. {
  92. constexpr auto bc = bit_count(SmallKey{});
  93. constexpr auto bo = bit_offset(SmallKey{});
  94. using key_value = decltype(get_bits(SmallKey{}));
  95. constexpr auto tc = std::numeric_limits<key_value>::digits;
  96. for(std::size_t i = 0; i != table.size(); ++i)
  97. {
  98. auto key = static_cast<key_value>(i);
  99. key <<= tc - bc - bo;
  100. if(f(std::pair{static_cast<SmallKey>(key), table[i]}))
  101. break;
  102. }
  103. }
  104. };
  105. template <typename BigKey, typename Value>
  106. class small_table<BigKey, Value, std::enable_if_t<(bit_count(BigKey{}) > 13)>>
  107. {
  108. hash_table<BigKey, std::pair<BigKey, Value>, (1<<16)> table;
  109. // TODO
  110. };
  111. template <typename T, typename Enabled = void>
  112. struct underlying_type;
  113. template <typename T>
  114. struct underlying_type<T, std::enable_if_t<std::is_integral_v<T>>>
  115. { using type = T; };
  116. template <typename T>
  117. struct underlying_type<T, std::enable_if_t<not std::is_integral_v<T>>>
  118. { using type = std::underlying_type_t<T>; };
  119. template <typename T>
  120. using underlying_type_t = typename underlying_type<T>::type;
  121. template <typename It>
  122. [[nodiscard]] constexpr auto huffman_code(It begin, It end)
  123. {
  124. using key_type = std::make_unsigned_t<underlying_type_t<typename std::iterator_traits<It>::value_type>>;
  125. small_table<key_type, std::size_t> counter{};
  126. small_table<key_type, bits<>> code{}; // TODO smaller key -> less bits
  127. std::conditional_t<bit_count(key_type{}) <= 13,
  128. static_vector<std::pair<key_type,key_type>, (1 << bit_count(key_type{}))>,
  129. std::vector<std::pair<key_type,key_type>>
  130. > hierarchy{};
  131. for(auto i = begin; i != end; ++i)
  132. ++counter[static_cast<key_type>(*i)];
  133. std::array<std::pair<key_type, std::size_t>, 2> minmin;
  134. while(true)
  135. {
  136. minmin = decltype(minmin){};
  137. // NOTE: this could be partial_sort_copy(no_zeros(counter), minmin, second_less), but can't be bothered to write the necessary iterators atm
  138. counter.for_each([&minmin](auto kv)
  139. {
  140. // filter
  141. if(kv.second != 0)
  142. {
  143. // find a smaller value
  144. if(minmin[1].second == 0 || kv.second < minmin[1].second)
  145. {
  146. minmin[1] = kv;
  147. // keep em sorted
  148. if(minmin[0].second == 0 || minmin[1].second < minmin[0].second)
  149. std::swap(minmin[0], minmin[1]);
  150. }
  151. }
  152. });
  153. if(0 == minmin[1].second)
  154. break;
  155. // FIXME handle code length overflow
  156. code[minmin[0].first].insert(0);
  157. code[minmin[1].first].insert(1);
  158. for(auto& [symbol, parent] : hierarchy)
  159. {
  160. if(parent == minmin[0].first)
  161. {
  162. code[symbol].insert(0);
  163. parent = minmin[0].first;
  164. }
  165. if(parent == minmin[1].first)
  166. {
  167. code[symbol].insert(1);
  168. parent = minmin[0].first;
  169. }
  170. }
  171. counter[minmin[0].first] += counter[minmin[1].first];
  172. counter[minmin[1].first] = 0;
  173. hierarchy.push_back({minmin[1].first, minmin[0].first});
  174. }
  175. // special case: there is only one symbol
  176. if(0 != minmin[0].second && hierarchy.empty())
  177. code[minmin[0].first].insert(0);
  178. return code;
  179. }
  180. template <typename It, typename Out, typename Code>
  181. constexpr auto huffman_encode(const Code& code, It begin, It end, Out out)
  182. {
  183. return std::transform(begin, end, std::move(out),
  184. [&code](auto&& x) { return code[static_cast<typename Code::key_type>(x)]; });
  185. }
  186. template <typename Code, typename I, typename O,
  187. std::enable_if_t<support::is_template_instance_v<bit_iterator, I>>* = nullptr
  188. >
  189. constexpr auto huffman_decode(const Code& code, I i, O out, O out_end)
  190. {
  191. using out_value = typename std::iterator_traits<O>::value_type;
  192. while(out != out_end)
  193. {
  194. code.find_if([&i, &out](auto&& kv)
  195. {
  196. if(bit_count(kv.second) != 0)
  197. {
  198. auto read = kv.second;
  199. auto next = read_bits(i, read);
  200. if(read == kv.second)
  201. {
  202. *out = static_cast<out_value>(kv.first);
  203. i = next;
  204. return true;
  205. }
  206. }
  207. return false;
  208. });
  209. ++out;
  210. }
  211. return i;
  212. }
  213. template <typename Code, typename I, typename O,
  214. std::enable_if_t<not support::is_template_instance_v<bit_iterator, I>>* = nullptr
  215. >
  216. constexpr auto huffman_decode(const Code& code, I i, O out_begin, O out_end)
  217. {
  218. return huffman_decode(code, bit_iterator{i,0}, out_begin, out_end);
  219. }
  220. } // namespace simple::compress
  221. #endif /* end of include guard */