lz77.hpp 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
  1. #ifndef SIMPLE_COMPRESS_LZ77_HPP
  2. #define SIMPLE_COMPRESS_LZ77_HPP
  3. #include <type_traits> // std::enable_if_t std::is_same_v
  4. #include <utility> // std::pair
  5. #include <iterator> // std::iterator_traits
  6. #include <cstddef> // std::nullptr_t std::byte
  7. #include <cstring> // std::memcpy
  8. #include <cstdint> // std::uint16_t
  9. #include <limits> // std::numeric_limits
  10. #include "simple/support/algorithm/traits.hpp" // support::is_iterable_v
  11. #include "simple/support/algorithm/utils.hpp" // support::distance
  12. #include "simple/support/algorithm/search.hpp" // support::search_prefix
  13. #include "simple/support/algorithm/mismatch.hpp" // support::mismatch
  14. #include "simple/support/algorithm/copy.hpp" // support::copy_n
  15. #include "simple/support/range.hpp" // support::range
  16. #include "simple/support/type_traits.hpp" // support::is_template_instance_v
  17. #include "bits.hpp" // read_bits
  18. #include "hash_table.hpp" // hash_table
  19. namespace simple::compress
  20. {
  21. template <typename T, typename It>
  22. [[nodiscard]] constexpr T from_bytes(It i)
  23. {
  24. using value_type = typename std::iterator_traits<It>::value_type;
  25. static_assert(sizeof(value_type) <= sizeof(T));
  26. auto count = sizeof(T)/sizeof(value_type);
  27. T result{};
  28. auto dest = reinterpret_cast<std::byte*>(&result);
  29. while(count --> 0)
  30. {
  31. auto&& value = *i;
  32. std::memcpy(dest, &value, sizeof(value_type));
  33. ++i; dest += sizeof(value_type);
  34. }
  35. return result;
  36. }
  37. template <typename It, typename End, typename Map = std::nullptr_t,
  38. std::enable_if_t<support::is_iterable_v<It>>* = nullptr>
  39. [[nodiscard]] constexpr
  40. std::pair<support::range<It>, It> lz77_lookup(It dict, It begin, End end, const Map& map = nullptr)
  41. {
  42. if constexpr (std::is_same_v<Map, std::nullptr_t>)
  43. {
  44. auto match = support::search_prefix(dict, end, begin, end);
  45. auto best_match = match;
  46. while(match.first.end() < begin)
  47. {
  48. match = support::search_prefix(match.first.end(), end, begin, end);
  49. if(support::distance(best_match.first) < support::distance(match.first))
  50. best_match = match;
  51. }
  52. return best_match;
  53. }
  54. else
  55. {
  56. std::pair best_match{support::range{begin,begin}, begin}; // hmm
  57. using key_t = typename Map::key_type;
  58. int lookahead = sizeof(key_t);
  59. if((end-begin) >= lookahead)
  60. {
  61. auto candidates = map.equal_range(from_bytes<key_t>(begin));
  62. for(auto& p : candidates)
  63. {
  64. if(p < dict)
  65. continue;
  66. auto mismatch = support::mismatch(p, end, begin, end);
  67. auto match = std::pair{support::range{p,mismatch.first}, mismatch.second};
  68. if(support::distance(best_match.first) < support::distance(match.first))
  69. best_match = match;
  70. }
  71. }
  72. return best_match;
  73. }
  74. }
  75. template <
  76. // in perfect world this can be derived from difference_type of the iterator, which for data of compile time size/capacity should be a range constrained integer (see boost::safe_numerics)
  77. typename ref_offset_t = std::uint16_t,
  78. typename I = void, typename O = void,
  79. typename hash_table = hash_table<unsigned, I, (1<<16)>>
  80. // TODO enable if trivially copyable I::value_type and hash_table<key_type> or something
  81. // TODO: code path without the hash table
  82. constexpr O lz77_encode(I begin, I end, O o, hash_table&& map = hash_table{}) // TODO: out_end
  83. {
  84. constexpr ref_offset_t ref_offset_max = [](){
  85. auto bits = get_bits(ref_offset_t{});
  86. // do this step by step to prevent unsigned short from freaking exploding into an int TODO: safe unsigned short
  87. bits = ~bits;
  88. bits >>= std::numeric_limits<decltype(bits)>::digits - bit_count(ref_offset_t{});
  89. return bits;
  90. }();
  91. using key_type = typename hash_table::key_type;
  92. constexpr int lookback = sizeof(key_type) -1;
  93. #if defined SIMPLE_SUPPORT_DEBUG_HPP
  94. auto pr = 0;
  95. auto refs = 0;
  96. #endif
  97. // TODO o_end
  98. auto i = begin;
  99. while (i != end)
  100. {
  101. #if defined SIMPLE_SUPPORT_DEBUG_HPP
  102. typename hash_table::stats_t ms{};
  103. if(++pr == (end-begin)/1000)
  104. {
  105. ms = map.stats();
  106. support::print("i: ", i - begin, " p: ", float(i-begin) / (end - begin), " r: ", refs, " ms: ", ms.total_elements, " mb: ", ms.non_empty_rows, " mx: ", ms.max_row_size, " ", '\r');
  107. pr = 0;
  108. }
  109. #endif
  110. auto window = i - begin;
  111. auto dict = window > ref_offset_max ? i - ref_offset_max : begin;
  112. auto match = lz77_lookup(dict, i, end, map);
  113. auto match_size = support::distance(match.first);
  114. if(match_size > 8) // FIXME: parameterize
  115. {
  116. auto pattern_size = i - match.first.begin();
  117. ref_offset_t encodable_pattern_chunk = ref_offset_max / pattern_size * pattern_size;
  118. while(match_size > ref_offset_max)
  119. {
  120. #if defined SIMPLE_SUPPORT_DEBUG_HPP
  121. ++ refs;
  122. #endif
  123. *o = true; ++o;
  124. support::range offset = match.first - dict;
  125. *o = (ref_offset_t)offset.lower(); ++o;
  126. *o = encodable_pattern_chunk; ++o;
  127. match.first.lower() += encodable_pattern_chunk;
  128. match_size -= encodable_pattern_chunk;
  129. dict += encodable_pattern_chunk;
  130. }
  131. #if defined SIMPLE_SUPPORT_DEBUG_HPP
  132. ++ refs;
  133. #endif
  134. *o = true; ++o;
  135. support::range offset = match.first - dict;
  136. *o = (ref_offset_t)offset.lower(); ++o;
  137. *o = (ref_offset_t)(offset.upper() - offset.lower()); ++o;
  138. if((i - begin) >= lookback)
  139. {
  140. key_type key = from_bytes<key_type>(i-lookback);
  141. map.insert(key, i - lookback);
  142. ++i;
  143. auto prev_key{key};
  144. while(i != match.second)
  145. {
  146. key = from_bytes<key_type>(i-lookback);
  147. if(key != prev_key)
  148. map.insert(key, i-lookback);
  149. prev_key = key;
  150. ++i;
  151. }
  152. } else i = match.second;
  153. }
  154. else
  155. {
  156. // write a sym
  157. *o = false; ++o;
  158. *o = *i; ++o;
  159. if((i - begin) >= lookback)
  160. map.insert(from_bytes<key_type>(i - lookback), i - lookback);
  161. ++i;
  162. }
  163. if(map.size() > ref_offset_max*3) // FIXME: magic condition
  164. map.erase_if([dict](auto&& x) {return x < dict;});
  165. }
  166. return o;
  167. };
  168. template <typename ref_offset_t = std::uint16_t, typename I = void, typename O = void,
  169. std::enable_if_t<support::is_template_instance_v<bit_iterator, I>>* = nullptr
  170. >
  171. constexpr auto lz77_decode(I i, O out_begin, O out_end)
  172. {
  173. // FIXME; de-duplicate
  174. constexpr ref_offset_t ref_offset_max = [](){
  175. auto bits = get_bits(ref_offset_t{});
  176. // do this step by step to prevent unsigned short to freaking exploding into an int TODO: safe unsigned short
  177. bits = ~bits;
  178. bits >>= std::numeric_limits<decltype(bits)>::digits - bit_count(ref_offset_t{});
  179. return bits;
  180. }();
  181. auto o = out_begin;
  182. while(o != out_end)
  183. {
  184. bool type{};
  185. i = read_bits(i,type);
  186. if(type)
  187. {
  188. ref_offset_t position{};
  189. ref_offset_t size{};
  190. i = read_bits(i,position);
  191. i = read_bits(i,size);
  192. auto decoded = o - out_begin;
  193. auto dict = decoded > ref_offset_max ? o - ref_offset_max : out_begin;
  194. assert(size <= (out_end - o));
  195. o = support::copy_n(dict+position, size, o).second;
  196. }
  197. else
  198. {
  199. typename std::iterator_traits<O>::value_type c{};
  200. i = read_bits(i,c);
  201. *o = c;
  202. ++o;
  203. }
  204. }
  205. return i;
  206. }
  207. template <typename ref_offset_t = std::uint16_t, typename I = void, typename O = void,
  208. std::enable_if_t<not support::is_template_instance_v<bit_iterator, I>>* = nullptr
  209. >
  210. constexpr auto lz77_decode(I i, O out_begin, O out_end)
  211. {
  212. return lz77_decode<ref_offset_t>(bit_iterator{i,0}, out_begin, out_end);
  213. }
  214. } // namespace simple::compress
  215. #endif /* end of include guard */