123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243 |
- #ifndef SIMPLE_COMPRESS_LZ77_HPP
- #define SIMPLE_COMPRESS_LZ77_HPP
- #include <type_traits> // std::enable_if_t std::is_same_v
- #include <utility> // std::pair
- #include <iterator> // std::iterator_traits
- #include <cstddef> // std::nullptr_t std::byte
- #include <cstring> // std::memcpy
- #include <cstdint> // std::uint16_t
- #include <limits> // std::numeric_limits
- #include "simple/support/algorithm/traits.hpp" // support::is_iterable_v
- #include "simple/support/algorithm/utils.hpp" // support::distance
- #include "simple/support/algorithm/search.hpp" // support::search_prefix
- #include "simple/support/algorithm/mismatch.hpp" // support::mismatch
- #include "simple/support/algorithm/copy.hpp" // support::copy_n
- #include "simple/support/range.hpp" // support::range
- #include "simple/support/type_traits.hpp" // support::is_template_instance_v
- #include "bits.hpp" // read_bits
- #include "hash_table.hpp" // hash_table
- namespace simple::compress
- {
- template <typename T, typename It>
- [[nodiscard]] constexpr T from_bytes(It i)
- {
- using value_type = typename std::iterator_traits<It>::value_type;
- static_assert(sizeof(value_type) <= sizeof(T));
- auto count = sizeof(T)/sizeof(value_type);
- T result{};
- auto dest = reinterpret_cast<std::byte*>(&result);
- while(count --> 0)
- {
- auto&& value = *i;
- std::memcpy(dest, &value, sizeof(value_type));
- ++i; dest += sizeof(value_type);
- }
- return result;
- }
- template <typename It, typename End, typename Map = std::nullptr_t,
- std::enable_if_t<support::is_iterable_v<It>>* = nullptr>
- [[nodiscard]] constexpr
- std::pair<support::range<It>, It> lz77_lookup(It dict, It begin, End end, const Map& map = nullptr)
- {
- if constexpr (std::is_same_v<Map, std::nullptr_t>)
- {
- auto match = support::search_prefix(dict, end, begin, end);
- auto best_match = match;
- while(match.first.end() < begin)
- {
- match = support::search_prefix(match.first.end(), end, begin, end);
- if(support::distance(best_match.first) < support::distance(match.first))
- best_match = match;
- }
- return best_match;
- }
- else
- {
- std::pair best_match{support::range{begin,begin}, begin}; // hmm
- using key_t = typename Map::key_type;
- int lookahead = sizeof(key_t);
- if((end-begin) >= lookahead)
- {
- auto candidates = map.equal_range(from_bytes<key_t>(begin));
- for(auto& p : candidates)
- {
- if(p < dict)
- continue;
- auto mismatch = support::mismatch(p, end, begin, end);
- auto match = std::pair{support::range{p,mismatch.first}, mismatch.second};
- if(support::distance(best_match.first) < support::distance(match.first))
- best_match = match;
- }
- }
- return best_match;
- }
- }
- template <
- // in perfect world this can be derived from difference_type of the iterator, which for data of compile time size/capacity should be a range constrained integer (see boost::safe_numerics)
- typename ref_offset_t = std::uint16_t,
- typename I = void, typename O = void,
- typename hash_table = hash_table<unsigned, I, (1<<16)>>
- // TODO enable if trivially copyable I::value_type and hash_table<key_type> or something
- // TODO: code path without the hash table
- constexpr O lz77_encode(I begin, I end, O o, hash_table&& map = hash_table{}) // TODO: out_end
- {
- constexpr ref_offset_t ref_offset_max = [](){
- auto bits = get_bits(ref_offset_t{});
- // do this step by step to prevent unsigned short from freaking exploding into an int TODO: safe unsigned short
- bits = ~bits;
- bits >>= std::numeric_limits<decltype(bits)>::digits - bit_count(ref_offset_t{});
- return bits;
- }();
- using key_type = typename hash_table::key_type;
- constexpr int lookback = sizeof(key_type) -1;
- #if defined SIMPLE_SUPPORT_DEBUG_HPP
- auto pr = 0;
- auto refs = 0;
- #endif
- // TODO o_end
- auto i = begin;
- while (i != end)
- {
- #if defined SIMPLE_SUPPORT_DEBUG_HPP
- typename hash_table::stats_t ms{};
- if(++pr == (end-begin)/1000)
- {
- ms = map.stats();
- support::print("i: ", i - begin, " p: ", float(i-begin) / (end - begin), " r: ", refs, " ms: ", ms.total_elements, " mb: ", ms.non_empty_rows, " mx: ", ms.max_row_size, " ", '\r');
- pr = 0;
- }
- #endif
- auto window = i - begin;
- auto dict = window > ref_offset_max ? i - ref_offset_max : begin;
- auto match = lz77_lookup(dict, i, end, map);
- auto match_size = support::distance(match.first);
- if(match_size > 8) // FIXME: parameterize
- {
- auto pattern_size = i - match.first.begin();
- ref_offset_t encodable_pattern_chunk = ref_offset_max / pattern_size * pattern_size;
- while(match_size > ref_offset_max)
- {
- #if defined SIMPLE_SUPPORT_DEBUG_HPP
- ++ refs;
- #endif
- *o = true; ++o;
- support::range offset = match.first - dict;
- *o = (ref_offset_t)offset.lower(); ++o;
- *o = encodable_pattern_chunk; ++o;
- match.first.lower() += encodable_pattern_chunk;
- match_size -= encodable_pattern_chunk;
- dict += encodable_pattern_chunk;
- }
- #if defined SIMPLE_SUPPORT_DEBUG_HPP
- ++ refs;
- #endif
- *o = true; ++o;
- support::range offset = match.first - dict;
- *o = (ref_offset_t)offset.lower(); ++o;
- *o = (ref_offset_t)(offset.upper() - offset.lower()); ++o;
- if((i - begin) >= lookback)
- {
- key_type key = from_bytes<key_type>(i-lookback);
- map.insert(key, i - lookback);
- ++i;
- auto prev_key{key};
- while(i != match.second)
- {
- key = from_bytes<key_type>(i-lookback);
- if(key != prev_key)
- map.insert(key, i-lookback);
- prev_key = key;
- ++i;
- }
- } else i = match.second;
- }
- else
- {
- // write a sym
- *o = false; ++o;
- *o = *i; ++o;
- if((i - begin) >= lookback)
- map.insert(from_bytes<key_type>(i - lookback), i - lookback);
- ++i;
- }
- if(map.size() > ref_offset_max*3) // FIXME: magic condition
- map.erase_if([dict](auto&& x) {return x < dict;});
- }
- return o;
- };
- template <typename ref_offset_t = std::uint16_t, typename I = void, typename O = void,
- std::enable_if_t<support::is_template_instance_v<bit_iterator, I>>* = nullptr
- >
- constexpr auto lz77_decode(I i, O out_begin, O out_end)
- {
- // FIXME; de-duplicate
- constexpr ref_offset_t ref_offset_max = [](){
- auto bits = get_bits(ref_offset_t{});
- // do this step by step to prevent unsigned short to freaking exploding into an int TODO: safe unsigned short
- bits = ~bits;
- bits >>= std::numeric_limits<decltype(bits)>::digits - bit_count(ref_offset_t{});
- return bits;
- }();
- auto o = out_begin;
- while(o != out_end)
- {
- bool type{};
- i = read_bits(i,type);
- if(type)
- {
- ref_offset_t position{};
- ref_offset_t size{};
- i = read_bits(i,position);
- i = read_bits(i,size);
- auto decoded = o - out_begin;
- auto dict = decoded > ref_offset_max ? o - ref_offset_max : out_begin;
- assert(size <= (out_end - o));
- o = support::copy_n(dict+position, size, o).second;
- }
- else
- {
- typename std::iterator_traits<O>::value_type c{};
- i = read_bits(i,c);
- *o = c;
- ++o;
- }
- }
- return i;
- }
- template <typename ref_offset_t = std::uint16_t, typename I = void, typename O = void,
- std::enable_if_t<not support::is_template_instance_v<bit_iterator, I>>* = nullptr
- >
- constexpr auto lz77_decode(I i, O out_begin, O out_end)
- {
- return lz77_decode<ref_offset_t>(bit_iterator{i,0}, out_begin, out_end);
- }
- } // namespace simple::compress
- #endif /* end of include guard */
|