123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390 |
- // Copyright (c) 2017 Google Inc.
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- // Contains utils for reading, writing and debug printing bit streams.
- #ifndef SOURCE_COMP_HUFFMAN_CODEC_H_
- #define SOURCE_COMP_HUFFMAN_CODEC_H_
- #include <algorithm>
- #include <cassert>
- #include <functional>
- #include <iomanip>
- #include <map>
- #include <memory>
- #include <ostream>
- #include <queue>
- #include <sstream>
- #include <stack>
- #include <string>
- #include <tuple>
- #include <unordered_map>
- #include <utility>
- #include <vector>
- namespace spvtools {
- namespace comp {
- // Used to generate and apply a Huffman coding scheme.
- // |Val| is the type of variable being encoded (for example a string or a
- // literal).
- template <class Val>
- class HuffmanCodec {
- public:
- // Huffman tree node.
- struct Node {
- Node() {}
- // Creates Node from serialization leaving weight and id undefined.
- Node(const Val& in_value, uint32_t in_left, uint32_t in_right)
- : value(in_value), left(in_left), right(in_right) {}
- Val value = Val();
- uint32_t weight = 0;
- // Ids are issued sequentially starting from 1. Ids are used as an ordering
- // tie-breaker, to make sure that the ordering (and resulting coding scheme)
- // are consistent accross multiple platforms.
- uint32_t id = 0;
- // Handles of children.
- uint32_t left = 0;
- uint32_t right = 0;
- };
- // Creates Huffman codec from a histogramm.
- // Histogramm counts must not be zero.
- explicit HuffmanCodec(const std::map<Val, uint32_t>& hist) {
- if (hist.empty()) return;
- // Heuristic estimate.
- nodes_.reserve(3 * hist.size());
- // Create NIL.
- CreateNode();
- // The queue is sorted in ascending order by weight (or by node id if
- // weights are equal).
- std::vector<uint32_t> queue_vector;
- queue_vector.reserve(hist.size());
- std::priority_queue<uint32_t, std::vector<uint32_t>,
- std::function<bool(uint32_t, uint32_t)>>
- queue(std::bind(&HuffmanCodec::LeftIsBigger, this,
- std::placeholders::_1, std::placeholders::_2),
- std::move(queue_vector));
- // Put all leaves in the queue.
- for (const auto& pair : hist) {
- const uint32_t node = CreateNode();
- MutableValueOf(node) = pair.first;
- MutableWeightOf(node) = pair.second;
- assert(WeightOf(node));
- queue.push(node);
- }
- // Form the tree by combining two subtrees with the least weight,
- // and pushing the root of the new tree in the queue.
- while (true) {
- // We push a node at the end of each iteration, so the queue is never
- // supposed to be empty at this point, unless there are no leaves, but
- // that case was already handled.
- assert(!queue.empty());
- const uint32_t right = queue.top();
- queue.pop();
- // If the queue is empty at this point, then the last node is
- // the root of the complete Huffman tree.
- if (queue.empty()) {
- root_ = right;
- break;
- }
- const uint32_t left = queue.top();
- queue.pop();
- // Combine left and right into a new tree and push it into the queue.
- const uint32_t parent = CreateNode();
- MutableWeightOf(parent) = WeightOf(right) + WeightOf(left);
- MutableLeftOf(parent) = left;
- MutableRightOf(parent) = right;
- queue.push(parent);
- }
- // Traverse the tree and form encoding table.
- CreateEncodingTable();
- }
- // Creates Huffman codec from saved tree structure.
- // |nodes| is the list of nodes of the tree, nodes[0] being NIL.
- // |root_handle| is the index of the root node.
- HuffmanCodec(uint32_t root_handle, std::vector<Node>&& nodes) {
- nodes_ = std::move(nodes);
- assert(!nodes_.empty());
- assert(root_handle > 0 && root_handle < nodes_.size());
- assert(!LeftOf(0) && !RightOf(0));
- root_ = root_handle;
- // Traverse the tree and form encoding table.
- CreateEncodingTable();
- }
- // Serializes the codec in the following text format:
- // (<root_handle>, {
- // {0, 0, 0},
- // {val1, left1, right1},
- // {val2, left2, right2},
- // ...
- // })
- std::string SerializeToText(int indent_num_whitespaces) const {
- const bool value_is_text = std::is_same<Val, std::string>::value;
- const std::string indent1 = std::string(indent_num_whitespaces, ' ');
- const std::string indent2 = std::string(indent_num_whitespaces + 2, ' ');
- std::stringstream code;
- code << "(" << root_ << ", {\n";
- for (const Node& node : nodes_) {
- code << indent2 << "{";
- if (value_is_text) code << "\"";
- code << node.value;
- if (value_is_text) code << "\"";
- code << ", " << node.left << ", " << node.right << "},\n";
- }
- code << indent1 << "})";
- return code.str();
- }
- // Prints the Huffman tree in the following format:
- // w------w------'x'
- // w------'y'
- // Where w stands for the weight of the node.
- // Right tree branches appear above left branches. Taking the right path
- // adds 1 to the code, taking the left adds 0.
- void PrintTree(std::ostream& out) const { PrintTreeInternal(out, root_, 0); }
- // Traverses the tree and prints the Huffman table: value, code
- // and optionally node weight for every leaf.
- void PrintTable(std::ostream& out, bool print_weights = true) {
- std::queue<std::pair<uint32_t, std::string>> queue;
- queue.emplace(root_, "");
- while (!queue.empty()) {
- const uint32_t node = queue.front().first;
- const std::string code = queue.front().second;
- queue.pop();
- if (!RightOf(node) && !LeftOf(node)) {
- out << ValueOf(node);
- if (print_weights) out << " " << WeightOf(node);
- out << " " << code << std::endl;
- } else {
- if (LeftOf(node)) queue.emplace(LeftOf(node), code + "0");
- if (RightOf(node)) queue.emplace(RightOf(node), code + "1");
- }
- }
- }
- // Returns the Huffman table. The table was built at at construction time,
- // this function just returns a const reference.
- const std::unordered_map<Val, std::pair<uint64_t, size_t>>& GetEncodingTable()
- const {
- return encoding_table_;
- }
- // Encodes |val| and stores its Huffman code in the lower |num_bits| of
- // |bits|. Returns false of |val| is not in the Huffman table.
- bool Encode(const Val& val, uint64_t* bits, size_t* num_bits) const {
- auto it = encoding_table_.find(val);
- if (it == encoding_table_.end()) return false;
- *bits = it->second.first;
- *num_bits = it->second.second;
- return true;
- }
- // Reads bits one-by-one using callback |read_bit| until a match is found.
- // Matching value is stored in |val|. Returns false if |read_bit| terminates
- // before a code was mathced.
- // |read_bit| has type bool func(bool* bit). When called, the next bit is
- // stored in |bit|. |read_bit| returns false if the stream terminates
- // prematurely.
- bool DecodeFromStream(const std::function<bool(bool*)>& read_bit,
- Val* val) const {
- uint32_t node = root_;
- while (true) {
- assert(node);
- if (!RightOf(node) && !LeftOf(node)) {
- *val = ValueOf(node);
- return true;
- }
- bool go_right;
- if (!read_bit(&go_right)) return false;
- if (go_right)
- node = RightOf(node);
- else
- node = LeftOf(node);
- }
- assert(0);
- return false;
- }
- private:
- // Returns value of the node referenced by |handle|.
- Val ValueOf(uint32_t node) const { return nodes_.at(node).value; }
- // Returns left child of |node|.
- uint32_t LeftOf(uint32_t node) const { return nodes_.at(node).left; }
- // Returns right child of |node|.
- uint32_t RightOf(uint32_t node) const { return nodes_.at(node).right; }
- // Returns weight of |node|.
- uint32_t WeightOf(uint32_t node) const { return nodes_.at(node).weight; }
- // Returns id of |node|.
- uint32_t IdOf(uint32_t node) const { return nodes_.at(node).id; }
- // Returns mutable reference to value of |node|.
- Val& MutableValueOf(uint32_t node) {
- assert(node);
- return nodes_.at(node).value;
- }
- // Returns mutable reference to handle of left child of |node|.
- uint32_t& MutableLeftOf(uint32_t node) {
- assert(node);
- return nodes_.at(node).left;
- }
- // Returns mutable reference to handle of right child of |node|.
- uint32_t& MutableRightOf(uint32_t node) {
- assert(node);
- return nodes_.at(node).right;
- }
- // Returns mutable reference to weight of |node|.
- uint32_t& MutableWeightOf(uint32_t node) { return nodes_.at(node).weight; }
- // Returns mutable reference to id of |node|.
- uint32_t& MutableIdOf(uint32_t node) { return nodes_.at(node).id; }
- // Returns true if |left| has bigger weight than |right|. Node ids are
- // used as tie-breaker.
- bool LeftIsBigger(uint32_t left, uint32_t right) const {
- if (WeightOf(left) == WeightOf(right)) {
- assert(IdOf(left) != IdOf(right));
- return IdOf(left) > IdOf(right);
- }
- return WeightOf(left) > WeightOf(right);
- }
- // Prints subtree (helper function used by PrintTree).
- void PrintTreeInternal(std::ostream& out, uint32_t node, size_t depth) const {
- if (!node) return;
- const size_t kTextFieldWidth = 7;
- if (!RightOf(node) && !LeftOf(node)) {
- out << ValueOf(node) << std::endl;
- } else {
- if (RightOf(node)) {
- std::stringstream label;
- label << std::setfill('-') << std::left << std::setw(kTextFieldWidth)
- << WeightOf(RightOf(node));
- out << label.str();
- PrintTreeInternal(out, RightOf(node), depth + 1);
- }
- if (LeftOf(node)) {
- out << std::string(depth * kTextFieldWidth, ' ');
- std::stringstream label;
- label << std::setfill('-') << std::left << std::setw(kTextFieldWidth)
- << WeightOf(LeftOf(node));
- out << label.str();
- PrintTreeInternal(out, LeftOf(node), depth + 1);
- }
- }
- }
- // Traverses the Huffman tree and saves paths to the leaves as bit
- // sequences to encoding_table_.
- void CreateEncodingTable() {
- struct Context {
- Context(uint32_t in_node, uint64_t in_bits, size_t in_depth)
- : node(in_node), bits(in_bits), depth(in_depth) {}
- uint32_t node;
- // Huffman tree depth cannot exceed 64 as histogramm counts are expected
- // to be positive and limited by numeric_limits<uint32_t>::max().
- // For practical applications tree depth would be much smaller than 64.
- uint64_t bits;
- size_t depth;
- };
- std::queue<Context> queue;
- queue.emplace(root_, 0, 0);
- while (!queue.empty()) {
- const Context& context = queue.front();
- const uint32_t node = context.node;
- const uint64_t bits = context.bits;
- const size_t depth = context.depth;
- queue.pop();
- if (!RightOf(node) && !LeftOf(node)) {
- auto insertion_result = encoding_table_.emplace(
- ValueOf(node), std::pair<uint64_t, size_t>(bits, depth));
- assert(insertion_result.second);
- (void)insertion_result;
- } else {
- if (LeftOf(node)) queue.emplace(LeftOf(node), bits, depth + 1);
- if (RightOf(node))
- queue.emplace(RightOf(node), bits | (1ULL << depth), depth + 1);
- }
- }
- }
- // Creates new Huffman tree node and stores it in the deleter array.
- uint32_t CreateNode() {
- const uint32_t handle = static_cast<uint32_t>(nodes_.size());
- nodes_.emplace_back(Node());
- nodes_.back().id = next_node_id_++;
- return handle;
- }
- // Huffman tree root handle.
- uint32_t root_ = 0;
- // Huffman tree deleter.
- std::vector<Node> nodes_;
- // Encoding table value -> {bits, num_bits}.
- // Huffman codes are expected to never exceed 64 bit length (this is in fact
- // impossible if frequencies are stored as uint32_t).
- std::unordered_map<Val, std::pair<uint64_t, size_t>> encoding_table_;
- // Next node id issued by CreateNode();
- uint32_t next_node_id_ = 1;
- };
- } // namespace comp
- } // namespace spvtools
- #endif // SOURCE_COMP_HUFFMAN_CODEC_H_
|