huffman_codec.h 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390
  1. // Copyright (c) 2017 Google Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. // Contains utils for reading, writing and debug printing bit streams.
  15. #ifndef SOURCE_COMP_HUFFMAN_CODEC_H_
  16. #define SOURCE_COMP_HUFFMAN_CODEC_H_
  17. #include <algorithm>
  18. #include <cassert>
  19. #include <functional>
  20. #include <iomanip>
  21. #include <map>
  22. #include <memory>
  23. #include <ostream>
  24. #include <queue>
  25. #include <sstream>
  26. #include <stack>
  27. #include <string>
  28. #include <tuple>
  29. #include <unordered_map>
  30. #include <utility>
  31. #include <vector>
  32. namespace spvtools {
  33. namespace comp {
  34. // Used to generate and apply a Huffman coding scheme.
  35. // |Val| is the type of variable being encoded (for example a string or a
  36. // literal).
  37. template <class Val>
  38. class HuffmanCodec {
  39. public:
  40. // Huffman tree node.
  41. struct Node {
  42. Node() {}
  43. // Creates Node from serialization leaving weight and id undefined.
  44. Node(const Val& in_value, uint32_t in_left, uint32_t in_right)
  45. : value(in_value), left(in_left), right(in_right) {}
  46. Val value = Val();
  47. uint32_t weight = 0;
  48. // Ids are issued sequentially starting from 1. Ids are used as an ordering
  49. // tie-breaker, to make sure that the ordering (and resulting coding scheme)
  50. // are consistent accross multiple platforms.
  51. uint32_t id = 0;
  52. // Handles of children.
  53. uint32_t left = 0;
  54. uint32_t right = 0;
  55. };
  56. // Creates Huffman codec from a histogramm.
  57. // Histogramm counts must not be zero.
  58. explicit HuffmanCodec(const std::map<Val, uint32_t>& hist) {
  59. if (hist.empty()) return;
  60. // Heuristic estimate.
  61. nodes_.reserve(3 * hist.size());
  62. // Create NIL.
  63. CreateNode();
  64. // The queue is sorted in ascending order by weight (or by node id if
  65. // weights are equal).
  66. std::vector<uint32_t> queue_vector;
  67. queue_vector.reserve(hist.size());
  68. std::priority_queue<uint32_t, std::vector<uint32_t>,
  69. std::function<bool(uint32_t, uint32_t)>>
  70. queue(std::bind(&HuffmanCodec::LeftIsBigger, this,
  71. std::placeholders::_1, std::placeholders::_2),
  72. std::move(queue_vector));
  73. // Put all leaves in the queue.
  74. for (const auto& pair : hist) {
  75. const uint32_t node = CreateNode();
  76. MutableValueOf(node) = pair.first;
  77. MutableWeightOf(node) = pair.second;
  78. assert(WeightOf(node));
  79. queue.push(node);
  80. }
  81. // Form the tree by combining two subtrees with the least weight,
  82. // and pushing the root of the new tree in the queue.
  83. while (true) {
  84. // We push a node at the end of each iteration, so the queue is never
  85. // supposed to be empty at this point, unless there are no leaves, but
  86. // that case was already handled.
  87. assert(!queue.empty());
  88. const uint32_t right = queue.top();
  89. queue.pop();
  90. // If the queue is empty at this point, then the last node is
  91. // the root of the complete Huffman tree.
  92. if (queue.empty()) {
  93. root_ = right;
  94. break;
  95. }
  96. const uint32_t left = queue.top();
  97. queue.pop();
  98. // Combine left and right into a new tree and push it into the queue.
  99. const uint32_t parent = CreateNode();
  100. MutableWeightOf(parent) = WeightOf(right) + WeightOf(left);
  101. MutableLeftOf(parent) = left;
  102. MutableRightOf(parent) = right;
  103. queue.push(parent);
  104. }
  105. // Traverse the tree and form encoding table.
  106. CreateEncodingTable();
  107. }
  108. // Creates Huffman codec from saved tree structure.
  109. // |nodes| is the list of nodes of the tree, nodes[0] being NIL.
  110. // |root_handle| is the index of the root node.
  111. HuffmanCodec(uint32_t root_handle, std::vector<Node>&& nodes) {
  112. nodes_ = std::move(nodes);
  113. assert(!nodes_.empty());
  114. assert(root_handle > 0 && root_handle < nodes_.size());
  115. assert(!LeftOf(0) && !RightOf(0));
  116. root_ = root_handle;
  117. // Traverse the tree and form encoding table.
  118. CreateEncodingTable();
  119. }
  120. // Serializes the codec in the following text format:
  121. // (<root_handle>, {
  122. // {0, 0, 0},
  123. // {val1, left1, right1},
  124. // {val2, left2, right2},
  125. // ...
  126. // })
  127. std::string SerializeToText(int indent_num_whitespaces) const {
  128. const bool value_is_text = std::is_same<Val, std::string>::value;
  129. const std::string indent1 = std::string(indent_num_whitespaces, ' ');
  130. const std::string indent2 = std::string(indent_num_whitespaces + 2, ' ');
  131. std::stringstream code;
  132. code << "(" << root_ << ", {\n";
  133. for (const Node& node : nodes_) {
  134. code << indent2 << "{";
  135. if (value_is_text) code << "\"";
  136. code << node.value;
  137. if (value_is_text) code << "\"";
  138. code << ", " << node.left << ", " << node.right << "},\n";
  139. }
  140. code << indent1 << "})";
  141. return code.str();
  142. }
  143. // Prints the Huffman tree in the following format:
  144. // w------w------'x'
  145. // w------'y'
  146. // Where w stands for the weight of the node.
  147. // Right tree branches appear above left branches. Taking the right path
  148. // adds 1 to the code, taking the left adds 0.
  149. void PrintTree(std::ostream& out) const { PrintTreeInternal(out, root_, 0); }
  150. // Traverses the tree and prints the Huffman table: value, code
  151. // and optionally node weight for every leaf.
  152. void PrintTable(std::ostream& out, bool print_weights = true) {
  153. std::queue<std::pair<uint32_t, std::string>> queue;
  154. queue.emplace(root_, "");
  155. while (!queue.empty()) {
  156. const uint32_t node = queue.front().first;
  157. const std::string code = queue.front().second;
  158. queue.pop();
  159. if (!RightOf(node) && !LeftOf(node)) {
  160. out << ValueOf(node);
  161. if (print_weights) out << " " << WeightOf(node);
  162. out << " " << code << std::endl;
  163. } else {
  164. if (LeftOf(node)) queue.emplace(LeftOf(node), code + "0");
  165. if (RightOf(node)) queue.emplace(RightOf(node), code + "1");
  166. }
  167. }
  168. }
  169. // Returns the Huffman table. The table was built at at construction time,
  170. // this function just returns a const reference.
  171. const std::unordered_map<Val, std::pair<uint64_t, size_t>>& GetEncodingTable()
  172. const {
  173. return encoding_table_;
  174. }
  175. // Encodes |val| and stores its Huffman code in the lower |num_bits| of
  176. // |bits|. Returns false of |val| is not in the Huffman table.
  177. bool Encode(const Val& val, uint64_t* bits, size_t* num_bits) const {
  178. auto it = encoding_table_.find(val);
  179. if (it == encoding_table_.end()) return false;
  180. *bits = it->second.first;
  181. *num_bits = it->second.second;
  182. return true;
  183. }
  184. // Reads bits one-by-one using callback |read_bit| until a match is found.
  185. // Matching value is stored in |val|. Returns false if |read_bit| terminates
  186. // before a code was mathced.
  187. // |read_bit| has type bool func(bool* bit). When called, the next bit is
  188. // stored in |bit|. |read_bit| returns false if the stream terminates
  189. // prematurely.
  190. bool DecodeFromStream(const std::function<bool(bool*)>& read_bit,
  191. Val* val) const {
  192. uint32_t node = root_;
  193. while (true) {
  194. assert(node);
  195. if (!RightOf(node) && !LeftOf(node)) {
  196. *val = ValueOf(node);
  197. return true;
  198. }
  199. bool go_right;
  200. if (!read_bit(&go_right)) return false;
  201. if (go_right)
  202. node = RightOf(node);
  203. else
  204. node = LeftOf(node);
  205. }
  206. assert(0);
  207. return false;
  208. }
  209. private:
  210. // Returns value of the node referenced by |handle|.
  211. Val ValueOf(uint32_t node) const { return nodes_.at(node).value; }
  212. // Returns left child of |node|.
  213. uint32_t LeftOf(uint32_t node) const { return nodes_.at(node).left; }
  214. // Returns right child of |node|.
  215. uint32_t RightOf(uint32_t node) const { return nodes_.at(node).right; }
  216. // Returns weight of |node|.
  217. uint32_t WeightOf(uint32_t node) const { return nodes_.at(node).weight; }
  218. // Returns id of |node|.
  219. uint32_t IdOf(uint32_t node) const { return nodes_.at(node).id; }
  220. // Returns mutable reference to value of |node|.
  221. Val& MutableValueOf(uint32_t node) {
  222. assert(node);
  223. return nodes_.at(node).value;
  224. }
  225. // Returns mutable reference to handle of left child of |node|.
  226. uint32_t& MutableLeftOf(uint32_t node) {
  227. assert(node);
  228. return nodes_.at(node).left;
  229. }
  230. // Returns mutable reference to handle of right child of |node|.
  231. uint32_t& MutableRightOf(uint32_t node) {
  232. assert(node);
  233. return nodes_.at(node).right;
  234. }
  235. // Returns mutable reference to weight of |node|.
  236. uint32_t& MutableWeightOf(uint32_t node) { return nodes_.at(node).weight; }
  237. // Returns mutable reference to id of |node|.
  238. uint32_t& MutableIdOf(uint32_t node) { return nodes_.at(node).id; }
  239. // Returns true if |left| has bigger weight than |right|. Node ids are
  240. // used as tie-breaker.
  241. bool LeftIsBigger(uint32_t left, uint32_t right) const {
  242. if (WeightOf(left) == WeightOf(right)) {
  243. assert(IdOf(left) != IdOf(right));
  244. return IdOf(left) > IdOf(right);
  245. }
  246. return WeightOf(left) > WeightOf(right);
  247. }
  248. // Prints subtree (helper function used by PrintTree).
  249. void PrintTreeInternal(std::ostream& out, uint32_t node, size_t depth) const {
  250. if (!node) return;
  251. const size_t kTextFieldWidth = 7;
  252. if (!RightOf(node) && !LeftOf(node)) {
  253. out << ValueOf(node) << std::endl;
  254. } else {
  255. if (RightOf(node)) {
  256. std::stringstream label;
  257. label << std::setfill('-') << std::left << std::setw(kTextFieldWidth)
  258. << WeightOf(RightOf(node));
  259. out << label.str();
  260. PrintTreeInternal(out, RightOf(node), depth + 1);
  261. }
  262. if (LeftOf(node)) {
  263. out << std::string(depth * kTextFieldWidth, ' ');
  264. std::stringstream label;
  265. label << std::setfill('-') << std::left << std::setw(kTextFieldWidth)
  266. << WeightOf(LeftOf(node));
  267. out << label.str();
  268. PrintTreeInternal(out, LeftOf(node), depth + 1);
  269. }
  270. }
  271. }
  272. // Traverses the Huffman tree and saves paths to the leaves as bit
  273. // sequences to encoding_table_.
  274. void CreateEncodingTable() {
  275. struct Context {
  276. Context(uint32_t in_node, uint64_t in_bits, size_t in_depth)
  277. : node(in_node), bits(in_bits), depth(in_depth) {}
  278. uint32_t node;
  279. // Huffman tree depth cannot exceed 64 as histogramm counts are expected
  280. // to be positive and limited by numeric_limits<uint32_t>::max().
  281. // For practical applications tree depth would be much smaller than 64.
  282. uint64_t bits;
  283. size_t depth;
  284. };
  285. std::queue<Context> queue;
  286. queue.emplace(root_, 0, 0);
  287. while (!queue.empty()) {
  288. const Context& context = queue.front();
  289. const uint32_t node = context.node;
  290. const uint64_t bits = context.bits;
  291. const size_t depth = context.depth;
  292. queue.pop();
  293. if (!RightOf(node) && !LeftOf(node)) {
  294. auto insertion_result = encoding_table_.emplace(
  295. ValueOf(node), std::pair<uint64_t, size_t>(bits, depth));
  296. assert(insertion_result.second);
  297. (void)insertion_result;
  298. } else {
  299. if (LeftOf(node)) queue.emplace(LeftOf(node), bits, depth + 1);
  300. if (RightOf(node))
  301. queue.emplace(RightOf(node), bits | (1ULL << depth), depth + 1);
  302. }
  303. }
  304. }
  305. // Creates new Huffman tree node and stores it in the deleter array.
  306. uint32_t CreateNode() {
  307. const uint32_t handle = static_cast<uint32_t>(nodes_.size());
  308. nodes_.emplace_back(Node());
  309. nodes_.back().id = next_node_id_++;
  310. return handle;
  311. }
  312. // Huffman tree root handle.
  313. uint32_t root_ = 0;
  314. // Huffman tree deleter.
  315. std::vector<Node> nodes_;
  316. // Encoding table value -> {bits, num_bits}.
  317. // Huffman codes are expected to never exceed 64 bit length (this is in fact
  318. // impossible if frequencies are stored as uint32_t).
  319. std::unordered_map<Val, std::pair<uint64_t, size_t>> encoding_table_;
  320. // Next node id issued by CreateNode();
  321. uint32_t next_node_id_ = 1;
  322. };
  323. } // namespace comp
  324. } // namespace spvtools
  325. #endif // SOURCE_COMP_HUFFMAN_CODEC_H_