lz77.h 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660
  1. #ifndef __X_LZ77_H
  2. #define __X_LZ77_H
  3. /*
  4. * This code is in the public domain, see: http://unlicense.org/
  5. *
  6. * Feel free to steal it.
  7. */
  8. /*
  9. * This is a variation on the LZ77 compression algorithm.
  10. * It is designed for code simplicity and clarity.
  11. *
  12. * Highlights:
  13. *
  14. * - Portable, self-contained, tiny implementation in readable C++.
  15. * (Header-only, no ifdefs or CPU dependencies or other stupid tricks.)
  16. * - Fast decompression.
  17. * - Pretty good compression quality.
  18. * - Simple 'one-button' API for realistic use-cases.
  19. * - No penalty (only 2 bytes overhead) even when compressing very short strings.
  20. * - Fully streamable decompressor: feed it chunks of arbitrarity-sized data, and the
  21. * original uncompressed buffers will be reconstructed automatically.
  22. *
  23. * Compression performance and quality should be _roughly_ on par with other
  24. * compression algorithms.
  25. *
  26. * (Compression ratio is comparable to other LZ algorithms when at high quality
  27. * settings, compression speed is comparable to gzip. Decompression speed is on par
  28. * with other fast LZ algorithms.)
  29. *
  30. */
  31. /*
  32. * Usage:
  33. #include "lz77.h"
  34. std::string input = ...;
  35. lz77::compress_t compress;
  36. std::string compressed = compress.feed(input);
  37. lz77::decompress_t decompress;
  38. std::string temp;
  39. decompress.feed(compressed, temp);
  40. const std::string& uncompressed = decompress.result();
  41. NOTE: if you're compressing short strings (on the order of a few kilobytes)
  42. then instantiating lz77::compress_t with the arguments (12, 4096) will
  43. give better results.
  44. --------
  45. Use decompress.feed(...) for feeding input data step-by-step in chunks.
  46. For example, if you're trying to decompress a network byte stream:
  47. lz77::decompress_t decompress;
  48. std::string extra;
  49. bool done = decompress.feed(buffer, extra);
  50. while (!done) {
  51. done = decompress.feed(buffer, extra);
  52. }
  53. std::string result = decompress.result();
  54. 'feed' will start (or continue) decoding a packet of compressed data.
  55. If it returns true, then all of the message was decompressed.
  56. If it returns false, then 'feed' needs to be called with more
  57. compressed data until it returns 'true'.
  58. 'extra' will hold any data that was tacked onto the buffer but wasn't
  59. part of this packet of compressed data. (Useful when decoding a message
  60. stream that doesn't have clearly delineated message boundaries; the
  61. decompressor will detect message boundaries properly for you.)
  62. 'extra' will be assigned to only when 'feed' returns true.
  63. 'result' is the decompressed message.
  64. NOTE: calling feed() and result() out of order is undefined
  65. behaviour and might result in crashes.
  66. */
  67. #include <stdexcept>
  68. #include <string>
  69. #include <vector>
  70. #include <string.h>
  71. #include <stdint.h>
  72. namespace lz77 {
  73. // Constants.
  74. // They were chosen by a series of unscientific empirical tests.
  75. enum {
  76. DEFAULT_SEARCHLEN = 12,
  77. DEFAULT_BLOCKSIZE = 64*1024,
  78. SHORTRUN_BITS = 3,
  79. SHORTRUN_MAX = (1 << SHORTRUN_BITS),
  80. MIN_RUN = 3 // 5
  81. };
  82. // Utility function: encode a size_t as a variable-sized stream of octets with 7 bits of useful data.
  83. // (One bit is used to signal an end of stream.)
  84. inline void push_vlq_uint(size_t n, std::string& out) {
  85. while (1) {
  86. unsigned char c = n & 0x7F;
  87. size_t q = n >> 7;
  88. if (q == 0) {
  89. out += c;
  90. break;
  91. }
  92. out += (c | 0x80);
  93. n = q;
  94. }
  95. }
  96. // Utility function: return common prefix length of two strings.
  97. inline size_t substr_run(const unsigned char* ai, const unsigned char* ae,
  98. const unsigned char* bi, const unsigned char* be) {
  99. size_t n = 0;
  100. while (1) {
  101. if (*ai != *bi)
  102. break;
  103. ++n;
  104. ++ai;
  105. ++bi;
  106. if (ai == ae || bi == be)
  107. break;
  108. }
  109. return n;
  110. }
  111. // Utility function: Hash the first MIN_RUN bytes of a string into 16-bit ints.
  112. // (MIN_RUN is a magic constant.)
  113. // The hash function itself is important for compression quality.
  114. // This is the FNV hash, a very very simple and quite good hash algorithm.
  115. inline uint32_t fnv32a(const unsigned char* i, size_t len, uint32_t hash = 0x811c9dc5) {
  116. while (len > 0) {
  117. hash ^= (uint32_t)(*i);
  118. hash *= (uint32_t)0x01000193;
  119. ++i;
  120. --len;
  121. }
  122. return hash;
  123. }
  124. inline void pack_bytes(const unsigned char* i, uint16_t& packed, size_t blocksize) {
  125. uint32_t a = fnv32a(i, MIN_RUN);
  126. packed = a % blocksize;
  127. }
  128. // Compute the profit from compression; 'run' is the length of a string at position 'offset'.
  129. // 'run' and 'offset' are numbers encoded as variable-length bitstreams; the sum length of
  130. // encoded 'run' and 'offset' must be less than 'run'.
  131. inline size_t vlq_length(size_t x) {
  132. size_t ret = 1;
  133. if (x > 0x7F)
  134. ret++;
  135. if (x > 0x3fff)
  136. ret++;
  137. if (x > 0x1fffff)
  138. ret++;
  139. return ret;
  140. }
  141. inline size_t gains(size_t run, size_t offset) {
  142. // Note: this function uses knowledge about the layout of bits in the compressed data format.
  143. size_t gain = run;
  144. offset = offset << (SHORTRUN_BITS + 1);
  145. size_t loss = vlq_length(offset);
  146. if (run >= SHORTRUN_MAX) {
  147. loss += vlq_length(run - MIN_RUN + 1);
  148. }
  149. if (loss > gain)
  150. return 0;
  151. return gain - loss;
  152. }
  153. // Hash table already seen strings; it maps from a hash of a string prefix to
  154. // a list of offsets. (At each offset there is a string with a prefix that hashes
  155. // to the key.)
  156. struct offsets_dict_t {
  157. typedef std::vector<size_t> offsets_t;
  158. offsets_t offsets;
  159. size_t searchlen;
  160. size_t blocksize;
  161. offsets_dict_t(size_t sl, size_t bs) : searchlen(sl), blocksize(bs) {
  162. offsets.resize((searchlen + 1) * blocksize);
  163. }
  164. void clear() {
  165. offsets.assign((searchlen + 1) * blocksize, 0);
  166. }
  167. // Functions for a simple circular buffer data structure.
  168. static size_t* prev(size_t* b, size_t* e, size_t* i) {
  169. if (i == b)
  170. i = e;
  171. --i;
  172. return i;
  173. }
  174. static size_t push_back(size_t* b, size_t* e, size_t* head, size_t val) {
  175. *head = val;
  176. ++head;
  177. if (head == e)
  178. head = b;
  179. return head - b;
  180. }
  181. void operator()(uint16_t packed, const unsigned char* i0, const unsigned char* i, const unsigned char* e,
  182. size_t& maxrun, size_t& maxoffset, size_t& maxgain) {
  183. // Select a range of values representing a circular buffer.
  184. // The first value is the index of the buffer head, the rest are
  185. // the values of the buffer itself.
  186. size_t* cb_start = &offsets[packed * (searchlen + 1)];
  187. size_t* cb_beg = (cb_start + 1);
  188. size_t* cb_end = (cb_start + 1 + searchlen);
  189. size_t* cb_head = cb_beg + *cb_start;
  190. size_t* cb_i = cb_head;
  191. while (1) {
  192. cb_i = prev(cb_beg, cb_end, cb_i);
  193. if (*cb_i == 0)
  194. break;
  195. // The stored value is position + 1 to allow 0 to mean 'uninitialized offset'.
  196. size_t pos = *cb_i - 1;
  197. size_t offset = i - i0 - pos;
  198. size_t run = substr_run(i, e, i0 + pos, e);
  199. size_t gain = gains(run, offset);
  200. if (gain > maxgain) {
  201. maxrun = run;
  202. maxoffset = offset;
  203. maxgain = gain;
  204. }
  205. if (cb_i == cb_head)
  206. break;
  207. }
  208. *cb_start = push_back(cb_beg, cb_end, cb_head, i - i0 + 1);
  209. }
  210. };
  211. /*
  212. *
  213. * Entry point for compression.
  214. *
  215. * Inputs: std::string of data to be compressed.
  216. *
  217. * Also optionally parameters for tuning speed and quality.
  218. *
  219. * There are two parameters: 'searchlen' and 'blocksize'.
  220. *
  221. * 'blocksize' is the upper bound for hash table sizes.
  222. * 'searchlen' is the upper bound for lists of offsets at each hash value.
  223. *
  224. * A larger 'searchlen' increases compression quality, running time and memory consumption.
  225. * A larger 'blocksize' increases memory consumption and compression quality.
  226. *
  227. * If you want faster compression at the expense of quality, try lowering searchlen.
  228. *
  229. * If you only ever compress short strings, try lowering blocksize to save memory.
  230. *
  231. * Output: the compressed data as a string.
  232. */
  233. struct compress_t {
  234. offsets_dict_t offsets;
  235. compress_t(size_t searchlen = DEFAULT_SEARCHLEN, size_t blocksize = DEFAULT_BLOCKSIZE) :
  236. offsets(searchlen, blocksize) {}
  237. std::string feed(const unsigned char* i, const unsigned char* e) {
  238. const unsigned char* i0 = i;
  239. std::string ret;
  240. std::string unc;
  241. push_vlq_uint(e - i, ret);
  242. offsets.clear();
  243. size_t blocksize = offsets.blocksize;
  244. while (i != e) {
  245. unsigned char c = *i;
  246. // The last MIN_RUN-1 bytes are uncompressable. (At least MIN_RUN bytes
  247. // are needed to calculate a prefix hash.)
  248. if (i > e - MIN_RUN) {
  249. unc +=c;
  250. ++i;
  251. continue;
  252. }
  253. size_t maxrun = 0;
  254. size_t maxoffset = 0;
  255. size_t maxgain = 0;
  256. uint16_t packed;
  257. // The MIN_RUN prefix length was chosen empirically, based on a series
  258. // of unscientific tests.
  259. pack_bytes(i, packed, blocksize);
  260. offsets(packed, i0, i, e, maxrun, maxoffset, maxgain);
  261. if (maxrun < MIN_RUN) {
  262. unc += c;
  263. ++i;
  264. continue;
  265. }
  266. if (unc.size() > 0) {
  267. // Write a packet of uncompressed data.
  268. size_t msg = (unc.size() << 1) | 1;
  269. push_vlq_uint(msg, ret);
  270. ret += unc;
  271. unc.clear();
  272. }
  273. // A compressed string is a length and an offset.
  274. // First subtract the minimum length (smaller lengths don't exist).
  275. // Then check if the length fits in SHORTRUN_BITS bits; if it does, then
  276. // tack it on to the offset. Otherwise write length and offset separately.
  277. // The rightmost bit is a zero to differentiate from packets of
  278. // uncompressed data.
  279. i += maxrun;
  280. maxrun = maxrun - MIN_RUN + 1;
  281. if (maxrun < SHORTRUN_MAX) {
  282. size_t msg = ((maxoffset << SHORTRUN_BITS) | maxrun) << 1;
  283. push_vlq_uint(msg, ret);
  284. } else {
  285. size_t msg = (maxoffset << (SHORTRUN_BITS + 1));
  286. push_vlq_uint(msg, ret);
  287. push_vlq_uint(maxrun, ret);
  288. }
  289. }
  290. if (unc.size() > 0) {
  291. size_t msg = (unc.size() << 1) | 1;
  292. push_vlq_uint(msg, ret);
  293. ret += unc;
  294. unc.clear();
  295. }
  296. return ret;
  297. }
  298. std::string feed(const std::string& s) {
  299. const unsigned char* i = (const unsigned char*)s.data();
  300. const unsigned char* e = i + s.size();
  301. return feed(i, e);
  302. }
  303. };
  304. /*
  305. * Entry point for decompression.
  306. * Calling 'feed' and 'result' out of order is undefined behaviour
  307. * and will crash your program.
  308. */
  309. struct decompress_t {
  310. size_t max_size;
  311. std::string ret;
  312. unsigned char* out;
  313. unsigned char* outb;
  314. unsigned char* oute;
  315. struct state_t {
  316. size_t msg;
  317. size_t run;
  318. size_t vlq_num;
  319. size_t vlq_off;
  320. enum {
  321. INIT,
  322. START,
  323. READ_DATA,
  324. READ_RUN
  325. } state;
  326. state_t() : msg(0), run(0), vlq_num(0), vlq_off(0), state(INIT) {}
  327. };
  328. state_t state;
  329. // Utility function: decode variable-length-coded unsigned integers.
  330. bool pop_vlq_uint(const unsigned char*& i, const unsigned char* e, size_t& res) {
  331. while (1) {
  332. if (i == e)
  333. return false;
  334. size_t c = *i;
  335. if ((c & 0x80) == 0) {
  336. state.vlq_num |= (c << state.vlq_off);
  337. break;
  338. }
  339. state.vlq_num |= ((c & 0x7F) << state.vlq_off);
  340. state.vlq_off += 7;
  341. ++i;
  342. }
  343. res = state.vlq_num;
  344. state.vlq_num = 0;
  345. state.vlq_off = 0;
  346. return true;
  347. }
  348. /*
  349. * max_size is the maximum size of decompressed data you're willing to accept.
  350. * This is strictly optional and needed for safety reasons, when you're
  351. * paranoid about accepting data from unknown sources.
  352. *
  353. * The default of 0 means no sanity checking is done.
  354. */
  355. decompress_t(size_t _max_size = 0) : max_size(_max_size), out(NULL), outb(NULL), oute(NULL) {}
  356. /*
  357. * Inputs: the compressed string, as output from 'compress()'.
  358. * Outputs:
  359. * true if all of the data was decompressed.
  360. * false if more input data needs to be fed via 'feed()'.
  361. * 'remaining' will hold input data that wasn't part of
  362. * the compressed message. (Only assigned to when all of
  363. * the data was decompressed.)
  364. */
  365. bool feed(const std::string& s, std::string& remaining) {
  366. const unsigned char* i = (const unsigned char*)s.data();
  367. const unsigned char* e = i + s.size();
  368. return feed(i, e, remaining);
  369. }
  370. bool feed(const unsigned char* i, const unsigned char* e, std::string& remaining) {
  371. // This function is complex because it is streamable and robust.
  372. // The routine checks if the input isn't complete and will properly
  373. // pick up from where we left off when the rest of the input arrives.
  374. if (state.state == state_t::INIT) {
  375. ret.clear();
  376. size_t size;
  377. if (!pop_vlq_uint(i, e, size))
  378. return true;
  379. ++i;
  380. state = state_t();
  381. if (max_size && size > max_size)
  382. throw std::length_error("Uncompressed data in message deemed too large");
  383. ret.resize(size);
  384. outb = (unsigned char*)ret.data();
  385. oute = outb + size;
  386. out = outb;
  387. state.state = state_t::START;
  388. }
  389. while (i != e) {
  390. if (out == oute) {
  391. remaining.assign(i, e);
  392. state.state = state_t::INIT;
  393. return true;
  394. }
  395. if (state.state == state_t::START) {
  396. if (!pop_vlq_uint(i, e, state.msg))
  397. return false;
  398. ++i;
  399. state.state = ((state.msg & 1) ? state_t::READ_DATA : state_t::READ_RUN);
  400. state.msg = state.msg >> 1;
  401. }
  402. if (state.state == state_t::READ_DATA) {
  403. size_t len = state.msg;
  404. if (out + len > oute)
  405. throw std::runtime_error("Malformed data while uncompressing");
  406. if (i == e)
  407. return false;
  408. if (i + len > e) {
  409. size_t l = e - i;
  410. ::memcpy(out, &(*i), l);
  411. out += l;
  412. state.msg -= l;
  413. return false;
  414. }
  415. ::memcpy(out, &(*i), len);
  416. out += len;
  417. i += len;
  418. state.state = state_t::START;
  419. } else if (state.state == state_t::READ_RUN) {
  420. size_t shortrun = state.msg & (SHORTRUN_MAX - 1);
  421. if (shortrun) {
  422. state.run = shortrun;
  423. } else {
  424. if (!pop_vlq_uint(i, e, state.run))
  425. return false;
  426. ++i;
  427. }
  428. size_t off = (state.msg >> SHORTRUN_BITS);
  429. size_t run = state.run + MIN_RUN - 1;
  430. unsigned char* outi = out - off;
  431. if (outi >= oute || outi < outb || out + run > oute || out + run < out)
  432. throw std::runtime_error("Malformed data while uncompressing");
  433. if (outi + run < out) {
  434. ::memcpy(out, outi, run);
  435. out += run;
  436. } else {
  437. while (run > 0) {
  438. *out = *outi;
  439. ++out;
  440. ++outi;
  441. --run;
  442. }
  443. }
  444. state.state = state_t::START;
  445. }
  446. }
  447. if (out == oute) {
  448. remaining.assign(i, e);
  449. state.state = state_t::INIT;
  450. return true;
  451. }
  452. return false;
  453. }
  454. /*
  455. * Returns the uncompressed result.
  456. */
  457. std::string& result() {
  458. return ret;
  459. }
  460. };
  461. }
  462. #endif