seqgen.c 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261
  1. /*
  2. * Copyright (c) 2017-2021, Facebook, Inc.
  3. * All rights reserved.
  4. *
  5. * This source code is licensed under both the BSD-style license (found in the
  6. * LICENSE file in the root directory of this source tree) and the GPLv2 (found
  7. * in the COPYING file in the root directory of this source tree).
  8. * You may select, at your option, one of the above-listed licenses.
  9. */
  10. #include "seqgen.h"
  11. #include "mem.h"
  12. #include <string.h>
  13. #define MIN(a, b) ((a) < (b) ? (a) : (b))
  14. static const size_t kMatchBytes = 128;
  15. #define SEQ_rotl32(x,r) ((x << r) | (x >> (32 - r)))
  16. static BYTE SEQ_randByte(unsigned* src)
  17. {
  18. static const U32 prime1 = 2654435761U;
  19. static const U32 prime2 = 2246822519U;
  20. U32 rand32 = *src;
  21. rand32 *= prime1;
  22. rand32 ^= prime2;
  23. rand32 = SEQ_rotl32(rand32, 13);
  24. *src = rand32;
  25. return (BYTE)(rand32 >> 5);
  26. }
  27. SEQ_stream SEQ_initStream(unsigned seed)
  28. {
  29. SEQ_stream stream;
  30. stream.state = 0;
  31. XXH64_reset(&stream.xxh, 0);
  32. stream.seed = seed;
  33. return stream;
  34. }
  35. /* Generates a single guard byte, then match length + 1 of a different byte,
  36. * then another guard byte.
  37. */
  38. static size_t SEQ_gen_matchLength(SEQ_stream* stream, unsigned value,
  39. SEQ_outBuffer* out)
  40. {
  41. typedef enum {
  42. ml_first_byte = 0,
  43. ml_match_bytes,
  44. ml_last_byte,
  45. } ml_state;
  46. BYTE* const ostart = (BYTE*)out->dst;
  47. BYTE* const oend = ostart + out->size;
  48. BYTE* op = ostart + out->pos;
  49. switch ((ml_state)stream->state) {
  50. case ml_first_byte:
  51. /* Generate a single byte and pick a different byte for the match */
  52. if (op >= oend) {
  53. stream->bytesLeft = 1;
  54. break;
  55. }
  56. *op = SEQ_randByte(&stream->seed) & 0xFF;
  57. do {
  58. stream->saved = SEQ_randByte(&stream->seed) & 0xFF;
  59. } while (*op == stream->saved);
  60. ++op;
  61. /* State transition */
  62. stream->state = ml_match_bytes;
  63. stream->bytesLeft = value + 1;
  64. /* fall-through */
  65. case ml_match_bytes: {
  66. /* Copy matchLength + 1 bytes to the output buffer */
  67. size_t const setLength = MIN(stream->bytesLeft, (size_t)(oend - op));
  68. if (setLength > 0) {
  69. memset(op, stream->saved, setLength);
  70. op += setLength;
  71. stream->bytesLeft -= setLength;
  72. }
  73. if (stream->bytesLeft > 0)
  74. break;
  75. /* State transition */
  76. stream->state = ml_last_byte;
  77. }
  78. /* fall-through */
  79. case ml_last_byte:
  80. /* Generate a single byte and pick a different byte for the match */
  81. if (op >= oend) {
  82. stream->bytesLeft = 1;
  83. break;
  84. }
  85. do {
  86. *op = SEQ_randByte(&stream->seed) & 0xFF;
  87. } while (*op == stream->saved);
  88. ++op;
  89. /* State transition */
  90. /* fall-through */
  91. default:
  92. stream->state = 0;
  93. stream->bytesLeft = 0;
  94. break;
  95. }
  96. XXH64_update(&stream->xxh, ostart + out->pos, (op - ostart) - out->pos);
  97. out->pos = op - ostart;
  98. return stream->bytesLeft;
  99. }
  100. /* Saves the current seed then generates kMatchBytes random bytes >= 128.
  101. * Generates literal length - kMatchBytes random bytes < 128.
  102. * Generates another kMatchBytes using the saved seed to generate a match.
  103. * This way the match is easy to find for the compressors.
  104. */
  105. static size_t SEQ_gen_litLength(SEQ_stream* stream, unsigned value, SEQ_outBuffer* out)
  106. {
  107. typedef enum {
  108. ll_start = 0,
  109. ll_run_bytes,
  110. ll_literals,
  111. ll_run_match,
  112. } ll_state;
  113. BYTE* const ostart = (BYTE*)out->dst;
  114. BYTE* const oend = ostart + out->size;
  115. BYTE* op = ostart + out->pos;
  116. switch ((ll_state)stream->state) {
  117. case ll_start:
  118. stream->state = ll_run_bytes;
  119. stream->saved = stream->seed;
  120. stream->bytesLeft = MIN(kMatchBytes, value);
  121. /* fall-through */
  122. case ll_run_bytes:
  123. while (stream->bytesLeft > 0 && op < oend) {
  124. *op++ = SEQ_randByte(&stream->seed) | 0x80;
  125. --stream->bytesLeft;
  126. }
  127. if (stream->bytesLeft > 0)
  128. break;
  129. /* State transition */
  130. stream->state = ll_literals;
  131. stream->bytesLeft = value - MIN(kMatchBytes, value);
  132. /* fall-through */
  133. case ll_literals:
  134. while (stream->bytesLeft > 0 && op < oend) {
  135. *op++ = SEQ_randByte(&stream->seed) & 0x7F;
  136. --stream->bytesLeft;
  137. }
  138. if (stream->bytesLeft > 0)
  139. break;
  140. /* State transition */
  141. stream->state = ll_run_match;
  142. stream->bytesLeft = MIN(kMatchBytes, value);
  143. /* fall-through */
  144. case ll_run_match: {
  145. while (stream->bytesLeft > 0 && op < oend) {
  146. *op++ = SEQ_randByte(&stream->saved) | 0x80;
  147. --stream->bytesLeft;
  148. }
  149. if (stream->bytesLeft > 0)
  150. break;
  151. }
  152. /* fall-through */
  153. default:
  154. stream->state = 0;
  155. stream->bytesLeft = 0;
  156. break;
  157. }
  158. XXH64_update(&stream->xxh, ostart + out->pos, (op - ostart) - out->pos);
  159. out->pos = op - ostart;
  160. return stream->bytesLeft;
  161. }
  162. /* Saves the current seed then generates kMatchBytes random bytes >= 128.
  163. * Generates offset - kMatchBytes of zeros to get a large offset without
  164. * polluting the hash tables.
  165. * Generates another kMatchBytes using the saved seed to generate a with the
  166. * required offset.
  167. */
  168. static size_t SEQ_gen_offset(SEQ_stream* stream, unsigned value, SEQ_outBuffer* out)
  169. {
  170. typedef enum {
  171. of_start = 0,
  172. of_run_bytes,
  173. of_offset,
  174. of_run_match,
  175. } of_state;
  176. BYTE* const ostart = (BYTE*)out->dst;
  177. BYTE* const oend = ostart + out->size;
  178. BYTE* op = ostart + out->pos;
  179. switch ((of_state)stream->state) {
  180. case of_start:
  181. stream->state = of_run_bytes;
  182. stream->saved = stream->seed;
  183. stream->bytesLeft = MIN(value, kMatchBytes);
  184. /* fall-through */
  185. case of_run_bytes: {
  186. while (stream->bytesLeft > 0 && op < oend) {
  187. *op++ = SEQ_randByte(&stream->seed) | 0x80;
  188. --stream->bytesLeft;
  189. }
  190. if (stream->bytesLeft > 0)
  191. break;
  192. /* State transition */
  193. stream->state = of_offset;
  194. stream->bytesLeft = value - MIN(value, kMatchBytes);
  195. }
  196. /* fall-through */
  197. case of_offset: {
  198. /* Copy matchLength + 1 bytes to the output buffer */
  199. size_t const setLength = MIN(stream->bytesLeft, (size_t)(oend - op));
  200. if (setLength > 0) {
  201. memset(op, 0, setLength);
  202. op += setLength;
  203. stream->bytesLeft -= setLength;
  204. }
  205. if (stream->bytesLeft > 0)
  206. break;
  207. /* State transition */
  208. stream->state = of_run_match;
  209. stream->bytesLeft = MIN(value, kMatchBytes);
  210. }
  211. /* fall-through */
  212. case of_run_match: {
  213. while (stream->bytesLeft > 0 && op < oend) {
  214. *op++ = SEQ_randByte(&stream->saved) | 0x80;
  215. --stream->bytesLeft;
  216. }
  217. if (stream->bytesLeft > 0)
  218. break;
  219. }
  220. /* fall-through */
  221. default:
  222. stream->state = 0;
  223. stream->bytesLeft = 0;
  224. break;
  225. }
  226. XXH64_update(&stream->xxh, ostart + out->pos, (op - ostart) - out->pos);
  227. out->pos = op - ostart;
  228. return stream->bytesLeft;
  229. }
  230. /* Returns the number of bytes left to generate.
  231. * Must pass the same type/value until it returns 0.
  232. */
  233. size_t SEQ_gen(SEQ_stream* stream, SEQ_gen_type type, unsigned value, SEQ_outBuffer* out)
  234. {
  235. switch (type) {
  236. case SEQ_gen_ml: return SEQ_gen_matchLength(stream, value, out);
  237. case SEQ_gen_ll: return SEQ_gen_litLength(stream, value, out);
  238. case SEQ_gen_of: return SEQ_gen_offset(stream, value, out);
  239. case SEQ_gen_max: /* fall-through */
  240. default: return 0;
  241. }
  242. }
  243. /* Returns the xxhash of the data produced so far */
  244. XXH64_hash_t SEQ_digest(SEQ_stream const* stream)
  245. {
  246. return XXH64_digest(&stream->xxh);
  247. }