stream_codec.hpp 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448
  1. /**
  2. * Copyright (C) 2015 Topology LP
  3. * Copyright (C) 2018 Jakob Petsovits
  4. * All rights reserved.
  5. *
  6. * Permission is hereby granted, free of charge, to any person obtaining a copy
  7. * of this software and associated documentation files (the "Software"), to
  8. * deal in the Software without restriction, including without limitation the
  9. * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  10. * sell copies of the Software, and to permit persons to whom the Software is
  11. * furnished to do so, subject to the following conditions:
  12. *
  13. * The above copyright notice and this permission notice shall be included in
  14. * all copies or substantial portions of the Software.
  15. *
  16. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19. * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21. * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22. * IN THE SOFTWARE.
  23. */
  24. #ifndef CPPCODEC_DETAIL_STREAM_CODEC
  25. #define CPPCODEC_DETAIL_STREAM_CODEC
  26. #include <limits>
  27. #include <stdlib.h> // for abort()
  28. #include <stdint.h>
  29. #include "../parse_error.hpp"
  30. #include "config.hpp"
  31. namespace cppcodec {
  32. namespace detail {
  33. using alphabet_index_t = uint_fast16_t;
  34. template <typename Codec, typename CodecVariant>
  35. class stream_codec
  36. {
  37. public:
  38. template <typename Result, typename ResultState> static void encode(
  39. Result& encoded_result, ResultState&, const uint8_t* binary, size_t binary_size);
  40. template <typename Result, typename ResultState> static void decode(
  41. Result& binary_result, ResultState&, const char* encoded, size_t encoded_size);
  42. static constexpr size_t encoded_size(size_t binary_size) noexcept;
  43. static constexpr size_t decoded_max_size(size_t encoded_size) noexcept;
  44. };
  45. template <bool GeneratesPadding> // default for CodecVariant::generates_padding() == false
  46. struct padder {
  47. template <typename CodecVariant, typename Result, typename ResultState, typename EncodedBlockSizeT>
  48. CPPCODEC_ALWAYS_INLINE void operator()(Result&, ResultState&, EncodedBlockSizeT) { }
  49. };
  50. /*
  51. * Почему закомментировано?
  52. template<> // specialization for CodecVariant::generates_padding() == true
  53. struct padder<true> {
  54. template <typename CodecVariant, typename Result, typename ResultState, typename EncodedBlockSizeT>
  55. CPPCODEC_ALWAYS_INLINE void operator()(
  56. Result& encoded, ResultState& state, EncodedBlockSizeT num_padding_characters)
  57. {
  58. for (EncodedBlockSizeT i = 0; i < num_padding_characters; ++i) {
  59. data::put(encoded, state, CodecVariant::padding_symbol());
  60. }
  61. }
  62. };
  63. * Потому что паддинги (знаки '=') в конце мешнейма нам не нужны.
  64. * И вообще, как ты добрался сюда?
  65. * С любовью из 2020, acetone.
  66. */
  67. template <size_t I>
  68. struct enc {
  69. // Block encoding: Go from 0 to (block size - 1), append a symbol for each iteration unconditionally.
  70. template <typename Codec, typename CodecVariant, typename Result, typename ResultState>
  71. static CPPCODEC_ALWAYS_INLINE void block(Result& encoded, ResultState& state, const uint8_t* src)
  72. {
  73. using EncodedBlockSizeT = decltype(Codec::encoded_block_size());
  74. constexpr static const EncodedBlockSizeT SymbolIndex = static_cast<EncodedBlockSizeT>(I - 1);
  75. enc<I - 1>().template block<Codec, CodecVariant>(encoded, state, src);
  76. data::put(encoded, state, CodecVariant::symbol(Codec::template index<SymbolIndex>(src)));
  77. }
  78. // Tail encoding: Go from 0 until (runtime) num_symbols, append a symbol for each iteration.
  79. template <typename Codec, typename CodecVariant, typename Result, typename ResultState,
  80. typename EncodedBlockSizeT = decltype(Codec::encoded_block_size())>
  81. static CPPCODEC_ALWAYS_INLINE void tail(
  82. Result& encoded, ResultState& state, const uint8_t* src, EncodedBlockSizeT num_symbols)
  83. {
  84. constexpr static const EncodedBlockSizeT SymbolIndex = Codec::encoded_block_size() - I;
  85. constexpr static const EncodedBlockSizeT NumSymbols = SymbolIndex + static_cast<EncodedBlockSizeT>(1);
  86. if (num_symbols == NumSymbols) {
  87. data::put(encoded, state, CodecVariant::symbol(Codec::template index_last<SymbolIndex>(src)));
  88. padder<CodecVariant::generates_padding()> pad;
  89. #ifdef _MSC_VER
  90. pad.operator()<CodecVariant>(encoded, state, Codec::encoded_block_size() - NumSymbols);
  91. #else
  92. pad.template operator()<CodecVariant>(encoded, state, Codec::encoded_block_size() - NumSymbols);
  93. #endif
  94. return;
  95. }
  96. data::put(encoded, state, CodecVariant::symbol(Codec::template index<SymbolIndex>(src)));
  97. enc<I - 1>().template tail<Codec, CodecVariant>(encoded, state, src, num_symbols);
  98. }
  99. };
  100. template<> // terminating specialization
  101. struct enc<0> {
  102. template <typename Codec, typename CodecVariant, typename Result, typename ResultState>
  103. static CPPCODEC_ALWAYS_INLINE void block(Result&, ResultState&, const uint8_t*) { }
  104. template <typename Codec, typename CodecVariant, typename Result, typename ResultState,
  105. typename EncodedBlockSizeT = decltype(Codec::encoded_block_size())>
  106. static CPPCODEC_ALWAYS_INLINE void tail(Result&, ResultState&, const uint8_t*, EncodedBlockSizeT)
  107. {
  108. abort(); // Not reached: block() should be called if num_symbols == block size, not tail().
  109. }
  110. };
  111. template <typename Codec, typename CodecVariant>
  112. template <typename Result, typename ResultState>
  113. inline void stream_codec<Codec, CodecVariant>::encode(
  114. Result& encoded_result, ResultState& state,
  115. const uint8_t* src, size_t src_size)
  116. {
  117. using encoder = enc<Codec::encoded_block_size()>;
  118. const uint8_t* src_end = src + src_size;
  119. if (src_size >= Codec::binary_block_size()) {
  120. src_end -= Codec::binary_block_size();
  121. for (; src <= src_end; src += Codec::binary_block_size()) {
  122. encoder::template block<Codec, CodecVariant>(encoded_result, state, src);
  123. }
  124. src_end += Codec::binary_block_size();
  125. }
  126. if (src_end > src) {
  127. auto remaining_src_len = src_end - src;
  128. if (!remaining_src_len || remaining_src_len >= Codec::binary_block_size()) {
  129. abort();
  130. return;
  131. }
  132. auto num_symbols = Codec::num_encoded_tail_symbols(static_cast<uint8_t>(remaining_src_len));
  133. encoder::template tail<Codec, CodecVariant>(encoded_result, state, src, num_symbols);
  134. }
  135. }
  136. // Range & lookup table generation, see
  137. // http://stackoverflow.com/questions/13313980/populate-an-array-using-constexpr-at-compile-time
  138. // and http://cplusadd.blogspot.ca/2013/02/c11-compile-time-lookup-tablearray-with.html
  139. template<unsigned... Is> struct seq {};
  140. template<unsigned N, unsigned... Is>
  141. struct gen_seq : gen_seq<N - 4, N - 4, N - 3, N - 2, N - 1, Is...> {
  142. // Clang up to 3.6 has a limit of 256 for template recursion,
  143. // so pass a few more symbols at once to make it work.
  144. static_assert(N % 4 == 0, "I must be divisible by 4 to eventually end at 0");
  145. };
  146. template<unsigned... Is>
  147. struct gen_seq<0, Is...> : seq<Is...> {};
  148. template <size_t N>
  149. struct lookup_table_t {
  150. alphabet_index_t lookup[N];
  151. static constexpr size_t size = N;
  152. };
  153. template<typename LambdaType, unsigned... Is>
  154. constexpr lookup_table_t<sizeof...(Is)> make_lookup_table(seq<Is...>, LambdaType value_for_index) {
  155. return { { value_for_index(Is)... } };
  156. }
  157. template<unsigned N, typename LambdaType>
  158. constexpr lookup_table_t<N> make_lookup_table(LambdaType evalFunc) {
  159. return make_lookup_table(gen_seq<N>(), evalFunc);
  160. }
  161. // CodecVariant::symbol() provides a symbol for an index.
  162. // Use recursive templates to get the inverse lookup table for fast decoding.
  163. template <typename T>
  164. static CPPCODEC_ALWAYS_INLINE constexpr size_t num_possible_values()
  165. {
  166. return static_cast<size_t>(
  167. static_cast<intmax_t>(std::numeric_limits<T>::max())
  168. - static_cast<intmax_t>(std::numeric_limits<T>::min()) + 1);
  169. }
  170. template <typename CodecVariant, alphabet_index_t InvalidIdx, size_t I>
  171. struct index_if_in_alphabet {
  172. static CPPCODEC_ALWAYS_INLINE constexpr alphabet_index_t for_symbol(char symbol)
  173. {
  174. return (CodecVariant::symbol(
  175. static_cast<alphabet_index_t>(CodecVariant::alphabet_size() - I)) == symbol)
  176. ? static_cast<alphabet_index_t>(CodecVariant::alphabet_size() - I)
  177. : index_if_in_alphabet<CodecVariant, InvalidIdx, I - 1>::for_symbol(symbol);
  178. }
  179. };
  180. template <typename CodecVariant, alphabet_index_t InvalidIdx>
  181. struct index_if_in_alphabet<CodecVariant, InvalidIdx, 0> { // terminating specialization
  182. static CPPCODEC_ALWAYS_INLINE constexpr alphabet_index_t for_symbol(char)
  183. {
  184. return InvalidIdx;
  185. }
  186. };
  187. template <typename CodecVariant, size_t I>
  188. struct padding_searcher {
  189. static CPPCODEC_ALWAYS_INLINE constexpr bool exists_padding_symbol()
  190. {
  191. // Clang up to 3.6 has a limit of 256 for template recursion,
  192. // so pass a few more symbols at once to make it work.
  193. static_assert(I % 4 == 0, "I must be divisible by 4 to eventually end at 0");
  194. return CodecVariant::is_padding_symbol(
  195. static_cast<char>(num_possible_values<char>() - I - 4))
  196. || CodecVariant::is_padding_symbol(
  197. static_cast<char>(num_possible_values<char>() - I - 3))
  198. || CodecVariant::is_padding_symbol(
  199. static_cast<char>(num_possible_values<char>() - I - 2))
  200. || CodecVariant::is_padding_symbol(
  201. static_cast<char>(num_possible_values<char>() - I - 1))
  202. || padding_searcher<CodecVariant, I - 4>::exists_padding_symbol();
  203. }
  204. };
  205. template <typename CodecVariant>
  206. struct padding_searcher<CodecVariant, 0> { // terminating specialization
  207. static CPPCODEC_ALWAYS_INLINE constexpr bool exists_padding_symbol() { return false; }
  208. };
  209. template <typename CodecVariant>
  210. struct alphabet_index_info
  211. {
  212. static constexpr const size_t num_possible_symbols = num_possible_values<char>();
  213. static constexpr const alphabet_index_t padding_idx = 1 << 8;
  214. static constexpr const alphabet_index_t invalid_idx = 1 << 9;
  215. static constexpr const alphabet_index_t eof_idx = 1 << 10;
  216. static constexpr const alphabet_index_t stop_character_mask = static_cast<alphabet_index_t>(~0xFFu);
  217. static constexpr const bool padding_allowed = padding_searcher<
  218. CodecVariant, num_possible_symbols>::exists_padding_symbol();
  219. static CPPCODEC_ALWAYS_INLINE constexpr bool allows_padding()
  220. {
  221. return padding_allowed;
  222. }
  223. static CPPCODEC_ALWAYS_INLINE constexpr bool is_padding(alphabet_index_t idx)
  224. {
  225. return allows_padding() ? (idx == padding_idx) : false;
  226. }
  227. static CPPCODEC_ALWAYS_INLINE constexpr bool is_invalid(alphabet_index_t idx) { return idx == invalid_idx; }
  228. static CPPCODEC_ALWAYS_INLINE constexpr bool is_eof(alphabet_index_t idx) { return idx == eof_idx; }
  229. static CPPCODEC_ALWAYS_INLINE constexpr bool is_stop_character(alphabet_index_t idx)
  230. {
  231. return (idx & stop_character_mask) != 0;
  232. }
  233. private:
  234. static CPPCODEC_ALWAYS_INLINE constexpr
  235. alphabet_index_t valid_index_or(alphabet_index_t a, alphabet_index_t b)
  236. {
  237. return a == invalid_idx ? b : a;
  238. }
  239. using idx_if_in_alphabet = index_if_in_alphabet<
  240. CodecVariant, invalid_idx, CodecVariant::alphabet_size()>;
  241. static CPPCODEC_ALWAYS_INLINE constexpr alphabet_index_t index_of(char symbol)
  242. {
  243. return valid_index_or(idx_if_in_alphabet::for_symbol(symbol),
  244. CodecVariant::is_eof_symbol(symbol) ? eof_idx
  245. : CodecVariant::is_padding_symbol(symbol) ? padding_idx
  246. : invalid_idx);
  247. }
  248. // GCC <= 4.9 has a bug with retaining constexpr when passing a function pointer.
  249. // To get around this, we'll create a callable with operator() and pass that one.
  250. // Unfortunately, MSVC prior to VS 2017 (for MinSizeRel or Release builds)
  251. // chokes on this by compiling the project in 20 minutes instead of seconds.
  252. // So let's define two separate variants and remove the old GCC one whenever we
  253. // decide not to support GCC < 5.0 anymore.
  254. #if defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 5
  255. struct index_at {
  256. CPPCODEC_ALWAYS_INLINE constexpr alphabet_index_t operator()(size_t symbol) const {
  257. return index_of(CodecVariant::normalized_symbol(static_cast<char>(symbol)));
  258. }
  259. };
  260. #else
  261. static CPPCODEC_ALWAYS_INLINE constexpr alphabet_index_t index_at(size_t symbol)
  262. {
  263. return index_of(CodecVariant::normalized_symbol(static_cast<char>(symbol)));
  264. }
  265. #endif
  266. public:
  267. struct lookup {
  268. static CPPCODEC_ALWAYS_INLINE alphabet_index_t for_symbol(char symbol)
  269. {
  270. #if defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 5
  271. static constexpr const auto t = make_lookup_table<num_possible_symbols>(index_at());
  272. #else
  273. static constexpr const auto t = make_lookup_table<num_possible_symbols>(&index_at);
  274. #endif
  275. static_assert(t.size == num_possible_symbols,
  276. "lookup table must cover each possible (character) symbol");
  277. return t.lookup[static_cast<uint8_t>(symbol)];
  278. }
  279. };
  280. };
  281. //
  282. // At long last! The actual decode/encode functions.
  283. template <typename Codec, typename CodecVariant>
  284. template <typename Result, typename ResultState>
  285. inline void stream_codec<Codec, CodecVariant>::decode(
  286. Result& binary_result, ResultState& state,
  287. const char* src_encoded, size_t src_size)
  288. {
  289. using alphabet_index_lookup = typename alphabet_index_info<CodecVariant>::lookup;
  290. const char* src = src_encoded;
  291. const char* src_end = src + src_size;
  292. alphabet_index_t alphabet_indexes[Codec::encoded_block_size()] = {};
  293. alphabet_indexes[0] = alphabet_index_info<CodecVariant>::eof_idx;
  294. alphabet_index_t* const alphabet_index_start = &alphabet_indexes[0];
  295. alphabet_index_t* const alphabet_index_end = &alphabet_indexes[Codec::encoded_block_size()];
  296. alphabet_index_t* alphabet_index_ptr = &alphabet_indexes[0];
  297. while (src < src_end) {
  298. if (CodecVariant::should_ignore(*src)) {
  299. ++src;
  300. continue;
  301. }
  302. *alphabet_index_ptr = alphabet_index_lookup::for_symbol(*src);
  303. if (alphabet_index_info<CodecVariant>::is_stop_character(*alphabet_index_ptr)) {
  304. break;
  305. }
  306. ++src;
  307. ++alphabet_index_ptr;
  308. if (alphabet_index_ptr == alphabet_index_end) {
  309. Codec::decode_block(binary_result, state, alphabet_indexes);
  310. alphabet_index_ptr = alphabet_index_start;
  311. }
  312. }
  313. if (alphabet_index_info<CodecVariant>::is_invalid(*alphabet_index_ptr)) {
  314. throw symbol_error(*src);
  315. }
  316. ++src;
  317. alphabet_index_t* last_index_ptr = alphabet_index_ptr;
  318. if (alphabet_index_info<CodecVariant>::is_padding(*last_index_ptr)) {
  319. if (last_index_ptr == alphabet_index_start) {
  320. // Don't accept padding at the start of a block.
  321. // The encoder should have omitted that padding altogether.
  322. throw padding_error();
  323. }
  324. // We're in here because we just read a (first) padding character. Try to read more.
  325. // Count with last_index_ptr, but store in alphabet_index_ptr so we don't
  326. // overflow the array in case the input data is too long.
  327. ++last_index_ptr;
  328. while (src < src_end) {
  329. *alphabet_index_ptr = alphabet_index_lookup::for_symbol(*(src++));
  330. if (alphabet_index_info<CodecVariant>::is_eof(*alphabet_index_ptr)) {
  331. *alphabet_index_ptr = alphabet_index_info<CodecVariant>::padding_idx;
  332. break;
  333. }
  334. if (!alphabet_index_info<CodecVariant>::is_padding(*alphabet_index_ptr)) {
  335. throw padding_error();
  336. }
  337. ++last_index_ptr;
  338. if (last_index_ptr > alphabet_index_end) {
  339. throw padding_error();
  340. }
  341. }
  342. }
  343. if (last_index_ptr != alphabet_index_start) {
  344. if ((CodecVariant::requires_padding()
  345. || alphabet_index_info<CodecVariant>::is_padding(*alphabet_index_ptr)
  346. ) && last_index_ptr != alphabet_index_end)
  347. {
  348. // If the input is not a multiple of the block size then the input is incorrect.
  349. throw padding_error();
  350. }
  351. if (alphabet_index_ptr >= alphabet_index_end) {
  352. abort();
  353. return;
  354. }
  355. Codec::decode_tail(binary_result, state, alphabet_indexes,
  356. static_cast<size_t>(alphabet_index_ptr - alphabet_index_start));
  357. }
  358. }
  359. template <typename Codec, typename CodecVariant>
  360. inline constexpr size_t stream_codec<Codec, CodecVariant>::encoded_size(size_t binary_size) noexcept
  361. {
  362. using C = Codec;
  363. // constexpr rules make this a lot harder to read than it actually is.
  364. return CodecVariant::generates_padding()
  365. // With padding, the encoded size is a multiple of the encoded block size.
  366. // To calculate that, round the binary size up to multiple of the binary block size,
  367. // then convert to encoded by multiplying with { base32: 8/5, base64: 4/3 }.
  368. ? (binary_size + (C::binary_block_size() - 1)
  369. - ((binary_size + (C::binary_block_size() - 1)) % C::binary_block_size()))
  370. * C::encoded_block_size() / C::binary_block_size()
  371. // No padding: only pad to the next multiple of 5 bits, i.e. at most a single extra byte.
  372. : (binary_size * C::encoded_block_size() / C::binary_block_size())
  373. + (((binary_size * C::encoded_block_size()) % C::binary_block_size()) ? 1 : 0);
  374. }
  375. template <typename Codec, typename CodecVariant>
  376. inline constexpr size_t stream_codec<Codec, CodecVariant>::decoded_max_size(size_t encoded_size) noexcept
  377. {
  378. using C = Codec;
  379. return CodecVariant::requires_padding()
  380. ? (encoded_size / C::encoded_block_size() * C::binary_block_size())
  381. : (encoded_size / C::encoded_block_size() * C::binary_block_size())
  382. + ((encoded_size % C::encoded_block_size())
  383. * C::binary_block_size() / C::encoded_block_size());
  384. }
  385. } // namespace detail
  386. } // namespace cppcodec
  387. #endif // CPPCODEC_DETAIL_STREAM_CODEC