dec_loop.c 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174
  1. // The input consists of six character sets in the Base64 alphabet, which we
  2. // need to map back to the 6-bit values they represent. There are three ranges,
  3. // two singles, and then there's the rest.
  4. //
  5. // # From To Add Characters
  6. // 1 [43] [62] +19 +
  7. // 2 [47] [63] +16 /
  8. // 3 [48..57] [52..61] +4 0..9
  9. // 4 [65..90] [0..25] -65 A..Z
  10. // 5 [97..122] [26..51] -71 a..z
  11. // (6) Everything else => invalid input
  12. //
  13. // We will use lookup tables for character validation and offset computation.
  14. // Remember that 0x2X and 0x0X are the same index for _mm_shuffle_epi8; this
  15. // allows masking with 0x2F instead of 0x0F, and thus saves one constant
  16. // declaration (register and/or memory access).
  17. //
  18. // For offsets:
  19. // Perfect hash for lut = ((src >> 4) & 0x2F) + ((src == 0x2F) ? 0xFF : 0x00)
  20. // 0000 = garbage
  21. // 0001 = /
  22. // 0010 = +
  23. // 0011 = 0-9
  24. // 0100 = A-Z
  25. // 0101 = A-Z
  26. // 0110 = a-z
  27. // 0111 = a-z
  28. // 1000 >= garbage
  29. //
  30. // For validation, here's the table.
  31. // A character is valid if and only if the AND of the 2 lookups equals 0:
  32. //
  33. // hi \ lo 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
  34. // LUT 0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1A 0x1B 0x1B 0x1B 0x1A
  35. //
  36. // 0000 0x10 char NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI
  37. // andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
  38. //
  39. // 0001 0x10 char DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US
  40. // andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
  41. //
  42. // 0010 0x01 char SP ! " # $ % & ' ( ) * + , - . /
  43. // andlut 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01 0x01 0x00
  44. //
  45. // 0011 0x02 char 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
  46. // andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02
  47. //
  48. // 0100 0x04 char @ A B C D E F G H I J K L M N O
  49. // andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  50. //
  51. // 0101 0x08 char P Q R S T U V W X Y Z [ \ ] ^ _
  52. // andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
  53. //
  54. // 0110 0x04 char ` a b c d e f g h i j k l m n o
  55. // andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
  56. // 0111 0x08 char p q r s t u v w x y z { | } ~ DEL
  57. // andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
  58. //
  59. // 1000 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
  60. // 1001 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
  61. // 1010 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
  62. // 1011 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
  63. // 1100 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
  64. // 1101 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
  65. // 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
  66. // 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
  67. static inline int
  68. dec_loop_ssse3_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
  69. {
  70. const __m128i lut_lo = _mm_setr_epi8(
  71. 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
  72. 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
  73. const __m128i lut_hi = _mm_setr_epi8(
  74. 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
  75. 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
  76. const __m128i lut_roll = _mm_setr_epi8(
  77. 0, 16, 19, 4, -65, -65, -71, -71,
  78. 0, 0, 0, 0, 0, 0, 0, 0);
  79. const __m128i mask_2F = _mm_set1_epi8(0x2F);
  80. // Load input:
  81. __m128i str = _mm_loadu_si128((__m128i *) *s);
  82. // Table lookups:
  83. const __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi32(str, 4), mask_2F);
  84. const __m128i lo_nibbles = _mm_and_si128(str, mask_2F);
  85. const __m128i hi = _mm_shuffle_epi8(lut_hi, hi_nibbles);
  86. const __m128i lo = _mm_shuffle_epi8(lut_lo, lo_nibbles);
  87. // Check for invalid input: if any "and" values from lo and hi are not
  88. // zero, fall back on bytewise code to do error checking and reporting:
  89. if (_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_and_si128(lo, hi), _mm_setzero_si128())) != 0) {
  90. return 0;
  91. }
  92. const __m128i eq_2F = _mm_cmpeq_epi8(str, mask_2F);
  93. const __m128i roll = _mm_shuffle_epi8(lut_roll, _mm_add_epi8(eq_2F, hi_nibbles));
  94. // Now simply add the delta values to the input:
  95. str = _mm_add_epi8(str, roll);
  96. // Reshuffle the input to packed 12-byte output format:
  97. str = dec_reshuffle(str);
  98. // Store the output:
  99. _mm_storeu_si128((__m128i *) *o, str);
  100. *s += 16;
  101. *o += 12;
  102. *rounds -= 1;
  103. return 1;
  104. }
  105. static inline void
  106. dec_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
  107. {
  108. if (*slen < 24) {
  109. return;
  110. }
  111. // Process blocks of 16 bytes per round. Because 4 extra zero bytes are
  112. // written after the output, ensure that there will be at least 8 bytes
  113. // of input data left to cover the gap. (6 data bytes and up to two
  114. // end-of-string markers.)
  115. size_t rounds = (*slen - 8) / 16;
  116. *slen -= rounds * 16; // 16 bytes consumed per round
  117. *olen += rounds * 12; // 12 bytes produced per round
  118. do {
  119. if (rounds >= 8) {
  120. if (dec_loop_ssse3_inner(s, o, &rounds) &&
  121. dec_loop_ssse3_inner(s, o, &rounds) &&
  122. dec_loop_ssse3_inner(s, o, &rounds) &&
  123. dec_loop_ssse3_inner(s, o, &rounds) &&
  124. dec_loop_ssse3_inner(s, o, &rounds) &&
  125. dec_loop_ssse3_inner(s, o, &rounds) &&
  126. dec_loop_ssse3_inner(s, o, &rounds) &&
  127. dec_loop_ssse3_inner(s, o, &rounds)) {
  128. continue;
  129. }
  130. break;
  131. }
  132. if (rounds >= 4) {
  133. if (dec_loop_ssse3_inner(s, o, &rounds) &&
  134. dec_loop_ssse3_inner(s, o, &rounds) &&
  135. dec_loop_ssse3_inner(s, o, &rounds) &&
  136. dec_loop_ssse3_inner(s, o, &rounds)) {
  137. continue;
  138. }
  139. break;
  140. }
  141. if (rounds >= 2) {
  142. if (dec_loop_ssse3_inner(s, o, &rounds) &&
  143. dec_loop_ssse3_inner(s, o, &rounds)) {
  144. continue;
  145. }
  146. break;
  147. }
  148. dec_loop_ssse3_inner(s, o, &rounds);
  149. break;
  150. } while (rounds > 0);
  151. // Adjust for any rounds that were skipped:
  152. *slen += rounds * 16;
  153. *olen -= rounds * 12;
  154. }