crypto_aesctr_aesni.c 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. #include "cpusupport.h"
  2. #ifdef CPUSUPPORT_X86_AESNI
  3. /**
  4. * CPUSUPPORT CFLAGS: X86_AESNI
  5. */
  6. #include <assert.h>
  7. #include <stdint.h>
  8. #include <string.h>
  9. #include <emmintrin.h>
  10. #include "crypto_aes.h"
  11. #include "crypto_aes_aesni_m128i.h"
  12. #include "sysendian.h"
  13. #include "crypto_aesctr_aesni.h"
  14. /**
  15. * In order to optimize AES-CTR, it is desirable to separate out the handling
  16. * of individual bytes of data vs. the handling of complete (16 byte) blocks.
  17. * The handling of blocks in turn can be optimized further using CPU
  18. * intrinsics, e.g. SSE2 on x86 CPUs; however while the byte-at-once code
  19. * remains the same across platforms it should be inlined into the same (CPU
  20. * feature specific) routines for performance reasons.
  21. *
  22. * In order to allow those generic functions to be inlined into multiple
  23. * functions in separate translation units, we place them into a "shared" C
  24. * file which is included in each of the platform-specific variants.
  25. */
  26. #include "crypto_aesctr_shared.c"
  27. #ifdef BROKEN_MM_LOADU_SI64
  28. #warning Working around compiler bug: _mm_loadu_si64 is missing
  29. #warning Updating to a newer compiler may improve performance
  30. #endif
  31. /**
  32. * load_si64(mem):
  33. * Load an unaligned 64-bit integer from memory into the lowest 64 bits of the
  34. * returned value. The contents of the upper 64 bits is not defined.
  35. */
  36. static inline __m128i
  37. load_si64(const void * mem)
  38. {
  39. #ifdef BROKEN_MM_LOADU_SI64
  40. return (_mm_castpd_si128(_mm_load_sd(mem)));
  41. #else
  42. return (_mm_loadu_si64(mem));
  43. #endif
  44. }
  45. /* Process multiple whole blocks by generating & using a cipherblock. */
  46. static void
  47. crypto_aesctr_aesni_stream_wholeblocks(struct crypto_aesctr * stream,
  48. const uint8_t ** inbuf, uint8_t ** outbuf, size_t * buflen)
  49. {
  50. __m128i bufsse;
  51. __m128i inbufsse;
  52. __m128i nonce_be;
  53. uint8_t block_counter_be_arr[8];
  54. uint64_t block_counter;
  55. size_t num_blocks;
  56. size_t i;
  57. /* Load local variables from stream. */
  58. nonce_be = load_si64(stream->pblk);
  59. block_counter = stream->bytectr / 16;
  60. /* How many blocks should we process? */
  61. num_blocks = (*buflen) / 16;
  62. /*
  63. * This is 'for (i = num_blocks; i > 0; i--)', but ensuring that the
  64. * compiler knows that we will execute the loop at least once.
  65. */
  66. i = num_blocks;
  67. do {
  68. /* Prepare counter. */
  69. be64enc(block_counter_be_arr, block_counter);
  70. /* Encrypt the cipherblock. */
  71. bufsse = load_si64(block_counter_be_arr);
  72. bufsse = _mm_unpacklo_epi64(nonce_be, bufsse);
  73. bufsse = crypto_aes_encrypt_block_aesni_m128i(bufsse,
  74. stream->key);
  75. /* Encrypt the byte(s). */
  76. inbufsse = _mm_loadu_si128((const __m128i *)(*inbuf));
  77. bufsse = _mm_xor_si128(inbufsse, bufsse);
  78. _mm_storeu_si128((__m128i *)(*outbuf), bufsse);
  79. /* Update the positions. */
  80. block_counter++;
  81. *inbuf += 16;
  82. *outbuf += 16;
  83. /* Update the counter. */
  84. i--;
  85. } while (i > 0);
  86. /* Update the overall buffer length. */
  87. *buflen -= 16 * num_blocks;
  88. /* Update variables in stream. */
  89. memcpy(stream->pblk + 8, block_counter_be_arr, 8);
  90. stream->bytectr += 16 * num_blocks;
  91. }
  92. /**
  93. * crypto_aesctr_aesni_stream(stream, inbuf, outbuf, buflen):
  94. * Generate the next ${buflen} bytes of the AES-CTR stream ${stream} and xor
  95. * them with bytes from ${inbuf}, writing the result into ${outbuf}. If the
  96. * buffers ${inbuf} and ${outbuf} overlap, they must be identical.
  97. */
  98. void
  99. crypto_aesctr_aesni_stream(struct crypto_aesctr * stream, const uint8_t * inbuf,
  100. uint8_t * outbuf, size_t buflen)
  101. {
  102. /* Process any bytes before we can process a whole block. */
  103. if (crypto_aesctr_stream_pre_wholeblock(stream, &inbuf, &outbuf,
  104. &buflen))
  105. return;
  106. /* Process whole blocks of 16 bytes. */
  107. if (buflen >= 16)
  108. crypto_aesctr_aesni_stream_wholeblocks(stream, &inbuf,
  109. &outbuf, &buflen);
  110. /* Process any final bytes after finishing all whole blocks. */
  111. crypto_aesctr_stream_post_wholeblock(stream, &inbuf, &outbuf, &buflen);
  112. }
  113. #endif /* CPUSUPPORT_X86_AESNI */