/* cipher-gcm-armv7-neon.S - ARM/NEON accelerated GHASH
 * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include <config.h>

#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
    defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_NEON)

.syntax unified
.fpu neon
.arm

.text

#ifdef __PIC__
# define GET_DATA_POINTER(reg, name, rtmp) \
        ldr reg, 1f; \
        ldr rtmp, 2f; \
        b 3f; \
    1:  .word _GLOBAL_OFFSET_TABLE_-(3f+8); \
    2:  .word name(GOT); \
    3:  add reg, pc, reg; \
        ldr reg, [reg, rtmp];
#else
# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
#endif
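
/* A rough C-level sketch of what the __PIC__ variant of GET_DATA_POINTER
 * computes (names below are illustrative only, not part of the build):
 *
 *   got_base = pc + (&_GLOBAL_OFFSET_TABLE_ - (label3 + 8));
 *   reg      = *(got_base + name@GOT);   // symbol address fetched via the GOT
 *
 * i.e. the address is resolved through the GOT so the code stays position
 * independent.  The non-PIC variant simply loads the literal address with
 * 'ldr reg, =name'.
 */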
/* Constants */

.align 4
gcry_gcm_reduction_constant:
.Lrconst64:
  .quad 0xc200000000000000

/* Register macros */

#define rhash q0
#define rhash_l d0
#define rhash_h d1

#define rh1 q1
#define rh1_l d2
#define rh1_h d3

#define rbuf q2
#define rbuf_l d4
#define rbuf_h d5

#define rbuf1 q3
#define rbuf1_l d6
#define rbuf1_h d7

#define t0q q4
#define t0l d8
#define t0h d9

#define t1q q5
#define t1l d10
#define t1h d11

#define t2q q6
#define t2l d12
#define t2h d13

#define t3q q7
#define t3l d14
#define t3h d15

/* q8 */
#define k16 d16
#define k32 d17
/* q9 */
#define k48 d18

#define k0 q10

#define rr0 q11
#define rr0_l d22
#define rr0_h d23

#define rr1 q12
#define rr1_l d24
#define rr1_h d25

#define rt0 q13
#define rt0_l d26
#define rt0_h d27

#define rt1 q14
#define rt1_l d28
#define rt1_h d29

#define rrconst q15
#define rrconst_l d30
#define rrconst_h d31
/* Macro for 64x64=>128 carry-less multiplication using vmull.p8 instruction.
 *
 * From "Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R. Fast Software
 * Polynomial Multiplication on ARM Processors using the NEON Engine. The
 * Second International Workshop on Modern Cryptography and Security
 * Engineering — MoCrySEn, 2013". */
#define vmull_p64(rq, rl, rh, ad, bd) \
        vext.8 t0l, ad, ad, #1; \
        vmull.p8 t0q, t0l, bd; \
        vext.8 rl, bd, bd, #1; \
        vmull.p8 rq, ad, rl; \
        vext.8 t1l, ad, ad, #2; \
        vmull.p8 t1q, t1l, bd; \
        vext.8 t3l, bd, bd, #2; \
        vmull.p8 t3q, ad, t3l; \
        vext.8 t2l, ad, ad, #3; \
        vmull.p8 t2q, t2l, bd; \
        veor t0q, t0q, rq; \
        vext.8 rl, bd, bd, #3; \
        vmull.p8 rq, ad, rl; \
        veor t1q, t1q, t3q; \
        vext.8 t3l, bd, bd, #4; \
        vmull.p8 t3q, ad, t3l; \
        veor t0l, t0l, t0h; \
        vand t0h, t0h, k48; \
        veor t1l, t1l, t1h; \
        vand t1h, t1h, k32; \
        veor t2q, t2q, rq; \
        veor t0l, t0l, t0h; \
        veor t1l, t1l, t1h; \
        veor t2l, t2l, t2h; \
        vand t2h, t2h, k16; \
        veor t3l, t3l, t3h; \
        vmov.i64 t3h, #0; \
        vext.8 t0q, t0q, t0q, #15; \
        veor t2l, t2l, t2h; \
        vext.8 t1q, t1q, t1q, #14; \
        vmull.p8 rq, ad, bd; \
        vext.8 t2q, t2q, t2q, #13; \
        vext.8 t3q, t3q, t3q, #12; \
        veor t0q, t0q, t1q; \
        veor t2q, t2q, t3q; \
        veor rq, rq, t0q; \
        veor rq, rq, t2q;
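
/* For reference, the operation synthesized above is a plain carry-less
 * (polynomial) multiply of two 64-bit values.  A bit-by-bit sketch in C,
 * illustrative only and not part of the build (assumes <stdint.h>):
 *
 *   void clmul64(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
 *   {
 *     uint64_t l = 0, h = 0;
 *     for (int i = 0; i < 64; i++)
 *       if ((a >> i) & 1)
 *         {
 *           l ^= b << i;               // low 64 bits of b*x^i
 *           if (i)
 *             h ^= b >> (64 - i);      // bits that spill into the high half
 *         }
 *     *lo = l; *hi = h;
 *   }
 *
 * ARMv7 NEON has no 64x64 polynomial multiply, so the macro builds the same
 * result out of eight vmull.p8 instructions (8x8=>16-bit polynomial
 * multiplies) plus the byte rotations and masks above.
 */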
/* GHASH macros.
 *
 * See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in
 * Cryptology — CT-RSA 2015" for details.
 */

/* Input: 'a' and 'b', Output: 'r0:r1' (low 128-bits in r0, high in r1)
 * Note: 'r1' may be 'a' or 'b', 'r0' must not be either 'a' or 'b'.
 */
#define PMUL_128x128(r0, r1, a, b, t1, t2, interleave_op) \
        veor t1##_h, b##_l, b##_h; \
        veor t1##_l, a##_l, a##_h; \
        vmull_p64( r0, r0##_l, r0##_h, a##_l, b##_l ); \
        vmull_p64( r1, r1##_l, r1##_h, a##_h, b##_h ); \
        vmull_p64( t2, t2##_h, t2##_l, t1##_h, t1##_l ); \
        interleave_op; \
        veor t2, r0; \
        veor t2, r1; \
        veor r0##_h, t2##_l; \
        veor r1##_l, t2##_h;
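
/* A sketch of the algebra behind PMUL_128x128: it is the Karatsuba-style
 * 128x128 carry-less multiply built from three 64x64 multiplies (with '*'
 * denoting carry-less multiplication and '^' xor):
 *
 *   lo  = a_l * b_l
 *   hi  = a_h * b_h
 *   mid = (a_l ^ a_h) * (b_l ^ b_h) ^ lo ^ hi
 *
 *   result = hi:lo ^ (mid << 64)
 *
 * The final four veor instructions fold the middle term into r0:r1.
 */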
/* Reduction using Xor and Shift.
 * Input: 'r0:r1', Output: 'a'
 *
 * See "Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication
 * Instruction and its Usage for Computing the GCM Mode" for details.
 */
#define REDUCTION(a, r0, r1, t, interleave_op) \
        vshl.u32 t0q, r0, #31; \
        vshl.u32 t1q, r0, #30; \
        vshl.u32 t2q, r0, #25; \
        veor t0q, t0q, t1q; \
        veor t0q, t0q, t2q; \
        vext.8 t, t0q, k0, #4; \
        vext.8 t0q, k0, t0q, #(16-12); \
        veor r0, r0, t0q; \
        interleave_op; \
        vshr.u32 t0q, r0, #1; \
        vshr.u32 t1q, r0, #2; \
        vshr.u32 t2q, r0, #7; \
        veor t0q, t0q, t1q; \
        veor t0q, t0q, t2q; \
        veor t0q, t0q, t; \
        veor r0, r0, t0q; \
        veor a, r0, r1;
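
/* A rough sketch of what REDUCTION does: GHASH works in GF(2^128) modulo
 * the polynomial x^128 + x^7 + x^2 + x + 1, so the 256-bit product r1:r0
 * must be folded back into 128 bits by multiplying the part that overflows
 * x^128 by (x^7 + x^2 + x + 1) and xoring it back in.  In the bit-reflected
 * representation used here that folding becomes the shift-and-xor pattern
 * above: the 1/2/7-bit lane shifts, together with the 31/30/25-bit shifts
 * and the vext byte rotations that carry bits across 32-bit lane
 * boundaries, implement the two folding steps of the Gueron/Kounavis
 * method, and the final veor adds in the high half r1.
 */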
/* Helpers for passing an instruction containing commas as a single macro
 * argument; '__' expands to nothing (an empty interleave op). */
#define _(...) __VA_ARGS__
#define __ _()

/* Other functional macros */
#define CLEAR_REG(reg) vmov.i8 reg, #0;
/*
 * unsigned int _gcry_ghash_armv7_neon (void *gcm_key, byte *result,
 *                                      const byte *buf, size_t nblocks);
 */
.align 3
.globl _gcry_ghash_armv7_neon
.type _gcry_ghash_armv7_neon,%function;
_gcry_ghash_armv7_neon:
  /* input:
   *    r0: gcm_key
   *    r1: result/hash
   *    r2: buf
   *    r3: nblocks
   */
  push {r4-r6, lr}

  cmp r3, #0
  beq .Ldo_nothing

  vpush {q4-q7}

  vld1.64 {rhash}, [r1]
  vld1.64 {rh1}, [r0]

  vrev64.8 rhash, rhash /* byte-swap */

  vmov.i64 k0, #0x0
  vmov.i64 k16, #0xffff
  vmov.i64 k32, #0xffffffff
  vmov.i64 k48, #0xffffffffffff

  vext.8 rhash, rhash, rhash, #8

  /* Handle remaining blocks. */

  vld1.64 {rbuf}, [r2]!
  subs r3, r3, #1

  vrev64.8 rbuf, rbuf /* byte-swap */
  vext.8 rbuf, rbuf, rbuf, #8

  veor rhash, rhash, rbuf

  beq .Lend

.Loop:
  vld1.64 {rbuf}, [r2]!
  PMUL_128x128(rr0, rr1, rhash, rh1, rt0, rt1, _(vrev64.8 rbuf, rbuf))
  REDUCTION(rhash, rr0, rr1, rt0, _(vext.8 rbuf, rbuf, rbuf, #8))
  subs r3, r3, #1
  veor rhash, rhash, rbuf
  bne .Loop

.Lend:
  PMUL_128x128(rr0, rr1, rhash, rh1, rt0, rt1, _(CLEAR_REG(rbuf)))
  REDUCTION(rhash, rr0, rr1, rt0, _(CLEAR_REG(rh1)))

.Ldone:
  CLEAR_REG(rr1)
  vrev64.8 rhash, rhash /* byte-swap */
  CLEAR_REG(rt0)
  CLEAR_REG(rr0)
  vext.8 rhash, rhash, rhash, #8
  CLEAR_REG(rt1)
  CLEAR_REG(t0q)
  CLEAR_REG(t1q)
  CLEAR_REG(t2q)
  CLEAR_REG(t3q)
  vst1.64 {rhash}, [r1]
  CLEAR_REG(rhash)

  vpop {q4-q7}

.Ldo_nothing:
  mov r0, #0
  pop {r4-r6, pc}
.size _gcry_ghash_armv7_neon,.-_gcry_ghash_armv7_neon;
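
/* What the function computes, as a sketch (Y is the running hash at
 * 'result', B_i the i-th 16-byte block of 'buf', H the hashed key stored
 * at 'gcm_key', and '*' multiplication in GF(2^128)):
 *
 *   for (i = 0; i < nblocks; i++)
 *     Y = (Y ^ B_i) * H;
 *
 * The loop above interleaves the byte-swap of the next block with the
 * multiply/reduce of the current one via the 'interleave_op' macro
 * argument, then clears all used NEON registers before returning.
 */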
/*
 * void _gcry_ghash_setup_armv7_neon (void *gcm_key);
 */
.align 3
.globl _gcry_ghash_setup_armv7_neon
.type _gcry_ghash_setup_armv7_neon,%function;
_gcry_ghash_setup_armv7_neon:
  /* input:
   *    r0: gcm_key
   */

  vpush {q4-q7}

  GET_DATA_POINTER(r2, .Lrconst64, r3)

  vld1.64 {rrconst_h}, [r2]

#define GCM_LSH_1(r_out, ia, ib, const_d, oa, ob, ma) \
        /* H <<< 1 */ \
        vshr.s64 ma, ib, #63; \
        vshr.u64 oa, ib, #63; \
        vshr.u64 ob, ia, #63; \
        vand ma, const_d; \
        vshl.u64 ib, ib, #1; \
        vshl.u64 ia, ia, #1; \
        vorr ob, ib; \
        vorr oa, ia; \
        veor ob, ma; \
        vst1.64 {oa, ob}, [r_out]

  vld1.64 {rhash}, [r0]
  vrev64.8 rhash, rhash /* byte-swap */
  vext.8 rhash, rhash, rhash, #8

  vmov rbuf1, rhash
  GCM_LSH_1(r0, rhash_l, rhash_h, rrconst_h, rh1_l, rh1_h, rt1_l) /* H<<<1 */

  CLEAR_REG(rh1)
  CLEAR_REG(rhash)
  CLEAR_REG(rbuf1)
  CLEAR_REG(rrconst)
  vpop {q4-q7}
  bx lr
.size _gcry_ghash_setup_armv7_neon,.-_gcry_ghash_setup_armv7_neon;
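
/* Note on the setup: GHASH is specified in a bit-reflected representation.
 * Pre-rotating H left by one bit here (GCM_LSH_1, with the 0xc2..00
 * constant from .Lrconst64 supplying the reduction term when the top bit
 * wraps around) lets the ghash routine above skip a per-block one-bit
 * shift of each product, a common trick in carry-less-multiplication
 * GHASH implementations.
 */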
#endif