/* cipher-gcm-armv8-aarch32-ce.S - ARM/CE accelerated GHASH
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
#include <config.h>

#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
    defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO)

.syntax unified
.arch armv8-a
.fpu crypto-neon-fp-armv8
.arm

.text

#ifdef __PIC__
# define GET_DATA_POINTER(reg, name, rtmp) \
    ldr reg, 1f; \
    ldr rtmp, 2f; \
    b 3f; \
  1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \
  2: .word name(GOT); \
  3: add reg, pc, reg; \
    ldr reg, [reg, rtmp];
#else
# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
#endif
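
/* In the PIC case, GET_DATA_POINTER loads the offset from label 3 (plus the
 * 8-byte A32 PC read-ahead) to the GOT and the symbol's GOT offset, forms the
 * GOT base with 'add reg, pc, reg' at label 3, and then loads the final
 * address from the GOT entry.  In the non-PIC case a literal-pool load is
 * sufficient. */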

/* Constants */

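/* 0xc200000000000000 below is the folding constant used by the REDUCTION
 * macro; it corresponds to the GHASH field polynomial
 * x^128 + x^7 + x^2 + x + 1 in the bit-reflected representation that this
 * implementation works in. */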
.align 4
gcry_gcm_reduction_constant:
.Lrconst64:
  .quad 0xc200000000000000

/* Register macros */

#define rhash q0
#define rhash_l d0
#define rhash_h d1

#define rh1 q1
#define rh1_l d2
#define rh1_h d3

#define rbuf q2
#define rbuf_l d4
#define rbuf_h d5

#define rbuf1 q3
#define rbuf1_l d6
#define rbuf1_h d7

#define rbuf2 q4
#define rbuf2_l d8
#define rbuf2_h d9

#define rbuf3 q5
#define rbuf3_l d10
#define rbuf3_h d11

#define rh2 q6
#define rh2_l d12
#define rh2_h d13

#define rh3 q7
#define rh3_l d14
#define rh3_h d15

#define rh4 q8
#define rh4_l d16
#define rh4_h d17

#define rr2 q9
#define rr2_l d18
#define rr2_h d19

#define rr3 q10
#define rr3_l d20
#define rr3_h d21

#define rr0 q11
#define rr0_l d22
#define rr0_h d23

#define rr1 q12
#define rr1_l d24
#define rr1_h d25

#define rt0 q13
#define rt0_l d26
#define rt0_h d27

#define rt1 q14
#define rt1_l d28
#define rt1_h d29

#define rrconst q15
#define rrconst_l d30
#define rrconst_h d31

/* GHASH macros */

/* See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in
 * Cryptology — CT-RSA 2015" for details.
 */

/* Input: 'a' and 'b', Output: 'r0:r1' (low 128-bits in r0, high in r1)
 * Note: 'r1' may be 'a' or 'b', 'r0' must not be either 'a' or 'b'.
 */
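/* PMUL_128x128 is a Karatsuba 128x128->256-bit carry-less multiply built
 * from three vmull.p64 (64x64->128) multiplications:
 *   lo  = a_l * b_l
 *   hi  = a_h * b_h
 *   mid = (a_l ^ a_h) * (b_l ^ b_h) ^ lo ^ hi   (= a_l*b_h ^ a_h*b_l)
 * The middle 128 bits are then XORed across the upper half of lo and the
 * lower half of hi.  'interleave_op' lets the caller schedule unrelated
 * instructions into the multiply latency. */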
#define PMUL_128x128(r0, r1, a, b, t, interleave_op) \
  veor t##_h, b##_l, b##_h; \
  veor t##_l, a##_l, a##_h; \
  vmull.p64 r0, a##_l, b##_l; \
  vmull.p64 r1, a##_h, b##_h; \
  vmull.p64 t, t##_h, t##_l; \
  interleave_op; \
  veor t, r0; \
  veor t, r1; \
  veor r0##_h, t##_l; \
  veor r1##_l, t##_h;

/* Input: 'aA' and 'bA', Output: 'r0A:r1A' (low 128-bits in r0A, high in r1A)
 * Note: 'r1A' may be 'aA' or 'bA', 'r0A' must not be either 'aA' or 'bA'.
 * Input: 'aB' and 'bB', Output: 'r0B:r1B' (low 128-bits in r0B, high in r1B)
 * Note: 'r1B' may be 'aB' or 'bB', 'r0B' must not be either 'aB' or 'bB'.
 */
#define PMUL_128x128_2(r0A, r1A, aA, bA, r0B, r1B, aB, bB, tA, tB, interleave_op) \
  veor tA##_h, bA##_l, bA##_h; \
  veor tA##_l, aA##_l, aA##_h; \
  veor tB##_h, bB##_l, bB##_h; \
  veor tB##_l, aB##_l, aB##_h; \
  vmull.p64 r0A, aA##_l, bA##_l; \
  vmull.p64 r1A, aA##_h, bA##_h; \
  vmull.p64 tA, tA##_h, tA##_l; \
  vmull.p64 r0B, aB##_l, bB##_l; \
  vmull.p64 r1B, aB##_h, bB##_h; \
  vmull.p64 tB, tB##_h, tB##_l; \
  interleave_op; \
  veor tA, r0A; \
  veor tA, r1A; \
  veor tB, r0B; \
  veor tB, r1B; \
  veor r0A##_h, tA##_l; \
  veor r1A##_l, tA##_h; \
  veor r0B##_h, tB##_l; \
  veor r1B##_l, tB##_h;
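
/* REDUCTION folds the 256-bit product 'r0:r1' modulo the field polynomial
 * back into a single 128-bit value 'a', using two vmull.p64 multiplications
 * by the reduction constant followed by XOR folds; 'interleave_op' again
 * lets the caller hide multiply latency with independent instructions. */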
/* Input: 'r0:r1', Output: 'a' */
#define REDUCTION(a, r0, r1, rconst, t, interleave_op) \
  vmull.p64 t, r0##_l, rconst; \
  veor r0##_h, t##_l; \
  veor r1##_l, t##_h; \
  interleave_op; \
  vmull.p64 t, r0##_h, rconst; \
  veor r1, t; \
  veor a, r0, r1;

/* '_(...)' passes a multi-instruction (comma-containing) sequence as a single
 * macro argument; '__' expands to nothing and serves as an empty
 * interleave_op. */
#define _(...) __VA_ARGS__
#define __ _()

/* Other functional macros */

/* Clear a vector register (also wipes any sensitive value it held). */
#define CLEAR_REG(reg) vmov.i8 reg, #0;

/*
 * unsigned int _gcry_ghash_armv8_ce_pmull (void *gcm_key, byte *result,
 *                                          const byte *buf, size_t nblocks,
 *                                          void *gcm_table);
 */
.align 3
.globl _gcry_ghash_armv8_ce_pmull
.type _gcry_ghash_armv8_ce_pmull,%function;
_gcry_ghash_armv8_ce_pmull:
  /* input:
   *    r0: gcm_key
   *    r1: result/hash
   *    r2: buf
   *    r3: nblocks
   *    %st+0: gcm_table
   */
  push {r4-r6, lr}

  cmp r3, #0
  beq .Ldo_nothing

  GET_DATA_POINTER(r4, .Lrconst64, lr)

  vld1.64 {rhash}, [r1]
  vld1.64 {rh1}, [r0]

  vrev64.8 rhash, rhash /* byte-swap */
  vld1.64 {rrconst_h}, [r4]
  vext.8 rhash, rhash, rhash, #8

  cmp r3, #4
  blo .Less_than_4

  /* Bulk processing of 4 blocks per loop iteration. */

  ldr r5, [sp, #(4*4)];
  add r6, r5, #32

  vpush {q4-q7}

  vld1.64 {rh2-rh3}, [r5]
  vld1.64 {rh4}, [r6]

  vld1.64 {rbuf-rbuf1}, [r2]!
  sub r3, r3, #4
  vld1.64 {rbuf2-rbuf3}, [r2]!

  cmp r3, #4
  vrev64.8 rbuf, rbuf /* byte-swap */
  vrev64.8 rbuf1, rbuf1 /* byte-swap */
  vrev64.8 rbuf2, rbuf2 /* byte-swap */
  vrev64.8 rbuf3, rbuf3 /* byte-swap */
  vext.8 rbuf, rbuf, rbuf, #8
  vext.8 rbuf1, rbuf1, rbuf1, #8
  vext.8 rbuf2, rbuf2, rbuf2, #8
  vext.8 rbuf3, rbuf3, rbuf3, #8
  veor rhash, rhash, rbuf /* in0 ^ hash */

  blo .Lend_4
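
  /* Each iteration processes four blocks with a single REDUCTION: the four
   * products (in0 ^ hash) * H⁴, in1 * H³, in2 * H² and in3 * H¹ are computed
   * with PMUL_128x128_2 and XORed together before reduction (aggregated
   * reduction, cf. Gouvêa & López).  Loads and byte-swaps of the next four
   * blocks are interleaved into the multiplies. */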
.Loop_4:
  /* (in0 ^ hash) * H⁴ => rr2:rr3 */
  /* (in1) * H³ => rr0:rr1 */
  PMUL_128x128_2(rr0, rr1, rbuf1, rh3, rr2, rr3, rhash, rh4, rt1, rt0, __)

  vld1.64 {rbuf-rbuf1}, [r2]!
  sub r3, r3, #4
  veor rr0, rr0, rr2
  veor rr1, rr1, rr3

  /* (in2) * H² => rr2:rr3 */
  /* (in3) * H¹ => rhash:rbuf3 */
  PMUL_128x128_2(rr2, rr3, rbuf2, rh2, rhash, rbuf3, rbuf3, rh1, rt0, rt1,
                 _(vrev64.8 rbuf, rbuf))

  vld1.64 {rbuf2}, [r2]!

  vrev64.8 rbuf1, rbuf1
  veor rr0, rr0, rr2
  veor rr1, rr1, rr3

  cmp r3, #4
  vext.8 rbuf, rbuf, rbuf, #8
  vext.8 rbuf1, rbuf1, rbuf1, #8

  veor rr0, rr0, rhash
  veor rr1, rr1, rbuf3

  vld1.64 {rbuf3}, [r2]!

  REDUCTION(rhash, rr0, rr1, rrconst_h, rt1,
            _(vrev64.8 rbuf2, rbuf2;
              vrev64.8 rbuf3, rbuf3))

  vext.8 rbuf2, rbuf2, rbuf2, #8
  vext.8 rbuf3, rbuf3, rbuf3, #8
  veor rhash, rhash, rbuf /* in0 ^ hash */

  bhs .Loop_4

.Lend_4:
  /* (in0 ^ hash) * H⁴ => rr2:rr3 */
  /* (in1) * H³ => rr0:rr1 */
  PMUL_128x128_2(rr0, rr1, rbuf1, rh3, rr2, rr3, rhash, rh4, rt1, rt0, __)

  /* (in2) * H² => rhash:rbuf */
  /* (in3) * H¹ => rbuf1:rbuf2 */
  PMUL_128x128_2(rhash, rbuf, rbuf2, rh2, rbuf1, rbuf2, rbuf3, rh1, rt0, rt1,
                 _(veor rr0, rr0, rr2;
                   veor rr1, rr1, rr3))

  veor rr0, rr0, rhash
  veor rr1, rr1, rbuf
  veor rr0, rr0, rbuf1
  veor rr1, rr1, rbuf2

  REDUCTION(rhash, rr0, rr1, rrconst_h, rt1,
            _(CLEAR_REG(rr2);
              CLEAR_REG(rr3);
              CLEAR_REG(rbuf1);
              CLEAR_REG(rbuf2);
              CLEAR_REG(rbuf3);
              CLEAR_REG(rh2);
              CLEAR_REG(rh3);
              CLEAR_REG(rh4)))

  vpop {q4-q7}

  cmp r3, #0
  beq .Ldone

.Less_than_4:
  /* Handle remaining blocks. */

  vld1.64 {rbuf}, [r2]!
  subs r3, r3, #1

  vrev64.8 rbuf, rbuf /* byte-swap */
  vext.8 rbuf, rbuf, rbuf, #8

  veor rhash, rhash, rbuf

  beq .Lend

.Loop:
  vld1.64 {rbuf}, [r2]!
  subs r3, r3, #1
  PMUL_128x128(rr0, rr1, rhash, rh1, rt0, _(vrev64.8 rbuf, rbuf))
  REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, _(vext.8 rbuf, rbuf, rbuf, #8))
  veor rhash, rhash, rbuf

  bne .Loop

.Lend:
  PMUL_128x128(rr0, rr1, rhash, rh1, rt0, _(CLEAR_REG(rbuf)))
  REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, _(CLEAR_REG(rh1)))
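
  /* Wipe the remaining vector registers so that neither key material nor
   * intermediate values are left in the NEON register file, then write back
   * the byte-swapped result. */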
.Ldone:
  CLEAR_REG(rr1)
  vrev64.8 rhash, rhash /* byte-swap */
  CLEAR_REG(rt0)
  CLEAR_REG(rr0)
  vext.8 rhash, rhash, rhash, #8
  CLEAR_REG(rt1)
  vst1.64 {rhash}, [r1]
  CLEAR_REG(rhash)

.Ldo_nothing:
  mov r0, #0
  pop {r4-r6, pc}
.size _gcry_ghash_armv8_ce_pmull,.-_gcry_ghash_armv8_ce_pmull;

/*
 * unsigned int _gcry_polyval_armv8_ce_pmull (void *gcm_key, byte *result,
 *                                            const byte *buf, size_t nblocks,
 *                                            void *gcm_table);
 */
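/* Same structure as _gcry_ghash_armv8_ce_pmull above; the only difference is
 * that the input blocks are not byte-swapped, matching the little-endian
 * block convention of POLYVAL (as used by GCM-SIV). */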
.align 3
.globl _gcry_polyval_armv8_ce_pmull
.type _gcry_polyval_armv8_ce_pmull,%function;
_gcry_polyval_armv8_ce_pmull:
  /* input:
   *    r0: gcm_key
   *    r1: result/hash
   *    r2: buf
   *    r3: nblocks
   *    %st+0: gcm_table
   */
  push {r4-r6, lr}

  cmp r3, #0
  beq .Lpolyval_do_nothing

  GET_DATA_POINTER(r4, .Lrconst64, lr)

  vld1.64 {rhash}, [r1]
  vld1.64 {rh1}, [r0]

  vrev64.8 rhash, rhash /* byte-swap */
  vld1.64 {rrconst_h}, [r4]
  vext.8 rhash, rhash, rhash, #8

  cmp r3, #4
  blo .Lpolyval_less_than_4

  /* Bulk processing of 4 blocks per loop iteration. */

  ldr r5, [sp, #(4*4)];
  add r6, r5, #32

  vpush {q4-q7}

  vld1.64 {rh2-rh3}, [r5]
  vld1.64 {rh4}, [r6]

  vld1.64 {rbuf-rbuf1}, [r2]!
  sub r3, r3, #4
  vld1.64 {rbuf2-rbuf3}, [r2]!

  cmp r3, #4
  veor rhash, rhash, rbuf /* in0 ^ hash */

  blo .Lpolyval_end_4

.Lpolyval_loop_4:
  /* (in0 ^ hash) * H⁴ => rr2:rr3 */
  /* (in1) * H³ => rr0:rr1 */
  PMUL_128x128_2(rr0, rr1, rbuf1, rh3, rr2, rr3, rhash, rh4, rt1, rt0, __)

  vld1.64 {rbuf-rbuf1}, [r2]!
  sub r3, r3, #4
  veor rr0, rr0, rr2
  veor rr1, rr1, rr3

  /* (in2) * H² => rr2:rr3 */
  /* (in3) * H¹ => rhash:rbuf3 */
  PMUL_128x128_2(rr2, rr3, rbuf2, rh2, rhash, rbuf3, rbuf3, rh1, rt0, rt1, __)

  vld1.64 {rbuf2}, [r2]!

  veor rr0, rr0, rr2
  veor rr1, rr1, rr3

  cmp r3, #4

  veor rr0, rr0, rhash
  veor rr1, rr1, rbuf3

  vld1.64 {rbuf3}, [r2]!

  REDUCTION(rhash, rr0, rr1, rrconst_h, rt1, __)

  veor rhash, rhash, rbuf /* in0 ^ hash */

  bhs .Lpolyval_loop_4

.Lpolyval_end_4:
  /* (in0 ^ hash) * H⁴ => rr2:rr3 */
  /* (in1) * H³ => rr0:rr1 */
  PMUL_128x128_2(rr0, rr1, rbuf1, rh3, rr2, rr3, rhash, rh4, rt1, rt0, __)

  /* (in2) * H² => rhash:rbuf */
  /* (in3) * H¹ => rbuf1:rbuf2 */
  PMUL_128x128_2(rhash, rbuf, rbuf2, rh2, rbuf1, rbuf2, rbuf3, rh1, rt0, rt1,
                 _(veor rr0, rr0, rr2;
                   veor rr1, rr1, rr3))

  veor rr0, rr0, rhash
  veor rr1, rr1, rbuf
  veor rr0, rr0, rbuf1
  veor rr1, rr1, rbuf2

  REDUCTION(rhash, rr0, rr1, rrconst_h, rt1,
            _(CLEAR_REG(rr2);
              CLEAR_REG(rr3);
              CLEAR_REG(rbuf1);
              CLEAR_REG(rbuf2);
              CLEAR_REG(rbuf3);
              CLEAR_REG(rh2);
              CLEAR_REG(rh3);
              CLEAR_REG(rh4)))

  vpop {q4-q7}

  cmp r3, #0
  beq .Lpolyval_done

.Lpolyval_less_than_4:
  /* Handle remaining blocks. */

  vld1.64 {rbuf}, [r2]!
  subs r3, r3, #1

  veor rhash, rhash, rbuf

  beq .Lpolyval_end

.Lpolyval_loop:
  vld1.64 {rbuf}, [r2]!
  subs r3, r3, #1
  PMUL_128x128(rr0, rr1, rhash, rh1, rt0, __)
  REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, __)
  veor rhash, rhash, rbuf

  bne .Lpolyval_loop

.Lpolyval_end:
  PMUL_128x128(rr0, rr1, rhash, rh1, rt0, _(CLEAR_REG(rbuf)))
  REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, _(CLEAR_REG(rh1)))

.Lpolyval_done:
  CLEAR_REG(rr1)
  vrev64.8 rhash, rhash /* byte-swap */
  CLEAR_REG(rt0)
  CLEAR_REG(rr0)
  vext.8 rhash, rhash, rhash, #8
  CLEAR_REG(rt1)
  vst1.64 {rhash}, [r1]
  CLEAR_REG(rhash)

.Lpolyval_do_nothing:
  mov r0, #0
  pop {r4-r6, pc}
.size _gcry_polyval_armv8_ce_pmull,.-_gcry_polyval_armv8_ce_pmull;

/*
 * void _gcry_ghash_setup_armv8_ce_pmull (void *gcm_key, void *gcm_table);
 */
.align 3
.globl _gcry_ghash_setup_armv8_ce_pmull
.type _gcry_ghash_setup_armv8_ce_pmull,%function;
_gcry_ghash_setup_armv8_ce_pmull:
  /* input:
   *    r0: gcm_key
   *    r1: gcm_table
   */
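  /* Precomputes the shifted hash-key powers used by the bulk path above:
   * H<<<1 is written back to gcm_key (r0), and H²<<<1, H³<<<1 and H⁴<<<1 are
   * stored to gcm_table (r1) at offsets 0, 16 and 32. */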
  vpush {q4-q7}

  GET_DATA_POINTER(r2, .Lrconst64, r3)
  vld1.64 {rrconst_h}, [r2]
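
/* GCM_LSH_1 rotates the 128-bit value 'ia:ib' left by one bit; if the bit
 * rotated out of the top was set, the reduction constant is XORed into the
 * high half, and the result 'oa:ob' is stored to [r_out]. */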
#define GCM_LSH_1(r_out, ia, ib, const_d, oa, ob, ma) \
  /* H <<< 1 */ \
  vshr.s64 ma, ib, #63; \
  vshr.u64 oa, ib, #63; \
  vshr.u64 ob, ia, #63; \
  vand ma, const_d; \
  vshl.u64 ib, ib, #1; \
  vshl.u64 ia, ia, #1; \
  vorr ob, ib; \
  vorr oa, ia; \
  veor ob, ma; \
  vst1.64 {oa, ob}, [r_out]

  vld1.64 {rhash}, [r0]
  vrev64.8 rhash, rhash /* byte-swap */
  vext.8 rhash, rhash, rhash, #8

  vmov rbuf1, rhash
  GCM_LSH_1(r0, rhash_l, rhash_h, rrconst_h, rh1_l, rh1_h, rt1_l) /* H<<<1 */

  /* H² */
  PMUL_128x128(rr0, rr1, rbuf1, rh1, rt0, __)
  REDUCTION(rh2, rr0, rr1, rrconst_h, rt0, __)
  vmov rhash, rh2
  GCM_LSH_1(r1, rh2_l, rh2_h, rrconst_h, rbuf1_l, rbuf1_h, rt1_l) /* H²<<<1 */
  add r1, r1, #16

  /* H³ */
  PMUL_128x128(rr0, rr1, rhash, rh1, rt1, __)
  REDUCTION(rh3, rr0, rr1, rrconst_h, rt1, __)

  /* H⁴ */
  PMUL_128x128(rr0, rr1, rhash, rbuf1, rt0, __)
  REDUCTION(rh4, rr0, rr1, rrconst_h, rt0, __)

  GCM_LSH_1(r1, rh3_l, rh3_h, rrconst_h, rt0_l, rt0_h, rt1_l) /* H³<<<1 */
  add r1, r1, #16
  GCM_LSH_1(r1, rh4_l, rh4_h, rrconst_h, rt0_l, rt0_h, rt1_l) /* H⁴<<<1 */

  CLEAR_REG(rt0)
  CLEAR_REG(rt1)
  CLEAR_REG(rr1)
  CLEAR_REG(rr0)
  CLEAR_REG(rh1)
  CLEAR_REG(rh2)
  CLEAR_REG(rh3)
  CLEAR_REG(rh4)
  CLEAR_REG(rhash)
  CLEAR_REG(rbuf1)
  CLEAR_REG(rrconst)
  vpop {q4-q7}
  bx lr
.size _gcry_ghash_setup_armv8_ce_pmull,.-_gcry_ghash_setup_armv8_ce_pmull;

#endif