ghash-ce-core.S

/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XL		.req	v5
	XM		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19
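
	/*
	 * The p8 fallback aliases (k00_16 through ss4) and the p64
	 * aggregation aliases (XL2 through HH34) overlap in v8-v19; this is
	 * safe because the two code paths are never used in the same call.
	 */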

	.text
	.arch		armv8-a+crypto

	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm
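
	/*
	 * The _p64 helpers map directly onto the 64x64 -> 128 bit PMULL and
	 * PMULL2 instructions provided by the Crypto Extensions.  The _p8
	 * variants below synthesise the same product out of the baseline
	 * 8x8 -> 16 bit polynomial multiply, for CPUs that lack them.
	 */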

	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b

	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm
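
	/*
	 * __pmull_p8_tail builds a 64x64 -> 128 bit carryless product out of
	 * eight 8x8 -> 16 bit PMULLs: D = A*B plus the cross products of the
	 * byte-rotated copies A1-A3 and B1-B4 prepared by the callers above.
	 * The partial sums L, M, N and K are masked with k00_16/k32_48,
	 * shifted into position with ext, and XORed into the result.
	 */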

	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm
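
	/*
	 * On the p64 path the ghash_key struct is assumed to carry the
	 * precomputed powers H^2, H^3 and H^4 right after H (hence the
	 * offset of 16 added to x3).  SHASH2 and HH34 hold the folded
	 * halves (lo XOR hi) of H/H^2 and H^3/H^4, used below as the third
	 * Karatsuba operand, and MASK holds the GHASH reduction constant
	 * 0xe1 shifted up by 57 bits in each lane.
	 */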

	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm
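
	/*
	 * perm1-perm3 (plus the fourth rotation kept in T1) make tbl rotate
	 * each 64-bit half of SHASH by 1, 2, 3 and 4 byte positions, so
	 * sh1-sh4 and ss1-ss4 are the B1-B4 operands that __pmull_p8_tail
	 * expects, precomputed once per call.
	 */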

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
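	// The multiply by MASK (0xe1 << 57, i.e. the bit-reflected form of
	// the low-order terms x^7 + x^2 + x + 1 of the GHASH polynomial)
	// folds the 256-bit Karatsuba result back into 128 bits.
	//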
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d

	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
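	// The same reduction, with the multiplications by the polynomial
	// constant open-coded as shifts by 57, 62 and 63 (and the matching
	// shifts right), since the 64-bit PMULL by MASK is not available.
	//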
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

	.macro		__pmull_ghash, pn
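	// Arguments, matching the pmull_ghash_update prototype below:
	//   w0 = number of blocks, x1 = digest (dg), x2 = source buffer,
	//   x3 = ghash_key, x4 = optional partial head block (NULL if none)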
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4
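
	// 4-way aggregation: each pass through 1: folds four blocks b0..b3
	// into the digest as
	//     XL = (XL ^ b0)*H^4 ^ b1*H^3 ^ b2*H^2 ^ b3*H
	// with a single reduction at the end, instead of four dependent
	// multiply/reduce rounds.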

1:	ld1		{XM3.16b-TT4.16b}, [x2], #64
	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

2:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)
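
	// A single Karatsuba multiplication per block: three PMULLs
	// (a1*b1, a0*b0 and (a1+a0)*(b1+b0)) instead of the four cross
	// products of schoolbook multiplication; the shared tail at 4:
	// recombines the middle term and reduces.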

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
ENTRY(pmull_ghash_update_p64)
	__pmull_ghash	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	__pmull_ghash	p8
ENDPROC(pmull_ghash_update_p8)
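
	// The C glue code is expected to select the _p64 or _p8 entry point
	// at probe time, depending on whether the CPU implements the 64-bit
	// polynomial multiply or only the 8-bit one.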

	KS0		.req	v12
	KS1		.req	v13
	INP0		.req	v14
	INP1		.req	v15

	.macro		load_round_keys, rounds, rk
	cmp		\rounds, #12
	blo		2222f		/* 128 bits */
	beq		1111f		/* 192 bits */
	ld1		{v17.4s-v18.4s}, [\rk], #32
1111:	ld1		{v19.4s-v20.4s}, [\rk], #32
2222:	ld1		{v21.4s-v24.4s}, [\rk], #64
	ld1		{v25.4s-v28.4s}, [\rk], #64
	ld1		{v29.4s-v31.4s}, [\rk]
	.endm
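
	// The expanded AES key schedule lives in v17-v31: v17/v18 are only
	// loaded for 256-bit keys, v19/v20 only for 192- and 256-bit keys,
	// and v31 holds the final round key, applied with a plain eor.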

	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_block, state, rounds
	cmp		\rounds, #12
	b.lo		2222f		/* 128 bits */
	b.eq		1111f		/* 192 bits */

	enc_round	\state, v17
	enc_round	\state, v18
1111:	enc_round	\state, v19
	enc_round	\state, v20
2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
	enc_round	\state, \key
	.endr
	aese		\state\().16b, v30.16b
	eor		\state\().16b, \state\().16b, v31.16b
	.endm
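
	// Each aese is immediately followed by the matching aesmc so that
	// cores which fuse the AESE/AESMC pair can do so.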

	.macro		pmull_gcm_do_crypt, enc
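	// Arguments, matching the pmull_gcm_encrypt/decrypt prototypes below:
	//   w0 = number of blocks (consumed two at a time), x1 = dg,
	//   x2 = dst, x3 = src, x4 = ghash_key, x5 = counter block,
	//   x6 = AES round keys (NULL if already loaded), w7 = AES rounds;
	//   for encryption, a keystream buffer pointer is passed on the stack.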
	ld1		{SHASH.2d}, [x4], #16
	ld1		{HH.2d}, [x4]
	ld1		{XL.2d}, [x1]
	ldr		x8, [x5, #8]			// load lower counter

	movi		MASK.16b, #0xe1
	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
CPU_LE(	rev		x8, x8	)
	shl		MASK.2d, MASK.2d, #57
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	.if		\enc == 1
	ldr		x10, [sp]
	ld1		{KS0.16b-KS1.16b}, [x10]
	.endif
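
	// For encryption the caller supplies the first two keystream blocks
	// in the ks buffer: each iteration consumes them for the current pair
	// of input blocks and produces the next pair by encrypting two fresh
	// counter values, and the final keystream pair is stored back to ks
	// on exit.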

	cbnz		x6, 4f

0:	ld1		{INP0.16b-INP1.16b}, [x3], #32

	rev		x9, x8
	add		x11, x8, #1
	add		x8, x8, #2

	.if		\enc == 1
	eor		INP0.16b, INP0.16b, KS0.16b	// encrypt input
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif

	ld1		{KS0.8b}, [x5]			// load upper counter
	rev		x11, x11
	sub		w0, w0, #2
	mov		KS1.8b, KS0.8b
	ins		KS0.d[1], x9			// set lower counter
	ins		KS1.d[1], x11

	rev64		T1.16b, INP1.16b

	cmp		w7, #12
	b.ge		2f				// AES-192/256?
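
	// From here on, the AES rounds for the two new counter blocks are
	// interleaved with the GHASH update over the current pair of blocks,
	// so the AES and polynomial-multiply operations can overlap.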

1:	enc_round	KS0, v21
	ext		IN1.16b, T1.16b, T1.16b, #8
	enc_round	KS1, v21
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	enc_round	KS0, v22
	eor		T1.16b, T1.16b, IN1.16b
	enc_round	KS1, v22
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	enc_round	KS0, v23
	pmull		XM2.1q, SHASH2.1d, T1.1d	// (a1 + a0)(b1 + b0)
	enc_round	KS1, v23
	rev64		T1.16b, INP0.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	enc_round	KS0, v24
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	enc_round	KS1, v24
	eor		XL.16b, XL.16b, IN1.16b
	enc_round	KS0, v25
	eor		T1.16b, T1.16b, XL.16b
	enc_round	KS1, v25
	pmull2		XH.1q, HH.2d, XL.2d		// a1 * b1
	enc_round	KS0, v26
	pmull		XL.1q, HH.1d, XL.1d		// a0 * b0
	enc_round	KS1, v26
	pmull2		XM.1q, SHASH2.2d, T1.2d		// (a1 + a0)(b1 + b0)
	enc_round	KS0, v27
	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	enc_round	KS1, v27
	eor		XM.16b, XM.16b, XM2.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	enc_round	KS0, v28
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b
	enc_round	KS1, v28
	eor		XM.16b, XM.16b, T2.16b
	enc_round	KS0, v29
	pmull		T2.1q, XL.1d, MASK.1d
	enc_round	KS1, v29
	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]
	aese		KS0.16b, v30.16b
	eor		XL.16b, XM.16b, T2.16b
	aese		KS1.16b, v30.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	eor		KS0.16b, KS0.16b, v31.16b
	pmull		XL.1q, XL.1d, MASK.1d
	eor		T2.16b, T2.16b, XH.16b
	eor		KS1.16b, KS1.16b, v31.16b
	eor		XL.16b, XL.16b, T2.16b

	.if		\enc == 0
	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif

	st1		{INP0.16b-INP1.16b}, [x2], #32

	cbnz		w0, 0b

CPU_LE(	rev		x8, x8	)
	st1		{XL.2d}, [x1]
	str		x8, [x5, #8]			// store lower counter

	.if		\enc == 1
	st1		{KS0.16b-KS1.16b}, [x10]
	.endif

	ret

2:	b.eq		3f				// AES-192?
	enc_round	KS0, v17
	enc_round	KS1, v17
	enc_round	KS0, v18
	enc_round	KS1, v18
3:	enc_round	KS0, v19
	enc_round	KS1, v19
	enc_round	KS0, v20
	enc_round	KS1, v20
	b		1b

4:	load_round_keys	w7, x6
	b		0b
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[],
	 *			  const u8 src[], struct ghash_key const *k,
	 *			  u8 ctr[], u32 const rk[], int rounds,
	 *			  u8 ks[])
	 */
ENTRY(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
ENDPROC(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[],
	 *			  const u8 src[], struct ghash_key const *k,
	 *			  u8 ctr[], u32 const rk[], int rounds)
	 */
ENTRY(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
ENDPROC(pmull_gcm_decrypt)

	/*
	 * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
	 */
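	// Encrypts a single 16-byte block; the key schedule is reloaded only
	// when a non-NULL rk pointer is passed in x2, otherwise the keys
	// already present in v17-v31 are reused.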
ENTRY(pmull_gcm_encrypt_block)
	cbz		x2, 0f
	load_round_keys	w3, x2
0:	ld1		{v0.16b}, [x1]
	enc_block	v0, w3
	st1		{v0.16b}, [x0]
	ret
ENDPROC(pmull_gcm_encrypt_block)